Locked History Actions

attachment:copy_spinn3r_to_hdfs.pl of Spinn3rFormat

Attachment 'copy_spinn3r_to_hdfs.pl'

Download

   1 #!/usr/bin/perl
   2 
   3 ### Example usage:
   4 # ./copy_spinn3r_to_hdfs.pl \
   5 # /afs/cs/group/infolab/datasets/snap-private/spinn3r/spinn3r-full5/web/2012-06/web-2012-06-05T15-00-00Z.txt
   6 
   7 use DateTime;
   8 
   9 my $inFile = $ARGV[0] or die "Please specify an input file!\n";
  10 
  11 # Location of Spinn3rToHadoopWriter.jar.
  12 my $JAR_DIR = $ENV{'HOME'} . '/repo/spinn3r/code/java/Spinn3rToHadoopWriter_deploy';
  13 
  14 # Arguments for Spinn3rToHadoopWriter.jar
  15 my $HDFS_BASEDIR = 'hdfs://ilhead2:9000/user/west1/spinn3rTest';
  16 my $LANG_PROFILE_BASEDIR = "$JAR_DIR/include/langdetect/";
  17 my $UNICODE_DEGARBLER_TABLE  = "$JAR_DIR/include/unicode_error_table.tsv";
  18 my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/logging.properties";
  19 
  20 if ($inFile =~ m{/((web|facebook|twitter)/....-../(web|fb|tw)-(....)-(..)-(..)T(..)-00-00Z\.txt)$}) {
  21 	my ($suffix, $contentType, $year, $month, $day, $hour) = ($1, $2, $4, $5, $6, $7);
  22 	my $outFile = "$HDFS_BASEDIR/$suffix";
  23 	#my $outFile = "/tmp/spinn3r.out";
  24 	my $hourString = "$year$month$day$hour";
  25 	my $spinn3rVersion = inferSpinn3rVersion($year, $month, $day);
  26 	my $cmd = join (' ',
  27 		"java -jar '$JAR_DIR/Spinn3rToHadoopWriter.jar' '$inFile' '$outFile' '$hourString'",
  28 		"'$contentType' '$spinn3rVersion' '$LANG_PROFILE_BASEDIR' '$UNICODE_DEGARBLER_TABLE'",
  29 		"'$LOGGING_PROPERTIES_FILE'"
  30 		);
  31 	print "$cmd\n";
  32 	`$cmd`;
  33 } else {
  34 	die "Illegal file name: $inFile\n";
  35 }
  36 
  37 # ERROR to resolve:
  38 # Call to ilhead2/10.79.15.100:9000 failed on local exception: java.io.EOFException
  39 
  40 sub inferSpinn3rVersion {
  41 	my $year = shift;
  42 	my $month = shift;
  43 	my $day = shift;
  44 	my $dateTime = $dt = DateTime->new(year => $year, month => $month, day => $day);
  45 	my $VERSION_B_START = DateTime->new(year => 2010, month => 7, day => 14);
  46 	my $VERSION_C_START = DateTime->new(year => 2010, month => 7, day => 27);
  47 	my $VERSION_D_START = DateTime->new(year => 2013, month => 4, day => 29);
  48 	my $VERSION_E_START = DateTime->new(year => 2014, month => 5, day => 22);
  49 	if ($dateTime < $VERSION_B_START) {
  50 		return 'A';
  51 	} elsif ($dateTime >= $VERSION_B_START && $dateTime < $VERSION_C_START) {
  52 		return 'B';
  53 	} elsif ($dateTime >= $VERSION_C_START && $dateTime < $VERSION_D_START) {
  54 		return 'C';
  55 	} elsif ($dateTime >= $VERSION_D_START && $dateTime < $VERSION_E_START) {
  56 		return 'D';
  57 	} else {
  58 		return 'E';
  59 	}
  60 }

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2014-08-08 16:14:15, 5332.2 KB) [[attachment:Main.jar.F4v3-20140521]]
  • [get | view] (2014-08-08 16:13:04, 5328.9 KB) [[attachment:Main.jar.F4v4-20140808]]
  • [get | view] (2014-09-16 23:02:02, 82133.1 KB) [[attachment:Spinn3rToHadoopWriterV2.jar]]
  • [get | view] (2014-09-16 23:10:10, 84977.9 KB) [[attachment:Spinn3rToHadoopWriterV2.tar.gz]]
  • [get | view] (2014-09-16 23:02:30, 3.3 KB) [[attachment:copy.sh]]
  • [get | view] (2014-08-08 16:26:34, 2.2 KB) [[attachment:copy_spinn3r_to_hdfs.pl]]
  • [get | view] (2014-09-16 23:02:46, 0.7 KB) [[attachment:handle_one.sh]]
  • [get | view] (2014-08-08 16:33:00, 8.9 KB) [[attachment:notes.txt]]
  • [get | view] (2014-09-16 23:02:56, 3.3 KB) [[attachment:run_java.pl]]
  • [get | view] (2014-09-16 23:03:17, 90566.0 KB) [[attachment:spinn3rToHadoopAllTogether.tar.gz]]
  • [get | view] (2014-08-08 16:26:29, 2231.9 KB) [[attachment:spinn3rhadoop_java.tgz]]
  • [get | view] (2014-08-08 16:16:20, 8.4 KB) [[attachment:spinn3rreaderd.tgz.F4v3-20140521]]
  • [get | view] (2014-08-08 16:24:58, 2.9 KB) [[attachment:unicode_history.txt]]

You are not allowed to attach a file to this page.