Locked History Actions

attachment:run_java.pl of Spinn3rFormat

Attachment 'run_java.pl'

Download

   1 #!/usr/bin/perl
   2 
   3 ### Example usage:
   4 # ./one_spinn3r_doc_to_hdfs.pl \
   5 # /afs/cs/group/infolab/datasets/snap-private/spinn3r/spinn3r-full5/web/2012-06/web-2012-06-05T15-00-00Z.txt
   6 
   7 use DateTime;
   8 
   # Path of the Spinn3r dump to import, taken from the first command-line
   # argument. Its base name must look like web-2012-06-05T15-00-00Z.txt
   # (prefix web, fb or tw).
   my $inFile = $ARGV[0] or die "Please specify an input file!\n";

   # Directory containing Spinn3rToHadoopWriterV2.jar and its include/ folder.
   my $JAR_DIR = '.';

   # Arguments for Spinn3rToHadoopWriterV2.jar
   #my $HDFS_BASEDIR = 'hdfs://ilhead2.stanford.edu:9000/user/niko/spinn3rTest4';
   my $HDFS_BASEDIR = 'hdfs://ilhadoop1v1.stanford.edu:8020/dataset/spinn3r2';
   my $LANG_PROFILE_BASEDIR = "$JAR_DIR/include/langdetect/";
   my $UNICODE_DEGARBLER_TABLE  = "$JAR_DIR/include/unicode_error_table.tsv";
   my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/logging.properties";
   #my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/log4j.properties";

   # The base name encodes content type and timestamp:
   #   <web|fb|tw>-YYYY-MM-DDTHH-00-00Z.txt
   # It may be preceded by a directory part or stand alone.
   if ($inFile =~ m{(?:\A|/)((web|fb|tw)-(\d{4})-(\d{2})-(\d{2})T(\d{2})-00-00Z\.txt)\z}) {
       my ($fileName, $contentType, $year, $month, $day, $hour) = ($1, $2, $3, $4, $5, $6);
       #print "fileName: $fileName; contentType: $contentType; year: $year; month: $month; day: $day; hour: $hour\n";
       my $longName = inferLongName($contentType);
       #print "$longName/$year-$month/$fileName\n";
       my $outFile = "$HDFS_BASEDIR/$longName/$year-$month/$fileName";
       my $hourString = "$year$month$day$hour";
       my $spinn3rVersion = inferSpinn3rVersion($year, $month, $day);

       # Pass the command as a list so no shell is involved: paths that
       # contain quotes, spaces or other shell metacharacters reach java
       # unchanged instead of corrupting (or injecting into) a command line.
       my @cmd = (
           'java', '-Xmx2500M', '-Xms800M',
           '-jar', "$JAR_DIR/Spinn3rToHadoopWriterV2.jar",
           $inFile, $outFile, $hourString,
           $longName, $spinn3rVersion, $LANG_PROFILE_BASEDIR, $UNICODE_DEGARBLER_TABLE,
           $LOGGING_PROPERTIES_FILE,
       );
       #print "@cmd\n";

       # Read java's standard output through a pipe and drop it, so the
       # script stays as quiet as before; standard error is not redirected.
       open(my $javaOut, '-|', @cmd)
           or die "Cannot run java: $!\n";
       1 while <$javaOut>;

       # close() on a pipe waits for the child and returns false when it
       # exited with a non-zero status ($! is then 0 and $? holds the status).
       unless (close $javaOut) {
           die $!
               ? "Error closing pipe from java: $!\n"
               : "java failed with exit status " . ($? >> 8) . " for $inFile\n";
       }
   } else {
       die "Illegal file name: $inFile\n";
   }
  40 
  41 # ERROR to resolve:
  42 # Call to ilhead2/10.79.15.100:9000 failed on local exception: java.io.EOFException
  43 # SOLUTION: in my case this problem was caused by an incompatible version of Hadoop.
  44 # I had v2.0 while the old server runs v1.X, so the problem occurred because the
  45 # included JARs did not match. When I switched to the same version that the server is running,
  46 # there were no more errors.
  47 
  # Map a calendar date to the Spinn3r version letter ('A' .. 'E') that was
  # in effect on that day.
  #
  # Arguments: $year, $month (1-12), $day (1-31); zero-padded digit strings
  #            such as '06' are accepted. The year must be 1000 or later.
  # Returns:   'A' before 2010-07-14, 'B' from 2010-07-14, 'C' from
  #            2010-07-27, 'D' from 2013-05-01 and 'E' from 2014-06-01.
  # Dies:      when a component is missing or not a non-negative integer, or
  #            when the date does not exist in the calendar (timegm croaks
  #            on an out-of-range month or day).
  sub inferSpinn3rVersion {
      use Time::Local qw(timegm);

      my ($year, $month, $day) = @_;

      for my $part ($year, $month, $day) {
          die "Invalid date component: " . (defined $part ? "'$part'" : 'undef') . "\n"
              unless defined $part && $part =~ /\A\d+\z/;
      }
      # timegm() reinterprets years below 1000 (two-digit rolling century,
      # three-digit offset from 1900), so refuse them instead of guessing.
      die "Unsupported year (expected four digits): '$year'\n" if $year < 1000;

      # timegm() takes (sec, min, hour, mday, month 0-11, year).
      my $date = timegm(0, 0, 0, $day, $month - 1, $year);

      # First day of each version, newest first; the first start date that
      # is not after $date decides the version.
      my @versionStarts = (
          [ 'E', timegm(0, 0, 0,  1, 5, 2014) ],    # 2014-06-01
          [ 'D', timegm(0, 0, 0,  1, 4, 2013) ],    # 2013-05-01
          [ 'C', timegm(0, 0, 0, 27, 6, 2010) ],    # 2010-07-27
          [ 'B', timegm(0, 0, 0, 14, 6, 2010) ],    # 2010-07-14
      );
      for my $entry (@versionStarts) {
          my ($version, $start) = @$entry;
          return $version if $date >= $start;
      }
      return 'A';
  }
  # Translate the short content-type prefix used in Spinn3r file names into
  # its long name.
  #
  # Argument: $in - one of 'fb', 'tw' or 'web'.
  # Returns:  'facebook', 'twitter' or 'web'; nothing (undef in scalar
  #           context) when $in is undefined or not a known prefix.
  #
  # Implemented as a hash lookup rather than with the Switch module, which
  # is a source filter and is no longer shipped with Perl (5.14 and later).
  sub inferLongName {
      my $in = shift;
      my %longNameFor = (
          fb  => 'facebook',
          tw  => 'twitter',
          web => 'web',
      );
      return unless defined $in && exists $longNameFor{$in};
      return $longNameFor{$in};
  }

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2014-08-08 16:14:15, 5332.2 KB) [[attachment:Main.jar.F4v3-20140521]]
  • [get | view] (2014-08-08 16:13:04, 5328.9 KB) [[attachment:Main.jar.F4v4-20140808]]
  • [get | view] (2014-09-16 23:02:02, 82133.1 KB) [[attachment:Spinn3rToHadoopWriterV2.jar]]
  • [get | view] (2014-09-16 23:10:10, 84977.9 KB) [[attachment:Spinn3rToHadoopWriterV2.tar.gz]]
  • [get | view] (2014-09-16 23:02:30, 3.3 KB) [[attachment:copy.sh]]
  • [get | view] (2014-08-08 16:26:34, 2.2 KB) [[attachment:copy_spinn3r_to_hdfs.pl]]
  • [get | view] (2014-09-16 23:02:46, 0.7 KB) [[attachment:handle_one.sh]]
  • [get | view] (2014-08-08 16:33:00, 8.9 KB) [[attachment:notes.txt]]
  • [get | view] (2014-09-16 23:02:56, 3.3 KB) [[attachment:run_java.pl]]
  • [get | view] (2014-09-16 23:03:17, 90566.0 KB) [[attachment:spinn3rToHadoopAllTogether.tar.gz]]
  • [get | view] (2014-08-08 16:26:29, 2231.9 KB) [[attachment:spinn3rhadoop_java.tgz]]
  • [get | view] (2014-08-08 16:16:20, 8.4 KB) [[attachment:spinn3rreaderd.tgz.F4v3-20140521]]
  • [get | view] (2014-08-08 16:24:58, 2.9 KB) [[attachment:unicode_history.txt]]

You are not allowed to attach a file to this page.