Attachment 'run_java.pl'
Download 1 #!/usr/bin/perl
2
3 ### Example usage:
4 # ./run_java.pl \
5 # /afs/cs/group/infolab/datasets/snap-private/spinn3r/spinn3r-full5/web/2012-06/web-2012-06-05T15-00-00Z.txt
6
7 use DateTime;
8
# Path of the Spinn3r dump file to load; the only command-line argument.
my $inFile = $ARGV[0] or die "Please specify an input file!\n";

# Directory containing Spinn3rToHadoopWriterV2.jar and its include/ resources.
my $JAR_DIR = '.';

# Arguments for Spinn3rToHadoopWriterV2.jar
#my $HDFS_BASEDIR = 'hdfs://ilhead2.stanford.edu:9000/user/niko/spinn3rTest4';
my $HDFS_BASEDIR = 'hdfs://ilhadoop1v1.stanford.edu:8020/dataset/spinn3r2';
my $LANG_PROFILE_BASEDIR = "$JAR_DIR/include/langdetect/";
my $UNICODE_DEGARBLER_TABLE = "$JAR_DIR/include/unicode_error_table.tsv";
my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/logging.properties";
#my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/log4j.properties";

# The file name must look like <type>-YYYY-MM-DDTHH-00-00Z.txt, where <type> is
# web, fb or tw. The date fields are required to be digits here so that a
# malformed name is reported as an illegal file name instead of failing later
# inside the date handling.
if ($inFile =~ m{/((web|fb|tw)-(\d{4})-(\d{2})-(\d{2})T(\d{2})-00-00Z\.txt)$}) {
    my ($fileName, $contentType, $year, $month, $day, $hour) = ($1, $2, $3, $4, $5, $6);
    #print "fileName: $fileName; contentType: $contentType; year: $year; month: $month; day: $day; hour: $hour\n";
    my $longName = inferLongName($contentType);
    #print "$longName/$year-$month/$fileName\n";
    my $outFile = "$HDFS_BASEDIR/$longName/$year-$month/$fileName";
    my $hourString = "$year$month$day$hour";
    my $spinn3rVersion = inferSpinn3rVersion($year, $month, $day);

    # Build the command as a list so every argument reaches java verbatim.
    # No shell is involved, so quotes or metacharacters in $inFile cannot
    # break the command line or inject commands.
    my @cmd = (
        'java', '-Xmx2500M', '-Xms800M',
        '-jar', "$JAR_DIR/Spinn3rToHadoopWriterV2.jar",
        $inFile, $outFile, $hourString,
        $longName, $spinn3rVersion,
        $LANG_PROFILE_BASEDIR, $UNICODE_DEGARBLER_TABLE,
        $LOGGING_PROPERTIES_FILE,
    );
    #print "@cmd\n";

    open(my $javaOut, '-|', @cmd)
        or die "Cannot run java: $!\n";
    # Read and discard java's stdout, as the former backtick call did;
    # stderr still goes to this script's stderr.
    while (defined(my $discarded = readline($javaOut))) { }
    # close() on a pipe waits for the child and is false when it exited
    # non-zero, so a failed load is no longer silently ignored.
    close($javaOut)
        or die($! ? "Error closing pipe from java: $!\n"
                  : "java failed (wait status $?) for $inFile\n");
} else {
    die "Illegal file name: $inFile\n";
}
40
41 # ERROR to resolve:
42 # Call to ilhead2/10.79.15.100:9000 failed on local exception: java.io.EOFException
43 # SOLUTION: in my case this problem was caused by an incompatible version of Hadoop.
44 # I had v2.0 while the old server runs v1.X, so the problem was present because the
45 # included JARs did not match. When I switched to the same version the server is running,
46 # there were no more errors.
47
# Return the Spinn3r version letter that applies to the given calendar date.
#
# Arguments: $year, $month, $day - passed straight to DateTime->new.
# Returns:   'A' for dates before 2010-07-14,
#            'B' from 2010-07-14 up to (not including) 2010-07-27,
#            'C' from 2010-07-27 up to (not including) 2013-05-01,
#            'D' from 2013-05-01 up to (not including) 2014-06-01,
#            'E' from 2014-06-01 onward.
# Any exception raised by DateTime->new for an invalid date propagates.
sub inferSpinn3rVersion {
    my ($year, $month, $day) = @_;
    # Only lexicals are assigned here; the former stray "$dt =" created a
    # package global and would not compile under "use strict".
    my $dateTime = DateTime->new(year => $year, month => $month, day => $day);
    my $VERSION_B_START = DateTime->new(year => 2010, month => 7, day => 14);
    my $VERSION_C_START = DateTime->new(year => 2010, month => 7, day => 27);
    my $VERSION_D_START = DateTime->new(year => 2013, month => 5, day => 1);
    my $VERSION_E_START = DateTime->new(year => 2014, month => 6, day => 1);
    # The boundaries are checked in ascending order, so each test only needs
    # the upper bound: the lower bound is implied by the earlier returns.
    return 'A' if $dateTime < $VERSION_B_START;
    return 'B' if $dateTime < $VERSION_C_START;
    return 'C' if $dateTime < $VERSION_D_START;
    return 'D' if $dateTime < $VERSION_E_START;
    return 'E';
}
# Expand the content-type abbreviation taken from a dump file name.
#
# Argument: $in - 'fb', 'tw' or 'web'.
# Returns:  'facebook', 'twitter' or 'web' respectively; undef for any
#           other input.
#
# A plain hash lookup is used instead of the Switch module, which is a source
# filter that is no longer part of core Perl.
sub inferLongName {
    my ($in) = @_;
    my %longNameFor = (
        fb  => 'facebook',
        tw  => 'twitter',
        web => 'web',
    );
    return $longNameFor{$in};
}
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
- [get | view] (2014-08-08 16:14:15, 5332.2 KB) [[attachment:Main.jar.F4v3-20140521]]
- [get | view] (2014-08-08 16:13:04, 5328.9 KB) [[attachment:Main.jar.F4v4-20140808]]
- [get | view] (2014-09-16 23:02:02, 82133.1 KB) [[attachment:Spinn3rToHadoopWriterV2.jar]]
- [get | view] (2014-09-16 23:10:10, 84977.9 KB) [[attachment:Spinn3rToHadoopWriterV2.tar.gz]]
- [get | view] (2014-09-16 23:02:30, 3.3 KB) [[attachment:copy.sh]]
- [get | view] (2014-08-08 16:26:34, 2.2 KB) [[attachment:copy_spinn3r_to_hdfs.pl]]
- [get | view] (2014-09-16 23:02:46, 0.7 KB) [[attachment:handle_one.sh]]
- [get | view] (2014-08-08 16:33:00, 8.9 KB) [[attachment:notes.txt]]
- [get | view] (2014-09-16 23:02:56, 3.3 KB) [[attachment:run_java.pl]]
- [get | view] (2014-09-16 23:03:17, 90566.0 KB) [[attachment:spinn3rToHadoopAllTogether.tar.gz]]
- [get | view] (2014-08-08 16:26:29, 2231.9 KB) [[attachment:spinn3rhadoop_java.tgz]]
- [get | view] (2014-08-08 16:16:20, 8.4 KB) [[attachment:spinn3rreaderd.tgz.F4v3-20140521]]
- [get | view] (2014-08-08 16:24:58, 2.9 KB) [[attachment:unicode_history.txt]]
You are not allowed to attach a file to this page.