Attachment 'copy_spinn3r_to_hdfs.pl'
Download 1 #!/usr/bin/perl
2
3 ### Example usage:
4 # ./copy_spinn3r_to_hdfs.pl \
5 # /afs/cs/group/infolab/datasets/snap-private/spinn3r/spinn3r-full5/web/2012-06/web-2012-06-05T15-00-00Z.txt
6
7 use DateTime;
8
# Input file to copy; its path must end in
#   (web|facebook|twitter)/YYYY-MM/(web|fb|tw)-YYYY-MM-DDTHH-00-00Z.txt
my $inFile = $ARGV[0] or die "Please specify an input file!\n";

# Location of Spinn3rToHadoopWriter.jar.
my $JAR_DIR = $ENV{'HOME'} . '/repo/spinn3r/code/java/Spinn3rToHadoopWriter_deploy';

# Arguments for Spinn3rToHadoopWriter.jar
my $HDFS_BASEDIR = 'hdfs://ilhead2:9000/user/west1/spinn3rTest';
my $LANG_PROFILE_BASEDIR = "$JAR_DIR/include/langdetect/";
my $UNICODE_DEGARBLER_TABLE = "$JAR_DIR/include/unicode_error_table.tsv";
my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/logging.properties";

if ($inFile =~ m{/((web|facebook|twitter)/....-../(web|fb|tw)-(....)-(..)-(..)T(..)-00-00Z\.txt)$}) {
    # $1 is the path suffix reused below the HDFS base dir, $2 the content-type
    # directory, $4..$7 the date and hour taken from the file name ($3 is unused).
    my ($suffix, $contentType, $year, $month, $day, $hour) = ($1, $2, $4, $5, $6, $7);
    my $outFile = "$HDFS_BASEDIR/$suffix";
    #my $outFile = "/tmp/spinn3r.out";
    my $hourString = "$year$month$day$hour";
    my $spinn3rVersion = inferSpinn3rVersion($year, $month, $day);

    # Positional arguments for the jar, in the order it is invoked with.
    my @jarArgs = (
        "$JAR_DIR/Spinn3rToHadoopWriter.jar",
        $inFile, $outFile, $hourString,
        $contentType, $spinn3rVersion, $LANG_PROFILE_BASEDIR, $UNICODE_DEGARBLER_TABLE,
        $LOGGING_PROPERTIES_FILE,
    );

    # Echo the command in the same single-quoted form as before (for the log only).
    print join(' ', 'java -jar', map { "'$_'" } @jarArgs), "\n";

    # Run java WITHOUT a shell: the input path comes from the command line and
    # only its tail is constrained by the regex above, so a quote or shell
    # metacharacter earlier in the path must not be interpreted. The list form
    # of a pipe open passes every argument to java verbatim.
    open(my $javaOut, '-|', 'java', '-jar', @jarArgs)
        or die "Cannot run java: $!\n";

    # The child's standard output is read and dropped, exactly as the former
    # backtick invocation did; its standard error still reaches the terminal.
    1 while <$javaOut>;

    # close() on a pipe waits for the child and returns false when it failed,
    # so a failed copy no longer looks like a success.
    unless (close($javaOut)) {
        die $!
            ? "Error closing pipe from java: $!\n"
            : "java failed for $inFile (wait status $?)\n";
    }
} else {
    die "Illegal file name: $inFile\n";
}
36
37 # ERROR to resolve:
38 # Call to ilhead2/10.79.15.100:9000 failed on local exception: java.io.EOFException
39
# Return the Spinn3r version letter ('A'..'E') that applies to a given date.
#
# Arguments (positional): $year, $month, $day -- handed to DateTime->new as-is;
# any error DateTime raises while constructing the date propagates to the caller.
#
# Returns:
#   'A'  for dates before 2010-07-14
#   'B'  from 2010-07-14 up to (not including) 2010-07-27
#   'C'  from 2010-07-27 up to (not including) 2013-04-29
#   'D'  from 2013-04-29 up to (not including) 2014-05-22
#   'E'  from 2014-05-22 onwards
sub inferSpinn3rVersion {
    my ($year, $month, $day) = @_;

    # Lexical only: the previous code also assigned to an undeclared global $dt.
    my $date = DateTime->new(year => $year, month => $month, day => $day);

    # First day of each version, newest first, so the first start date that is
    # not after $date identifies the version.
    my @versionStarts = (
        [ 'E', DateTime->new(year => 2014, month => 5, day => 22) ],
        [ 'D', DateTime->new(year => 2013, month => 4, day => 29) ],
        [ 'C', DateTime->new(year => 2010, month => 7, day => 27) ],
        [ 'B', DateTime->new(year => 2010, month => 7, day => 14) ],
    );

    for my $entry (@versionStarts) {
        my ($version, $start) = @$entry;
        return $version if $date >= $start;
    }

    # Earlier than every listed start date.
    return 'A';
}
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
- [get | view] (2014-08-08 16:14:15, 5332.2 KB) [[attachment:Main.jar.F4v3-20140521]]
- [get | view] (2014-08-08 16:13:04, 5328.9 KB) [[attachment:Main.jar.F4v4-20140808]]
- [get | view] (2014-09-16 23:02:02, 82133.1 KB) [[attachment:Spinn3rToHadoopWriterV2.jar]]
- [get | view] (2014-09-16 23:10:10, 84977.9 KB) [[attachment:Spinn3rToHadoopWriterV2.tar.gz]]
- [get | view] (2014-09-16 23:02:30, 3.3 KB) [[attachment:copy.sh]]
- [get | view] (2014-08-08 16:26:34, 2.2 KB) [[attachment:copy_spinn3r_to_hdfs.pl]]
- [get | view] (2014-09-16 23:02:46, 0.7 KB) [[attachment:handle_one.sh]]
- [get | view] (2014-08-08 16:33:00, 8.9 KB) [[attachment:notes.txt]]
- [get | view] (2014-09-16 23:02:56, 3.3 KB) [[attachment:run_java.pl]]
- [get | view] (2014-09-16 23:03:17, 90566.0 KB) [[attachment:spinn3rToHadoopAllTogether.tar.gz]]
- [get | view] (2014-08-08 16:26:29, 2231.9 KB) [[attachment:spinn3rhadoop_java.tgz]]
- [get | view] (2014-08-08 16:16:20, 8.4 KB) [[attachment:spinn3rreaderd.tgz.F4v3-20140521]]
- [get | view] (2014-08-08 16:24:58, 2.9 KB) [[attachment:unicode_history.txt]]
You are not allowed to attach a file to this page.