#!/usr/bin/perl

### Example usage:
# ./one_spinn3r_doc_to_hdfs.pl \
# /afs/cs/group/infolab/datasets/snap-private/spinn3r/spinn3r-full5/web/2012-06/web-2012-06-05T15-00-00Z.txt

use DateTime;

# Path of the hourly Spinn3r dump to convert (first command-line argument).
# Tested with defined/length rather than truthiness so that a file literally
# named "0" is not rejected.
my $inFile = $ARGV[0];
die "Please specify an input file!\n" unless defined $inFile && length $inFile;

# Location of Spinn3rToHadoopWriterV2.jar.
my $JAR_DIR = '.';

# Arguments for Spinn3rToHadoopWriterV2.jar
#my $HDFS_BASEDIR = 'hdfs://ilhead2.stanford.edu:9000/user/niko/spinn3rTest4';
my $HDFS_BASEDIR = 'hdfs://ilhadoop1v1.stanford.edu:8020/dataset/spinn3r2';
my $LANG_PROFILE_BASEDIR = "$JAR_DIR/include/langdetect/";
my $UNICODE_DEGARBLER_TABLE  = "$JAR_DIR/include/unicode_error_table.tsv";
my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/logging.properties";
#my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/log4j.properties";

# The input must be a path (at least one '/') whose last component looks like
# <web|fb|tw>-YYYY-MM-DDTHH-00-00Z.txt. The date fields are required to be
# digits so that a malformed name is reported here, as an illegal file name,
# instead of failing later inside DateTime.
if ($inFile =~ m{/((web|fb|tw)-(\d{4})-(\d\d)-(\d\d)T(\d\d)-00-00Z\.txt)$}) {
        my ($fileName, $contentType, $year, $month, $day, $hour) = ($1, $2, $3, $4, $5, $6);
        #print "fileName: $fileName; contentType: $contentType; year: $year; month: $month; day: $day; hour: $hour\n";
        my $longName = inferLongName($contentType);
        #print "$longName/$year-$month/$fileName\n";
        my $outFile = "$HDFS_BASEDIR/$longName/$year-$month/$fileName";
        my $hourString = "$year$month$day$hour";
        my $spinn3rVersion = inferSpinn3rVersion($year, $month, $day);

        # The command is kept as a list and never passed through a shell, so
        # quotes, spaces or other metacharacters in $inFile cannot break or
        # alter the command line.
        my @cmd = (
                'java', '-Xmx2500M', '-Xms800M',
                '-jar', "$JAR_DIR/Spinn3rToHadoopWriterV2.jar",
                $inFile, $outFile, $hourString,
                $longName, $spinn3rVersion, $LANG_PROFILE_BASEDIR, $UNICODE_DEGARBLER_TABLE,
                $LOGGING_PROPERTIES_FILE,
                );
        #print join(' ', @cmd), "\n";

        # List-form pipe open: runs java directly and hands us its stdout.
        open(my $javaOut, '-|', @cmd)
                or die "Cannot run java for $inFile: $!\n";
        # Drain and discard stdout (the script has never shown it); stderr of
        # the java process still goes to this script's stderr.
        1 while <$javaOut>;
        # close() on a pipe waits for the child and stores its status in $?.
        close($javaOut);
        my $status = $?;
        if ($status != 0) {
                my $reason = ($status & 127)
                        ? 'was killed by signal ' . ($status & 127)
                        : 'exited with status ' . ($status >> 8);
                die "Conversion of $inFile failed: java $reason\n";
        }
} else {
        die "Illegal file name: $inFile\n";
}

# ERROR to resolve:
# Call to ilhead2/10.79.15.100:9000 failed on local exception: java.io.EOFException
# SOLUTION: in my case this problem was caused by an incompatible version of Hadoop.
# I had v2.0 locally while the old server runs v1.X, so the included JARs did not
# match. After switching to the same version that the server is running, the
# errors went away.

# Returns the version letter ('A' .. 'E') that applies to the given calendar
# date, based on the hard-coded start date of each version.
#
# Arguments: year, month, day (integers, as accepted by DateTime->new).
# Returns:   'A' for dates before 2010-07-14, otherwise the letter of the
#            latest version whose start date is not after the given date.
# Dies:      if DateTime->new rejects the date (e.g. month 13).
sub inferSpinn3rVersion {
        my ($year, $month, $day) = @_;
        # Lexical only: no stray package global is assigned here.
        my $dateTime = DateTime->new(year => $year, month => $month, day => $day);

        # First day on which each later version applies, in chronological
        # order. The order matters: the loop stops at the first start date
        # that lies after $dateTime.
        my @versionStarts = (
                [ 'B', DateTime->new(year => 2010, month => 7, day => 14) ],
                [ 'C', DateTime->new(year => 2010, month => 7, day => 27) ],
                [ 'D', DateTime->new(year => 2013, month => 5, day => 1) ],
                [ 'E', DateTime->new(year => 2014, month => 6, day => 1) ],
                );

        my $version = 'A';
        for my $entry (@versionStarts) {
                my ($letter, $start) = @$entry;
                last if $dateTime < $start;
                $version = $letter;
        }
        return $version;
}
# Maps the short content-type prefix taken from a file name to its long name:
#   'fb' => 'facebook', 'tw' => 'twitter', 'web' => 'web'.
#
# Argument: the short prefix.
# Returns:  the long name, or undef if the prefix is not one of the three
#           known values.
#
# Implemented as a plain hash lookup so that the script does not depend on
# the Switch source filter, which is no longer shipped with Perl.
sub inferLongName {
        my $in = shift;
        my %longNameFor = (
                fb  => 'facebook',
                tw  => 'twitter',
                web => 'web',
                );
        return $longNameFor{$in};
}

