#!/usr/bin/perl

### Example usage:
# ./copy_spinn3r_to_hdfs.pl \
# /afs/cs/group/infolab/datasets/snap-private/spinn3r/spinn3r-full5/web/2012-06/web-2012-06-05T15-00-00Z.txt

use DateTime;

# Driver: takes the path of one Spinn3r dump file as the only command-line
# argument, derives the remaining arguments of Spinn3rToHadoopWriter.jar from
# the file name, and runs the jar.
my $inFile = $ARGV[0] or die "Please specify an input file!\n";

# Location of Spinn3rToHadoopWriter.jar.
my $JAR_DIR = $ENV{'HOME'} . '/repo/spinn3r/code/java/Spinn3rToHadoopWriter_deploy';

# Arguments for Spinn3rToHadoopWriter.jar
my $HDFS_BASEDIR = 'hdfs://ilhead2:9000/user/west1/spinn3rTest';
my $LANG_PROFILE_BASEDIR = "$JAR_DIR/include/langdetect/";
my $UNICODE_DEGARBLER_TABLE  = "$JAR_DIR/include/unicode_error_table.tsv";
my $LOGGING_PROPERTIES_FILE = "$JAR_DIR/include/logging.properties";

# The path must end in
#   <web|facebook|twitter>/YYYY-MM/<web|fb|tw>-YYYY-MM-DDTHH-00-00Z.txt
# Capture 1 is that whole suffix (reused below the HDFS base directory),
# capture 2 the content type, captures 4-7 year, month, day and hour.
if ($inFile =~ m{/((web|facebook|twitter)/....-../(web|fb|tw)-(....)-(..)-(..)T(..)-00-00Z\.txt)$}) {
	my ($suffix, $contentType, $year, $month, $day, $hour) = ($1, $2, $4, $5, $6, $7);
	my $outFile = "$HDFS_BASEDIR/$suffix";
	#my $outFile = "/tmp/spinn3r.out";
	my $hourString = "$year$month$day$hour";
	my $spinn3rVersion = inferSpinn3rVersion($year, $month, $day);

	# Positional arguments of the jar, in the order it is invoked with.
	my @jarArgs = (
		"$JAR_DIR/Spinn3rToHadoopWriter.jar",
		$inFile, $outFile, $hourString,
		$contentType, $spinn3rVersion, $LANG_PROFILE_BASEDIR, $UNICODE_DEGARBLER_TABLE,
		$LOGGING_PROPERTIES_FILE,
	);

	# Echo the command in shell-like notation, for the log only.
	print join(' ', 'java', '-jar', map { "'$_'" } @jarArgs), "\n";

	# Run java directly via list-form system(): no shell is involved, so
	# quotes or other metacharacters in the file name cannot break or alter
	# the command. The child's stdout is passed through instead of being
	# captured and thrown away.
	my $status = system('java', '-jar', @jarArgs);
	if ($status == -1) {
		die "Could not start java: $!\n";
	} elsif ($status & 127) {
		die sprintf("java was killed by signal %d\n", $status & 127);
	} elsif ($status != 0) {
		die sprintf("java exited with status %d\n", $status >> 8);
	}
} else {
	die "Illegal file name: $inFile\n";
}

# ERROR to resolve:
# Call to ilhead2/10.79.15.100:9000 failed on local exception: java.io.EOFException

# Map a calendar date to the Spinn3r version letter in effect on that day.
#
# Arguments (positional): $year, $month, $day, passed on unchanged to
#   DateTime->new.
# Returns: 'A' for dates before 2010-07-14, 'B' from 2010-07-14, 'C' from
#   2010-07-27, 'D' from 2013-04-29, and 'E' from 2014-05-22 onwards.
sub inferSpinn3rVersion {
	my ($year, $month, $day) = @_;
	my $dateTime = DateTime->new(year => $year, month => $month, day => $day);

	# First day on which each version after 'A' applies, newest first, so
	# the first boundary that is not after the date decides the version.
	my @versionStarts = (
		['E', DateTime->new(year => 2014, month => 5, day => 22)],
		['D', DateTime->new(year => 2013, month => 4, day => 29)],
		['C', DateTime->new(year => 2010, month => 7, day => 27)],
		['B', DateTime->new(year => 2010, month => 7, day => 14)],
	);
	for my $boundary (@versionStarts) {
		my ($version, $start) = @$boundary;
		return $version if $dateTime >= $start;
	}

	# Earlier than every boundary above.
	return 'A';
}
