package edu.stanford.snap.spinn3rhadoop;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
//import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
//import java.nio.channels.FileChannel;
//import java.nio.channels.FileLock;
import java.text.DecimalFormat;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;
//log4j
//import java.io.File;
//import org.apache.log4j.Logger;
//import org.apache.log4j.PropertyConfigurator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;

/*
 * Create one object of this class per Spinn3r file (i.e., per hour of data).
 * This is important, since the docIds will be generated in a sequential fashion starting at 0.
 * 
 * Also, Twitter, Facebook, and Web must be separated before handing the data to this class.
 * 
 * It seems that we don't have to take care not to split records across chunks, since the reader
 * will seek across chunks:
 * http://stackoverflow.com/questions/14291170/how-does-hadoop-process-records-records-split-across-block-boundaries
 *
 * See this article for how to read multi-line records.
 * http://hadoopi.wordpress.com/2013/05/31/custom-recordreader-processing-string-pattern-delimited-records/
 * 
 * Will degarbling screw up indices of quotes and links?
 * 
 */
public class Spinn3rToHadoopWriter {

	private Spinn3rDocumentReader docReader;
	private String hour;
	private Spinn3rDoc.ContentType contentType;
	private BufferedWriter bufferedWriter;
	private UnicodeDegarbler degarbler;
	private final static Logger LOGGER = Logger.getLogger(Spinn3rToHadoopWriter.class.getName());

	// NO LANG THRESHOLD any more.
	// Tunable parameter: the minimum probability a detected language must have to be accepted.
	//private static final double LANG_THRESHOLD = 0.8;

	// This is only relevant when using the Latin1ToUtf8Degarbler, i.e., for spinn3rVersion "A".
	// We take a string to be garbled if its fraction of ASCII characters is less than
	// MIN_ASCII_RATIO; a quick test on some Wikipedia snippets showed that German, French, and
	// Spanish all have an ASCII fraction of around 97%.
	private static final double MIN_ASCII_RATIO = 0.8;

	// This is only relevant when using the NullDegarbler.
	// We take a string to be garbled if its fraction of non-question-mark characters (going by
	// the constant's name; confirm against NullDegarbler) is less than MIN_NON_QUESTIONMARK_RATIO.
	private static final double MIN_NON_QUESTIONMARK_RATIO = 0.8;

	/**
	 * Sets up the UTF-8 output writer, loads the language-detection profiles, and creates the
	 * degarbler and document reader that match the given Spinn3r version.
	 *
	 * @param in stream the Spinn3r documents are read from
	 * @param out stream the converted documents are written to, encoded as UTF-8
	 * @param hour hour covered by the file, e.g., 2011072314 (i.e., 14:00 on July 23, 2011);
	 *            used as the prefix of every generated docId
	 * @param contentType content type of the documents; selects the language profiles
	 *            ('profiles.sm' for Twitter and Facebook, 'profiles' otherwise) and is used as
	 *            the suffix of every generated docId
	 * @param spinn3rVersion version of the input; selects the degarbler, the reader type, and
	 *            the charset name handed to the reader
	 * @param langProfileBaseDir base location under which 'profiles' and 'profiles.sm' are
	 *            looked up
	 * @param unicodeDegarblerTable handed to the Latin1ToUtf8Degarbler; only used for version A
	 * @throws LangDetectException if loading the language profiles fails
	 * @throws IOException declared for the set-up of the degarbler and the reader
	 * @throws IllegalArgumentException if spinn3rVersion is not one of A, B, C, D, or E
	 */
	public Spinn3rToHadoopWriter(InputStream in, OutputStream out, String hour,
			Spinn3rDoc.ContentType contentType, Spinn3rDoc.Spinn3rVersion spinn3rVersion,
			String langProfileBaseDir, String unicodeDegarblerTable)
					throws LangDetectException, IOException {
		try {
			// Important: use UTF-8 encoding.
			this.bufferedWriter = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
		} catch (UnsupportedEncodingException e) {
			// Not expected: UTF-8 is a charset every Java platform must support.
			throw new RuntimeException(e);
		}
		this.hour = hour;
		this.contentType = contentType;
		// Choose the right model for language detection.
		// (According to
		// http://stackoverflow.com/questions/12007603/java-language-detection-with-langdetect-how-to-load-profiles,
		// 'profiles.sm' is better suited for short messages, 'profiles' for longer ones.)
		String profiles;
		switch (contentType) {
		case TWITTER:
		case FACEBOOK:
			profiles = "profiles.sm";
			break;
		default:
			profiles = "profiles";
		}
		DetectorFactory.loadProfile(langProfileBaseDir + "/" + profiles);
		switch (spinn3rVersion) {
		case A:
			degarbler = new UnicodeDegarbler.Latin1ToUtf8Degarbler(unicodeDegarblerTable,
					MIN_ASCII_RATIO);
			docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "UTF-8",
					degarbler);
			break;
		case B:
			degarbler = new UnicodeDegarbler.NullDegarbler(MIN_NON_QUESTIONMARK_RATIO);
			docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "ISO-8859-1",
					degarbler);
			break;
		case C:
			degarbler = new UnicodeDegarbler.NullDegarbler(MIN_NON_QUESTIONMARK_RATIO);
			docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "UTF-8",
					degarbler);
			break;
		case D:
		case E:
			degarbler = new UnicodeDegarbler.NullDegarbler(MIN_NON_QUESTIONMARK_RATIO);
			docReader = new Spinn3rDocumentReader.SingleLineReader(in, spinn3rVersion, "UTF-8",
					degarbler);
			break;
		default:
			// Fail fast: without this, an unhandled version would leave degarbler and docReader
			// null and only surface later as a NullPointerException in write().
			throw new IllegalArgumentException("Unsupported Spinn3r version: " + spinn3rVersion);
		}
	}

	/**
	 * Reads all documents from the reader, skips duplicates, assigns docIds, annotates each
	 * document with garbling and language information, and writes it as one line of output.
	 *
	 * <p>Call this method once per instance: the docIds start at 0 on every call and the
	 * underlying writer is closed before the method returns, also when it fails.
	 *
	 * @return the number of bytes written, counted on the UTF-8 encoding of the written lines
	 * @throws IOException if writing or closing the output fails
	 * @throws IllegalArgumentException declared for errors coming from the document reader;
	 *             these are not handled here and should be handled by the caller
	 */
	public long write() throws IOException, IllegalArgumentException{
		long id = 0;
		long numBytes = 0;
		int numDuplicates = 0;
		int numDocuments = 0;
		HashSet<String> seenUrls = new HashSet<String>();
		HashSet<Integer> seenContentHashes = new HashSet<Integer>();
		boolean completed = false;
		try {
			Spinn3rDoc doc;
			// The errors thrown by the reader should be caught and handled in the main method!
			// A null document signals that the end of the input has been reached.
			while ((doc = docReader.read()) != null) {
				numDocuments++;
				// Skip this document if we've already seen it (represented as its url plus the
				// hash code of its url + content) in the current hour.
				String hashableContent = doc.url + "\t" + (doc.content == null ? "" : doc.content);
				int contentHash = hashableContent.hashCode();
				if (seenUrls.contains(doc.url) && seenContentHashes.contains(contentHash)) {
					numDuplicates++;
					continue;
				}
				// Remember this article as seen.
				seenUrls.add(doc.url);
				seenContentHashes.add(contentHash);
				doc.docId = String.format("%s_%08d_%s", hour, id, contentType);
				++id;
				try {
					// We use title and content to detect the language.
					String text = (doc.title == null ? "" : doc.title) + " "
							+ (doc.content == null ? "" : doc.content);
					// Store if it is garbled and also store the fraction of useful characters
					doc.isGarbled = degarbler.isGarbled(text);
					doc.nonGarbageFraction = degarbler.getNonGarbageFraction(text);
					// Store all languages and their probabilities
					List<Language> lngs = detectLang(text);
					for (Language l : lngs) {
						doc.appendLang(l.lang, l.prob);
					}
				} catch (LangDetectException e) {
					// Skip "no features in text" errors, otherwise the log files are just too large.
					// The message is checked for null so that the handler itself cannot fail.
					String message = e.getMessage();
					if (message == null || !message.contains("no features in text")) {
						LOGGER.severe(e.getClass().getName() + ": " + message);
					}
				} catch (IllegalArgumentException e) {
					LOGGER.severe(e.getClass().getName() + ": " + e.getMessage());
				}
				// The document is written even if language detection failed for it.
				String toWrite = doc.toString() + "\n";
				bufferedWriter.write(toWrite);
				numBytes += toWrite.getBytes("UTF-8").length;
			}
			// It is very important to CLOSE the writer, otherwise the whole content is NOT written.
			// A failure to close is propagated, since it means the output is incomplete.
			bufferedWriter.close();
			completed = true;
		} finally {
			if (!completed) {
				// Release the output on the failure path as well, but do not let a secondary
				// close error hide the exception that is already propagating.
				try {
					bufferedWriter.close();
				} catch (IOException ignored) {
					// Intentionally ignored, see above.
				}
			}
		}
		DecimalFormat df = new DecimalFormat("###.##");
		// Guard against 0/0 (NaN) when the input contained no documents at all.
		float duplicatesPercentage = numDocuments == 0
				? 0f
				: ((float) numDuplicates / numDocuments) * 100;
		LOGGER.log(Level.INFO, "Number of duplicate documents: " + numDuplicates
				+ ". Number of all documents: " + numDocuments + ". Duplicates percentage: "
				+ df.format(duplicatesPercentage) + "%.");
		return numBytes;
	}

	/**
	 * Runs language detection on the given text and returns all candidate languages together
	 * with their probabilities.
	 *
	 * <p>Garbled text is not filtered out here; all languages are stored along with the
	 * information whether the text is garbled, so the user can decide whether to use them.
	 *
	 * @param text the text to detect the language of
	 * @return the languages reported by the detector; never empty
	 * @throws LangDetectException if no detector can be created, or if the detection itself
	 *             fails; in the latter case the message is extended by the text and the original
	 *             exception is attached as the cause
	 * @throws IllegalArgumentException if the detector reports no language at all
	 */
	public List<Language> detectLang(String text) throws LangDetectException, IllegalArgumentException {
		Detector detector = DetectorFactory.create();
		detector.append(text);
		List<Language> langs;
		try {
			langs = detector.getProbabilities();
		} catch (LangDetectException e) {
			// Add the offending text to the message. LangDetectException has no constructor that
			// takes a cause, so attach the original via initCause to keep its stack trace.
			LangDetectException withText =
					new LangDetectException(e.getCode(), e.getMessage() + ": " + text);
			withText.initCause(e);
			throw withText;
		}
		if (langs.isEmpty()) {
			throw new IllegalArgumentException("No valid language detected: " + text);
		}
		return langs;
	}

	/**
	 * Opens an output stream to the HDFS file {@code fs + dir + file}. The directory is created
	 * if it does not exist yet, and anything already present at the file location is deleted
	 * first.
	 *
	 * @param fs file system URI, e.g., hdfs://ilhead2:9000
	 * @param dir directory part of the location, e.g., /user/west1/ (main passes it with a
	 *            leading and a trailing slash)
	 * @param file file name within the directory, e.g., spinn3rTest.txt
	 * @return a stream writing to the newly created file
	 * @throws IOException if one of the file system operations fails
	 * @throws URISyntaxException if fs is not a valid URI
	 */
	private static OutputStream getHdfsOutputStream(String fs, String dir, String file)
			throws IOException, URISyntaxException {
		final Configuration hadoopConf = new Configuration();
		final FileSystem fileSystem = FileSystem.get(new URI(fs), hadoopConf);

		// Make sure the target directory is there.
		final String dirLocation = fs + dir;
		final Path targetDir = new Path(dirLocation);
		final boolean dirMissing = !fileSystem.exists(targetDir);
		if (dirMissing) {
			fileSystem.mkdirs(targetDir);
		}

		// Remove whatever is left over at the target location (recursively).
		final Path targetFile = new Path(dirLocation + file);
		final boolean leftoverPresent = fileSystem.exists(targetFile);
		if (leftoverPresent) {
			fileSystem.delete(targetFile, true);
		}

		// Consider using a Progressable here.
		return fileSystem.create(targetFile);
	}

	// Placeholder for printing the command-line usage. It is called by main after a failure,
	// but it is not implemented yet, so calling it currently does nothing.
	private static void printUsage() {
		// TODO!!!
	}
	
	/**
	 * Prints the given message verbatim to System.err and flushes the stream.
	 *
	 * <p>This method used to serialize the output of several JVMs through a lock file
	 * ("key.lock") so that concurrently running programs would not write over one another on
	 * System.err. That synchronization is not needed any more, so the method just prints.
	 *
	 * @param s the finished message to print; it is NOT interpreted as a format string
	 */
	private static void safeSystemErr(String s){
		// Use print(), not format(): the callers pass an already formatted message, and a '%'
		// in it (e.g., in a file name) would otherwise be parsed as a format specifier and throw.
		System.err.print(s);
		System.err.flush();
	}

	/**
	 * Command-line entry point: converts one Spinn3r file and reports the outcome as a
	 * tab-separated "__results__" line on System.err.
	 *
	 * <p>Expected arguments, in this order:
	 * <ol>
	 * <li>in: path of the input file, or "System.in"</li>
	 * <li>out: path of the output file, an "hdfs://host:port/dir/file" location, or
	 * "System.out"</li>
	 * <li>hour: the hour of the data as YYYYMMDDHH</li>
	 * <li>contentType: web, twitter, or facebook (case-insensitive)</li>
	 * <li>spinn3rVersion: A, B, C, D, or E</li>
	 * <li>langProfileBaseDir: base location of the language-detection profiles</li>
	 * <li>unicodeDegarblerTable: table handed to the writer for degarbling</li>
	 * <li>loggingPropertiesFile: java.util.logging configuration file</li>
	 * </ol>
	 *
	 * <p>No exception leaves this method: every failure is logged and reported as an
	 * "__ERROR__" result line.
	 *
	 * @param args the eight arguments described above
	 */
	public static void main(String[] args){
		// ////////////////
		// Set args manually for testing.
		boolean DEBUG = true;
		if (DEBUG && args.length == 0) {
			// A version
			/**
			args = new String[8];
			args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2009-09/web-2009-09-12T12-00-00Z.txt";
			args[1] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2009.out";
			args[2] = "2009091212";
			args[3] = "web";
			args[4] = "A";
			args[5] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect";
			args[6] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv";
			args[7] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties";
			*/

			// B version
			/**
			args = new String[8];
			args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2010-07/web-2010-07-20T14-00-00Z-HALF.txt";
			args[1] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2010.out";
			args[2] = "2010072014";
			args[3] = "web";
			args[4] = "B";
			args[5] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect";
			args[6] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv";
			args[7] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties";
			 */

			// C version
			/**
			args = new String[8];
			args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2013-02/web-2013-02-08T15-00-00Z.txt";
			args[1] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2013-C.out";
			args[2] = "2013020815";
			args[3] = "web";
			args[4] = "C";
			args[5] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect";
			args[6] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv";
			args[7] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties";
			 */

			// D version
			/**
			args = new String[8];
			//args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2013-04/web-2013-04-29T17-00-00Z.txt";
			args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2013-04/small-in";
			args[1] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2013-D.out";
			args[2] = "2013042917";
			args[3] = "web";
			args[4] = "D";
			args[5] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect";
			args[6] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv";
			args[7] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties";
			 */

			// E version
			/**
			args = new String[8];
			args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2014-06/web-2014-06-21T15-00-00Z.txt";
			args[1] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2014.out";
			args[2] = "2014062115";
			args[3] = "web";
			args[4] = "E";
			args[5] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect";
			args[6] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv";
			args[7] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties";
			 */
			
			// Broken lines
			args = new String[8];
			args[0] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2010-08/web-2010-08-01T00-00-00Z.txt";
			args[1] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2010-brokenLines.out";
			args[2] = "2010080100";
			args[3] = "web";
			args[4] = "C";
			args[5] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect";
			args[6] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv";
			args[7] = "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties";
		}
		// ////////////////

		String host = "unknown";
		try {
			host = InetAddress.getLocalHost().getHostName();
		} catch (UnknownHostException ignored) {
			// Best effort: the host name only labels the result line, so "unknown" is kept.
		}
		InputStream in = null;
		OutputStream out = null;
		try {
			if (args.length < 8) {
				throw new IllegalArgumentException("Expected 8 arguments but got " + args.length);
			}
			String arg_in = args[0];
			String arg_out = args[1];
			String arg_hour = args[2];
			String arg_contentType = args[3];
			String arg_spinn3rVersion = args[4];
			String arg_langProfileBaseDir = args[5];
			String arg_unicodeDegarblerTable = args[6];
			String arg_loggingPropertiesFile = args[7];
			// Set logging properties and log filename. The stream is only needed while the
			// configuration is being read, so it is closed right afterwards.
			InputStream loggingProperties = new FileInputStream(arg_loggingPropertiesFile);
			try {
				LogManager.getLogManager().readConfiguration(loggingProperties);
			} finally {
				closeQuietly(loggingProperties);
			}
			LOGGER.log(Level.INFO, "\n*\nCurrent input file is: '"+args[0]+"'.\n*");

			// Input.
			if (arg_in.equals("System.in")) {
				in = System.in;
			} else {
				in = new FileInputStream(arg_in);
			}
			// Output.
			if (arg_out.equals("System.out")) {
				out = System.out;
			} else if (arg_out.startsWith("hdfs://")) {
				// Groups: (1) file system URI, (2) directory incl. slashes, (3) file name.
				Pattern p = Pattern.compile("(hdfs://[^:]+:[0-9]+)(/.*/)(.*)");
				Matcher m = p.matcher(arg_out);
				if (m.matches()) {
					out = getHdfsOutputStream(m.group(1), m.group(2), m.group(3));
				} else {
					throw new IllegalArgumentException("Illegal HDFS location");
				}
			} else {
				out = new FileOutputStream(arg_out);
			}
			// Hour.
			String hour;
			if (arg_hour.matches("\\d{10}")) {
				hour = arg_hour;
			} else {
				throw new IllegalArgumentException("Illegal hour specification; must be YYYYMMDDHH");
			}
			// Content type. Lower-casing uses Locale.ROOT so that the comparison does not depend
			// on the default locale (in a Turkish locale "TWITTER" does not lower-case to "twitter").
			String normalizedContentType = arg_contentType.toLowerCase(Locale.ROOT);
			Spinn3rDoc.ContentType contentType;
			if (normalizedContentType.equals("twitter")) {
				contentType = Spinn3rDoc.ContentType.TWITTER;
			} else if (normalizedContentType.equals("facebook")) {
				contentType = Spinn3rDoc.ContentType.FACEBOOK;
			} else if (normalizedContentType.equals("web")) {
				contentType = Spinn3rDoc.ContentType.WEB;
			} else {
				throw new IllegalArgumentException(
						"Illegal content type; must be web, twitter, or facebook");
			}
			// Spinn3r version.
			Spinn3rDoc.Spinn3rVersion spinn3rVersion;
			if (arg_spinn3rVersion.matches("^[ABCDE]$")) {
				spinn3rVersion = Spinn3rDoc.Spinn3rVersion.valueOf(arg_spinn3rVersion);
			} else {
				throw new IllegalArgumentException("Illegal Spinn3r version; must be A, B, C, D, or E");
			}
			// Unicode degarbler table.
			String unicodeDegarblerTable = arg_unicodeDegarblerTable;
			// Language profile base directory.
			String langProfileBaseDir = arg_langProfileBaseDir;

			// writing to HDFS
			long before = System.currentTimeMillis();
			Spinn3rToHadoopWriter writer = new Spinn3rToHadoopWriter(in, out, hour, contentType,
					spinn3rVersion, langProfileBaseDir, unicodeDegarblerTable);

			// Write the file and report success if successful
			long numBytes = writer.write();

			File arg_in_f = new File(arg_in);

			/** Print progress */
			String message = String.format("__results__\t__SUCCESS__\t%s\t__%s__\t%.1fMB\t%.1fs\n",  
					arg_in_f.getName(),
					host,
					numBytes / 1024.0 / 1024.0,
					(System.currentTimeMillis() - before) / 1000.0);
			safeSystemErr(message);

		} catch (Exception e) {
			// Fall back to an empty name so that the error report itself cannot fail when not
			// even the input file argument was given.
			String inputPath = args.length > 0 ? args[0] : "";
			File arg_in_f = new File(inputPath);
			String message = String.format("__results__\t__ERROR__\t%s\t__%s__\n", 
						arg_in_f.getName(),
						host);
			safeSystemErr(message);
			LOGGER.log(Level.SEVERE, "ERROR while pocessing file '"+inputPath+"'.");
			LOGGER.log(Level.SEVERE, e.getMessage());
			LOGGER.log(Level.SEVERE, Arrays.toString(e.getStackTrace()));	
			// The output is not guaranteed to have been closed when the run failed, so release
			// it here. System.out is left alone.
			if (out != System.out) {
				closeQuietly(out);
			}
			printUsage();
		} finally {
			// The input is never closed by anybody else. System.in is left alone.
			if (in != System.in) {
				closeQuietly(in);
			}
		}
	}

	// Closes the given resource if it is non-null, ignoring any IOException from close().
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done about a failed close at this point.
		}
	}
}
