package edu.stanford.snap.spinn3rhadoop;

import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.List;
import java.util.logging.LogManager;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;

/*
 * Create one object of this class per Spinn3r file (i.e., per hour of data).
 * This is important, since the docIds will be generated in a sequential fashion starting at 0.
 * 
 * Also, Twitter, Facebook, and Web must be separated before handing the data to this class.
 * 
 * It seems that records may safely straddle chunk boundaries, since the Hadoop reader reads
 * past the end of a chunk to complete a record:
 * http://stackoverflow.com/questions/14291170/how-does-hadoop-process-records-records-split-across-block-boundaries
 *
 * See this article for how to read multi-line records.
 * http://hadoopi.wordpress.com/2013/05/31/custom-recordreader-processing-string-pattern-delimited-records/
 * 
 * Open question: will degarbling invalidate the indices of quotes and links?
 * 
 * TODO: set logger output dir in logging.properties
 */
public class Spinn3rToHadoopWriter {

	private static final Logger LOGGER = Logger.getLogger(Spinn3rToHadoopWriter.class.getName());

	// Tunable parameter: the minimum probability a detected language must have to be accepted.
	private static final double LANG_THRESHOLD = 0.8;

	// This is only relevant when using the Latin1ToUtf8Degarbler, i.e., for spinn3rVersion "A".
	// We take a string to be garbled if its fraction of ASCII characters is less than
	// MIN_ASCII_RATIO; a quick test on some Wikipedia snippets showed that German, French, and
	// Spanish all have an ASCII fraction of around 97%.
	private static final double MIN_ASCII_RATIO = 0.8;

	// This is only relevant when using the NullDegarbler.
	// We take a string to be garbled if its fraction of non-question-mark characters is less than
	// MIN_NON_QUESTIONMARK_RATIO.
	// NOTE(review): the exact definition lives in UnicodeDegarbler.NullDegarbler -- confirm there.
	private static final double MIN_NON_QUESTIONMARK_RATIO = 0.8;

	// Splits an HDFS location into (1) file system URI, (2) directory incl. trailing slash, and
	// (3) file name, e.g. "hdfs://ilhead2:9000" + "/user/west1/" + "spinn3rTest.txt".
	private static final Pattern HDFS_LOCATION = Pattern
			.compile("(hdfs://[^:]+:[0-9]+)(/.*/)(.*)");

	// Number of command-line arguments expected by main().
	private static final int NUM_ARGS = 8;

	private final Spinn3rDocumentReader docReader;
	private final String hour;
	private final Spinn3rDoc.ContentType contentType;
	private final BufferedWriter bufferedWriter;
	private final UnicodeDegarbler degarbler;

	/**
	 * Sets up logging, language detection, and the version-specific reader/degarbler pair.
	 * 
	 * The streams remain owned by the caller: this class flushes the output at the end of
	 * {@link #write()} but never closes {@code in} or {@code out}.
	 * 
	 * NOTE(review): this calls DetectorFactory.loadProfile on every construction; whether that is
	 * safe for several instances within one JVM depends on langdetect -- confirm.
	 * 
	 * @param in
	 *            the raw Spinn3r data to read
	 * @param out
	 *            where the converted documents are written (as UTF-8)
	 * @param hour
	 *            e.g., 2011072314 (i.e., 14:00 on July 23, 2011); used as docId prefix
	 * @param contentType
	 *            selects the language profiles ('profiles.sm' for Twitter and Facebook,
	 *            'profiles' otherwise) and is used as docId suffix
	 * @param spinn3rVersion
	 *            selects the reader, the input encoding, and the degarbler
	 * @param langProfileBaseDir
	 *            directory containing the langdetect profile directories
	 * @param unicodeDegarblerTable
	 *            table file handed to the Latin1ToUtf8Degarbler (version A only)
	 * @param loggingPropertiesFile
	 *            java.util.logging configuration file
	 * @throws LangDetectException
	 *             if the language profiles cannot be loaded
	 * @throws IOException
	 *             if the logging configuration (or anything the reader/degarbler opens) cannot
	 *             be read
	 * @throws IllegalArgumentException
	 *             if spinn3rVersion is not one of A, B, C, D, E
	 */
	public Spinn3rToHadoopWriter(InputStream in, OutputStream out, String hour,
			Spinn3rDoc.ContentType contentType, Spinn3rDoc.Spinn3rVersion spinn3rVersion,
			String langProfileBaseDir, String unicodeDegarblerTable, String loggingPropertiesFile)
			throws LangDetectException, IOException {
		// Close the configuration stream ourselves; the LogManager only reads from it.
		InputStream loggingConfig = new FileInputStream(loggingPropertiesFile);
		try {
			LogManager.getLogManager().readConfiguration(loggingConfig);
		} finally {
			loggingConfig.close();
		}
		try {
			// Important: use UTF-8 encoding.
			this.bufferedWriter = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
		} catch (UnsupportedEncodingException e) {
			throw new RuntimeException(e);
		}
		this.hour = hour;
		this.contentType = contentType;
		// Choose the right model for language detection.
		// (According to
		// http://stackoverflow.com/questions/12007603/java-language-detection-with-langdetect-how-to-load-profiles,
		// 'profiles.sm' is better suited for short messages, 'profiles' for longer ones.)
		String profiles;
		switch (contentType) {
		case TWITTER:
		case FACEBOOK:
			profiles = "profiles.sm";
			break;
		default:
			profiles = "profiles";
		}
		DetectorFactory.loadProfile(langProfileBaseDir + "/" + profiles);
		switch (spinn3rVersion) {
		case A:
			degarbler = new UnicodeDegarbler.Latin1ToUtf8Degarbler(unicodeDegarblerTable,
					MIN_ASCII_RATIO);
			docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "UTF-8",
					degarbler);
			break;
		case B:
			degarbler = new UnicodeDegarbler.NullDegarbler(MIN_NON_QUESTIONMARK_RATIO);
			docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "ISO-8859-1",
					degarbler);
			break;
		case C:
			degarbler = new UnicodeDegarbler.NullDegarbler(MIN_NON_QUESTIONMARK_RATIO);
			docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "UTF-8",
					degarbler);
			break;
		case D:
		case E:
			degarbler = new UnicodeDegarbler.NullDegarbler(MIN_NON_QUESTIONMARK_RATIO);
			docReader = new Spinn3rDocumentReader.SingleLineReader(in, spinn3rVersion, "UTF-8",
					degarbler);
			break;
		default:
			// Fail here rather than with a NullPointerException on the first read.
			throw new IllegalArgumentException("Unsupported Spinn3r version: " + spinn3rVersion);
		}
	}

	/**
	 * Reads all documents from the input, drops duplicates, assigns docIds and languages, and
	 * writes one line per document. Call this method once per instance, since docIds are
	 * numbered sequentially starting at 0.
	 * 
	 * Documents that cannot be read or written are logged and skipped. The output is flushed
	 * before returning (even on an unexpected runtime exception) but not closed.
	 * 
	 * NOTE(review): a failing read is retried on the next iteration; this assumes
	 * docReader.read() makes progress after an IOException -- confirm, otherwise a broken stream
	 * would make this loop spin.
	 * 
	 * @return the number of bytes written (in UTF-8)
	 */
	public long write() {
		long id = 0;
		long numBytes = 0;
		HashSet<String> seenUrls = new HashSet<String>();
		HashSet<Integer> seenContentHashes = new HashSet<Integer>();
		try {
			while (true) {
				try {
					Spinn3rDoc doc = docReader.read();
					// End of file reached.
					if (doc == null) {
						break;
					}
					// Skip this document if we've already seen it (represented as the hash code
					// of its url + content) in the current hour. The url set guards against
					// hash collisions between different urls.
					String hashableContent = doc.url + "\t"
							+ (doc.content == null ? "" : doc.content);
					int contentHash = hashableContent.hashCode();
					if (seenUrls.contains(doc.url) && seenContentHashes.contains(contentHash)) {
						LOGGER.info("Duplicate document: " + doc.url);
						continue;
					}
					// Remember this article as seen.
					seenUrls.add(doc.url);
					seenContentHashes.add(contentHash);
					doc.docId = String.format("%s_%08d_%s", hour, id, contentType);
					++id;
					assignLanguage(doc);
					String toWrite = doc.toString() + "\n";
					bufferedWriter.write(toWrite);
					numBytes += toWrite.getBytes("UTF-8").length;
				} catch (IOException e) {
					LOGGER.warning(e.getClass().getName() + ": " + e.getMessage());
				} catch (IllegalArgumentException e) {
					LOGGER.warning(e.getClass().getName() + ": " + e.getMessage());
				}
			}
		} finally {
			// Without this, whatever is still sitting in the writer's buffer never reaches the
			// underlying stream.
			try {
				bufferedWriter.flush();
			} catch (IOException e) {
				LOGGER.severe(e.getClass().getName() + ": " + e.getMessage());
			}
		}
		return numBytes;
	}

	// Sets doc.lang from title and content; leaves it untouched (and logs) if no language can be
	// determined.
	private void assignLanguage(Spinn3rDoc doc) {
		try {
			// We use title and content to detect the language.
			String text = (doc.title == null ? "" : doc.title) + " "
					+ (doc.content == null ? "" : doc.content);
			doc.lang = detectLang(text).lang;
		} catch (LangDetectException e) {
			LOGGER.severe(e.getClass().getName() + ": " + e.getMessage());
		} catch (IllegalArgumentException e) {
			LOGGER.info(e.getClass().getName() + ": " + e.getMessage());
		}
	}

	/**
	 * Detects the most probable language of the given text.
	 * 
	 * @param text
	 *            the text to classify
	 * @return the most probable language; its probability is at least LANG_THRESHOLD
	 * @throws LangDetectException
	 *             if no detector can be created
	 * @throws IllegalArgumentException
	 *             if detection fails on this text, if no language reaches LANG_THRESHOLD, or if
	 *             the degarbler considers the text garbled
	 */
	public Language detectLang(String text) throws LangDetectException {
		// Only this statement can throw LangDetectException towards the outside.
		Detector detector = DetectorFactory.create();
		detector.append(text);
		List<Language> langs;
		try {
			langs = detector.getProbabilities();
		} catch (LangDetectException e) {
			// Keep the original exception as the cause so the stack trace is not lost.
			throw new IllegalArgumentException(e.getMessage() + ": " + text, e);
		}
		if (langs.isEmpty() || langs.get(0).prob < LANG_THRESHOLD) {
			throw new IllegalArgumentException("No valid language detected: " + text);
		}
		if (degarbler.isGarbled(text)) {
			throw new IllegalArgumentException(degarbler.getClass().getName()
					+ " detected garbled text: " + text);
		}
		return langs.get(0);
	}

	// Opens fs + dir + file for writing, creating the directory if necessary and replacing an
	// existing file.
	// fs: e.g., hdfs://ilhead2:9000
	// dir: e.g., /user/west1/
	// file: e.g., spinn3rTest.txt
	private static OutputStream getHdfsOutputStream(String fs, String dir, String file)
			throws IOException, URISyntaxException {
		FileSystem hdfs = FileSystem.get(new URI(fs), new Configuration());
		Path dirPath = new Path(fs + dir);
		if (!hdfs.exists(dirPath)) {
			hdfs.mkdirs(dirPath);
		}
		Path filePath = new Path(fs + dir + file);
		if (hdfs.exists(filePath)) {
			hdfs.delete(filePath, true);
		}
		// Consider using a Progressable here.
		return hdfs.create(filePath);
	}

	// Maps the command-line content type (case-insensitive) to the enum constant.
	// equalsIgnoreCase is used instead of toLowerCase() because the latter depends on the
	// default locale.
	private static Spinn3rDoc.ContentType parseContentType(String arg) {
		if ("twitter".equalsIgnoreCase(arg)) {
			return Spinn3rDoc.ContentType.TWITTER;
		}
		if ("facebook".equalsIgnoreCase(arg)) {
			return Spinn3rDoc.ContentType.FACEBOOK;
		}
		if ("web".equalsIgnoreCase(arg)) {
			return Spinn3rDoc.ContentType.WEB;
		}
		throw new IllegalArgumentException(
				"Illegal content type; must be web, twitter, or facebook");
	}

	// Prints the command-line synopsis to stderr.
	private static void printUsage() {
		System.err.println("Usage: " + Spinn3rToHadoopWriter.class.getName()
				+ " <in> <out> <hour> <contentType> <spinn3rVersion> <langProfileBaseDir>"
				+ " <unicodeDegarblerTable> <loggingPropertiesFile>");
		System.err.println("  in:                    input file, or System.in");
		System.err.println("  out:                   output file, hdfs://host:port/dir/file,"
				+ " or System.out");
		System.err.println("  hour:                  YYYYMMDDHH");
		System.err.println("  contentType:           web, twitter, or facebook");
		System.err.println("  spinn3rVersion:        A, B, C, D, or E");
		System.err.println("  langProfileBaseDir:    directory containing the language profiles");
		System.err.println("  unicodeDegarblerTable: Unicode degarbler table file");
		System.err.println("  loggingPropertiesFile: java.util.logging properties file");
	}

	/**
	 * Command-line entry point: converts one Spinn3r file and reports the amount of data written
	 * and the time taken on stderr. On invalid arguments, prints the problem and the usage and
	 * returns without converting anything.
	 * 
	 * @param args
	 *            in, out, hour, contentType, spinn3rVersion, langProfileBaseDir,
	 *            unicodeDegarblerTable, loggingPropertiesFile
	 * @throws Exception
	 *             if the writer cannot be set up or a stream cannot be closed
	 */
	public static void main(String[] args) throws Exception {
		// ////////////////
		// Set args manually for testing.
		boolean useDebugArgs = false;
		if (useDebugArgs && args.length == 0) {
			args = new String[NUM_ARGS];
			args[0] = System.getenv("HOME")
					+ "/repo/spinn3r/data/spinn3r/web-2014-06-05T15-00-00Z_HEAD.txt";
			args[1] = "/tmp/spinn3r.out";
			args[2] = "2010041709";
			args[3] = "web";
			args[4] = "E";
			args[5] = System.getenv("HOME") + "/repo/lib/langdetect";
			args[6] = System.getenv("HOME") + "/repo/spinn3r/data/spinn3r/unicode_error_table.tsv";
			args[7] = System.getenv("HOME")
					+ "/repo/spinn3r/code/java/edu/stanford/snap/spinn3rhadoop/logging.properties";
		}
		// ////////////////
		InputStream in;
		OutputStream out;
		String hour;
		Spinn3rDoc.ContentType contentType;
		Spinn3rDoc.Spinn3rVersion spinn3rVersion;
		String langProfileBaseDir;
		String unicodeDegarblerTable;
		String loggingPropertiesFile;
		try {
			if (args.length < NUM_ARGS) {
				throw new IllegalArgumentException("Expected " + NUM_ARGS + " arguments but got "
						+ args.length);
			}
			String arg_in = args[0];
			String arg_out = args[1];
			String arg_hour = args[2];
			String arg_contentType = args[3];
			String arg_spinn3rVersion = args[4];
			// Validate the cheap, side-effect-free arguments first so that a typo does not
			// leave behind an opened (and possibly truncated) output file.
			// Hour.
			if (!arg_hour.matches("\\d{10}")) {
				throw new IllegalArgumentException("Illegal hour specification; must be YYYYMMDDHH");
			}
			hour = arg_hour;
			// Content type.
			contentType = parseContentType(arg_contentType);
			// Spinn3r version.
			if (!arg_spinn3rVersion.matches("^[ABCDE]$")) {
				throw new IllegalArgumentException("Illegal Spinn3r version; must be A, B, C, D, or E");
			}
			spinn3rVersion = Spinn3rDoc.Spinn3rVersion.valueOf(arg_spinn3rVersion);
			// Language profile base directory.
			langProfileBaseDir = args[5];
			// Unicode degarbler table.
			unicodeDegarblerTable = args[6];
			// Logging properties file.
			loggingPropertiesFile = args[7];
			// Input.
			if (arg_in.equals("System.in")) {
				in = System.in;
			} else {
				in = new FileInputStream(arg_in);
			}
			// Output.
			if (arg_out.equals("System.out")) {
				out = System.out;
			} else if (arg_out.startsWith("hdfs://")) {
				Matcher m = HDFS_LOCATION.matcher(arg_out);
				if (!m.matches()) {
					throw new IllegalArgumentException("Illegal HDFS location");
				}
				out = getHdfsOutputStream(m.group(1), m.group(2), m.group(3));
			} else {
				out = new FileOutputStream(arg_out);
			}
		} catch (Exception e) {
			System.err.println(e.getMessage());
			printUsage();
			return;
		}
		long before = System.currentTimeMillis();
		long numBytes;
		try {
			Spinn3rToHadoopWriter writer = new Spinn3rToHadoopWriter(in, out, hour, contentType,
					spinn3rVersion, langProfileBaseDir, unicodeDegarblerTable,
					loggingPropertiesFile);
			numBytes = writer.write();
		} finally {
			// The writer only flushes; closing is our job. The standard streams stay open.
			try {
				if (in != System.in) {
					in.close();
				}
			} finally {
				if (out != System.out) {
					out.close();
				}
			}
		}
		System.err.format("%.1f MB written in %.1f seconds\n", numBytes / 1024.0 / 1024.0,
				(System.currentTimeMillis() - before) / 1000.0);
	}

}
