package edu.stanford.snap.spinn3rhadoop;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;

/**
 * Base class for strategies that detect and repair mis-encoded ("garbled") text.
 *
 * <p>Two implementations are provided as static nested classes:
 * {@link NullDegarbler}, which never modifies the text and judges it by the share of
 * characters that are not {@code '?'}, and {@link Latin1ToUtf8Degarbler}, which replaces
 * 2- and 3-character sequences according to a lookup table and judges the text by its
 * share of ASCII characters.
 *
 * <p>None of the methods accept {@code null}; passing it results in a
 * {@link NullPointerException}.
 */
public abstract class UnicodeDegarbler {

	/**
	 * Repairs the given text.
	 *
	 * @param text the text to repair; must not be {@code null}
	 * @return the repaired text
	 */
	public abstract String degarble(String text);

	/**
	 * Tells whether the given text is considered garbled.
	 *
	 * @param text the text to inspect; must not be {@code null}
	 * @return {@code true} if the text is considered garbled
	 */
	public abstract boolean isGarbled(String text);

	/**
	 * Returns the fraction of characters of the given text that are considered
	 * non-garbage.
	 *
	 * @param text the text to inspect; must not be {@code null}
	 * @return a value between 0.0 and 1.0
	 */
	public abstract double getNonGarbageFraction(String text);

	/**
	 * A degarbler that leaves the text untouched and measures garbage as the share of
	 * question marks in the text.
	 */
	public static class NullDegarbler extends UnicodeDegarbler {

		/** Texts whose non-question-mark fraction is below this value count as garbled. */
		private final double minNonQuestionMarkRatio;

		/**
		 * @param minNonQuestionMarkRatio minimum fraction of characters other than
		 *        {@code '?'} a text must have in order not to be reported as garbled
		 */
		public NullDegarbler(double minNonQuestionMarkRatio) {
			this.minNonQuestionMarkRatio = minNonQuestionMarkRatio;
		}

		/** Returns {@code text} unchanged. */
		@Override
		public String degarble(String text) {
			return text;
		}

		/**
		 * Reports the text as garbled when {@link #getNonGarbageFraction(String)} is
		 * strictly below the configured minimum. Empty text is never garbled.
		 *
		 * <p>The decision is derived from {@code getNonGarbageFraction} so that both
		 * methods always agree; a separate single-precision computation could disagree
		 * with the reported fraction when the ratio equals the threshold.
		 */
		@Override
		public boolean isGarbled(String text) {
			if (text.length() == 0) {
				return false;
			}
			return getNonGarbageFraction(text) < minNonQuestionMarkRatio;
		}

		/**
		 * Returns the fraction of characters that are not {@code '?'}.
		 *
		 * @return the fraction, or 1.0 for empty text (there is nothing garbled in it)
		 */
		@Override
		public double getNonGarbageFraction(String text) {
			int n = text.length();
			if (n == 0) {
				// Avoid 0/0 (NaN): empty text contains no garbage.
				return 1.0;
			}
			int nonQuestionMarks = 0;
			for (int i = 0; i < n; ++i) {
				if (text.charAt(i) != '?') {
					++nonQuestionMarks;
				}
			}
			return (double) nonQuestionMarks / n;
		}

	}

	/**
	 * A degarbler that replaces 2- and 3-character sequences using a table loaded from
	 * a tab-separated file and measures garbage as the share of non-ASCII characters.
	 */
	public static class Latin1ToUtf8Degarbler extends UnicodeDegarbler {

		/** Number of tab-separated fields a table line must have to be usable. */
		private static final int FIELD_COUNT = 5;
		/** Index of the field holding the replacement text. */
		private static final int REPLACEMENT_FIELD = 2;
		/** Index of the field holding the URL-encoded pattern. */
		private static final int PATTERN_FIELD = 4;
		/** Shortest sequence that is looked up in the table. */
		private static final int MIN_PATTERN_LENGTH = 2;
		/** Longest sequence that is looked up in the table. */
		private static final int MAX_PATTERN_LENGTH = 3;

		/** Maps a lower-cased garbled sequence to its lower-cased replacement. */
		private final Map<String, String> table = new HashMap<String, String>();
		/** Texts whose ASCII fraction is below this value count as garbled. */
		private final double minAsciiRatio;

		/**
		 * Loads the replacement table.
		 *
		 * <p>The file is read as UTF-8, one entry per line. Lines starting with
		 * {@code '#'} and lines with fewer than five tab-separated fields (for example
		 * blank lines) are skipped. The fifth field is URL-decoded as ISO-8859-1 and
		 * lower-cased to form the pattern; spaces in that field are dropped. The third
		 * field, lower-cased, is the replacement.
		 *
		 * @param tableFile path of the tab-separated table file
		 * @param minAsciiRatio minimum fraction of ASCII characters a text must have in
		 *        order not to be reported as garbled
		 * @throws IOException if the file cannot be opened or read
		 */
		public Latin1ToUtf8Degarbler(String tableFile, double minAsciiRatio) throws IOException {
			this.minAsciiRatio = minAsciiRatio;
			Scanner sc = new Scanner(new File(tableFile), "UTF-8").useDelimiter("\n");
			try {
				while (sc.hasNext()) {
					String line = sc.next();
					// Tolerate Windows line endings: the delimiter only consumes '\n'.
					if (line.endsWith("\r")) {
						line = line.substring(0, line.length() - 1);
					}
					// Skip comments.
					if (line.startsWith("#")) {
						continue;
					}
					String[] tokens = line.split("\t", FIELD_COUNT);
					// Skip blank or incomplete lines instead of failing on a missing field.
					if (tokens.length < FIELD_COUNT) {
						continue;
					}
					// Mark SPACE as NBSP so it can be told apart after decoding
					// (the decoder turns '+' into SPACE).
					String pattern = tokens[PATTERN_FIELD].replace(' ', '\u00A0');
					pattern = java.net.URLDecoder.decode(pattern, "ISO-8859-1")
							.toLowerCase(java.util.Locale.ROOT);
					// Delete the NBSP.
					// NOTE(review): this also deletes an NBSP produced by decoding "%A0" - confirm that is intended.
					pattern = pattern.replace("\u00A0", "");
					table.put(pattern, tokens[REPLACEMENT_FIELD].toLowerCase(java.util.Locale.ROOT));
				}
				// Scanner swallows read errors; surface them instead of silently
				// continuing with a truncated table.
				IOException readError = sc.ioException();
				if (readError != null) {
					throw readError;
				}
			} finally {
				sc.close();
			}
		}

		/**
		 * Scans the text from left to right and replaces every 3- or 2-character
		 * sequence found in the table (3-character sequences are tried first). All
		 * other characters are copied unchanged.
		 *
		 * <p>The table keys are lower-cased, and the text is looked up as given, so
		 * upper-case sequences are not replaced.
		 */
		@Override
		public String degarble(String text) {
			int n = text.length();
			StringBuilder result = new StringBuilder(n);
			int i = 0;
			while (i < n) {
				String replacement = null;
				int matchedLength = 0;
				for (int k = Math.min(MAX_PATTERN_LENGTH, n - i); k >= MIN_PATTERN_LENGTH; --k) {
					replacement = table.get(text.substring(i, i + k));
					if (replacement != null) {
						matchedLength = k;
						break;
					}
				}
				if (replacement != null) {
					result.append(replacement);
					i += matchedLength;
				} else {
					result.append(text.charAt(i));
					i += 1;
				}
			}
			return result.toString();
		}

		/**
		 * Reports the text as garbled when {@link #getNonGarbageFraction(String)} is
		 * strictly below the configured minimum. Empty text is never garbled.
		 *
		 * <p>The decision is derived from {@code getNonGarbageFraction} so that both
		 * methods always agree; a separate single-precision computation could disagree
		 * with the reported fraction when the ratio equals the threshold.
		 */
		@Override
		public boolean isGarbled(String text) {
			if (text.length() == 0) {
				return false;
			}
			return getNonGarbageFraction(text) < minAsciiRatio;
		}

		/**
		 * Returns the fraction of characters with a code below 128.
		 *
		 * @return the fraction, or 1.0 for empty text (there is nothing garbled in it)
		 */
		@Override
		public double getNonGarbageFraction(String text) {
			int n = text.length();
			if (n == 0) {
				// Avoid 0/0 (NaN): empty text contains no garbage.
				return 1.0;
			}
			int ascii = 0;
			for (int i = 0; i < n; ++i) {
				if (text.charAt(i) < 128) {
					++ascii;
				}
			}
			return (double) ascii / n;
		}

	}

	/**
	 * Small manual demo: degarbles a sample text and prints the result, then prints
	 * whether a {@link NullDegarbler} with threshold 0.8 considers the sample garbled.
	 *
	 * @param args optional; {@code args[0]} overrides the path of the table file
	 * @throws Exception if the table file cannot be loaded
	 */
	public static void main(String[] args) throws Exception {
		String table = args.length > 0
				? args[0]
				: "/Users/Niko/Documents/workspace/spinn3rhadoop_java/export/include/unicode_error_table.tsv";
		Latin1ToUtf8Degarbler degarbler = new Latin1ToUtf8Degarbler(table, 0.8);
		// Alternative short sample; it is overridden by the longer sample below.
		String text = "ungl√£¬ºcklich k√£¬§mpfte s√£¬©bastien buemi";
		text = "ç<9c><9f>æ<98>¯æ<9c><8d>äº<86>è<87>ªå·±(ç»<88>æ<96>¼100%) - i.hobby—æ¨¡å<9e><8b>å<8c>º - nintendo world bbs gba|nds|psp|wii|æ¸¸æ<88><8f>ç<8e><8b>|æ¨¡å<9e><8b>1/100 exiaå<8f><91>å<94>® ç­<89>é<9b>ªå´©ï¼<8c>bandaiå<88>«æ<83>³éª<97>æ<88><91>é<92>± 1/100é<9b>ªå´©å<8f><91>å<94>® ç­<89>mgï¼<8c>bandaiå<88>«æ<83>³éª<97>æ<88><91>é<92>± mg exiaå<8f><91>å<94>® ç­<89>r2å<95>¦ mg exiar2å<8f><91>å<94>® ç­<89>å<87>ºmgé<9b>ªå´© mgé<9b>ªå´©å<8f><91>å<94>® ç­<89>å<87>ºpgï¼<8c>æ<88><91>æ<98>¯ä¸<8d>ä¼<9a>ä¸<8a>ž<93>ç<9a><84> 1/100 exiaå<8f><91>å<94>® ç­<89>é<9b>ªå´©ï¼<8c>bandaiå<88>«æ<83>³éª<97>æ<88><91>é<92>± 1/100é<9b>ªå´©å<8f><91>å<94>® ç­<89>mgï¼<8c>bandaiå<88>«æ<83>³éª<97>æ<88><91>é<92>± mg exiaå<8f><91>å<94>® ç­<89>r2å<95>¦ mg exiar2å<8f><91>å<94>® ç­<89>å<87>ºmgé<9b>ªå´© mgé<9b>ªå´©å<8f><91>å<94>® ç­<89>å<87>ºpgï¼<8c>æ<88><91>æ<98>¯ä¸<8d>ä¼<9a>ä¸<8a>ž<93>ç<9a><84>";
		System.out.println(degarbler.degarble(text));
		System.out.println(new NullDegarbler(0.8).isGarbled(text));
	}

}
