package edu.stanford.snap.spinn3rhadoop;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;

/**
 * Base class for strategies that detect and repair text whose characters were
 * damaged by an encoding mix-up.
 *
 * <p>Two implementations are provided as nested classes:
 * {@link NullDegarbler}, which never modifies text and only counts question
 * marks, and {@link Latin1ToUtf8Degarbler}, which replaces garbled character
 * sequences using a lookup table loaded from a file.
 */
public abstract class UnicodeDegarbler {

	/**
	 * Returns a repaired version of the given text.
	 *
	 * @param text the text to repair; must not be {@code null}
	 * @return the repaired text
	 */
	public abstract String degarble(String text);

	/**
	 * Tells whether the given text is considered garbled by this strategy.
	 *
	 * @param text the text to inspect; must not be {@code null}
	 * @return {@code true} if the text is considered garbled
	 */
	public abstract boolean isGarbled(String text);
	
	/**
	 * A degarbler that performs no repair. Text is considered garbled when the
	 * fraction of characters that are not {@code '?'} falls below a threshold.
	 */
	public static class NullDegarbler extends UnicodeDegarbler {
		
		private final double minNonQuestionMarkRatio;
		
		/**
		 * @param minNonQuestionMarkRatio minimum fraction of characters that
		 *        must differ from {@code '?'} for text to count as not garbled
		 */
		public NullDegarbler(double minNonQuestionMarkRatio) {
			this.minNonQuestionMarkRatio = minNonQuestionMarkRatio;
		}

		/**
		 * Returns the text unchanged.
		 */
		@Override
		public String degarble(String text) {
			return text;
		}

		/**
		 * Returns {@code true} if the fraction of non-{@code '?'} characters is
		 * strictly below the configured minimum. Empty text is never garbled.
		 */
		@Override
		public boolean isGarbled(String text) {
			int n = text.length();
			if (n == 0) {
				// No characters, hence no ratio to compute.
				return false;
			}
			int nonQuestionMarks = 0;
			for (int i = 0; i < n; ++i) {
				if (text.charAt(i) != '?') {
					++nonQuestionMarks;
				}
			}
			// Compute in double: the threshold is a double, and a float ratio
			// such as 7/10 would round below the double 0.7 and misclassify
			// text that sits exactly on the threshold.
			return (double) nonQuestionMarks / n < minNonQuestionMarkRatio;
		}

	}

	/**
	 * A degarbler that replaces garbled two- and three-character sequences
	 * with their intended characters, using a replacement table read from a
	 * tab-separated file. Text is considered garbled when the fraction of
	 * ASCII characters falls below a threshold.
	 */
	public static class Latin1ToUtf8Degarbler extends UnicodeDegarbler {

		/** Shortest garbled sequence looked up in the table. */
		private static final int MIN_PATTERN_LENGTH = 2;
		/** Longest garbled sequence looked up in the table. */
		private static final int MAX_PATTERN_LENGTH = 3;

		/** Maps a lower-cased garbled sequence to its lower-cased replacement. */
		private final Map<String, String> table = new HashMap<String, String>();
		private final double minAsciiRatio;

		/**
		 * Loads the replacement table.
		 *
		 * <p>The file is read as UTF-8, one entry per line. Lines starting with
		 * {@code #} and blank lines are skipped. Every other line must have at
		 * least five tab-separated columns: the third column is the replacement
		 * and the fifth column is the garbled sequence written as
		 * percent-encoded bytes (optionally separated by spaces), which are
		 * decoded as ISO-8859-1. Both are lower-cased before being stored.
		 *
		 * @param tableFile path of the tab-separated table file
		 * @param minAsciiRatio minimum fraction of ASCII characters for text to
		 *        count as not garbled
		 * @throws IOException if the file cannot be read or contains a line
		 *         with fewer than five columns
		 */
		public Latin1ToUtf8Degarbler(String tableFile, double minAsciiRatio) throws IOException {
			this.minAsciiRatio = minAsciiRatio;
			Scanner sc = new Scanner(new File(tableFile), "UTF-8").useDelimiter("\n");
			try {
				int lineNumber = 0;
				while (sc.hasNext()) {
					String line = sc.next();
					++lineNumber;
					// Tolerate CRLF line endings; otherwise the '\r' would end up
					// inside the pattern and the entry could never match.
					if (line.endsWith("\r")) {
						line = line.substring(0, line.length() - 1);
					}
					// Skip blank lines and comments.
					if (line.trim().length() == 0 || line.startsWith("#")) {
						continue;
					}
					String[] tokens = line.split("\t", 5);
					if (tokens.length < 5) {
						throw new IOException("Malformed line " + lineNumber + " in " + tableFile
								+ ": expected 5 tab-separated columns but found " + tokens.length);
					}
					// First replace SPACE with NBSP.
					String pattern = tokens[4].replace(' ', '\u00A0');
					// Locale.ROOT keeps the mapping independent of the JVM's default locale.
					pattern = java.net.URLDecoder.decode(pattern, "ISO-8859-1")
							.toLowerCase(java.util.Locale.ROOT);
					// Delete the NBSP.
					// NOTE(review): this also deletes an NBSP produced by decoding %A0,
					// not only the ones substituted for spaces above - confirm intended.
					pattern = pattern.replaceAll("\u00A0", "");
					table.put(pattern, tokens[2].toLowerCase(java.util.Locale.ROOT));
				}
				// Scanner swallows read errors; surface them instead of silently
				// continuing with a truncated table.
				IOException readError = sc.ioException();
				if (readError != null) {
					throw readError;
				}
			} finally {
				sc.close();
			}
		}
		
		/**
		 * Scans the text from left to right and, at each position, replaces the
		 * longest table entry of three or two characters that starts there.
		 * Characters that start no table entry are copied unchanged.
		 */
		@Override
		public String degarble(String text) {
			int n = text.length();
			StringBuilder result = new StringBuilder(n);
			int i = 0;
			while (i < n) {
				int consumed = 0;
				// Try the longest candidate first so that a three-character
				// entry wins over a two-character prefix of it.
				for (int k = Math.min(MAX_PATTERN_LENGTH, n - i); k >= MIN_PATTERN_LENGTH; --k) {
					String replacement = table.get(text.substring(i, i + k));
					if (replacement != null) {
						result.append(replacement);
						consumed = k;
						break;
					}
				}
				if (consumed == 0) {
					result.append(text.charAt(i));
					consumed = 1;
				}
				i += consumed;
			}
			return result.toString();
		}

		/**
		 * Returns {@code true} if the fraction of ASCII characters (code below
		 * 128) is strictly below the configured minimum. Empty text is never
		 * garbled.
		 */
		@Override
		public boolean isGarbled(String text) {
			int n = text.length();
			if (n == 0) {
				// No characters, hence no ratio to compute.
				return false;
			}
			int ascii = 0;
			for (int i = 0; i < n; ++i) {
				if (text.charAt(i) < 128) {
					++ascii;
				}
			}
			// Compute in double to match the precision of the threshold.
			return (double) ascii / n < minAsciiRatio;
		}

	}

	/**
	 * Small manual demo: degarbles two sample texts with a
	 * {@link Latin1ToUtf8Degarbler} and reports whether the question-mark
	 * sample is garbled according to a {@link NullDegarbler}.
	 *
	 * @param args optional; {@code args[0]} overrides the path of the
	 *        replacement table, which otherwise defaults to a file below
	 *        {@code $HOME/repo/spinn3r}
	 * @throws Exception if the replacement table cannot be loaded
	 */
	public static void main(String[] args) throws Exception {
		String table = args.length > 0
				? args[0]
				: System.getenv("HOME") + "/repo/spinn3r/data/spinn3r/unicode_error_table.tsv";
		Latin1ToUtf8Degarbler degarbler = new Latin1ToUtf8Degarbler(table, 0.8);
		// Sample with two-character garbled sequences in place of accented letters.
		// Written with Unicode escapes so that the source compiles regardless of
		// the encoding the compiler assumes for this file.
		String latin1Sample = "ungl\u00e3\u00bccklich k\u00e3\u00a4mpfte s\u00e3\u00a9bastien buemi";
		// Sample in which most characters have been replaced by question marks.
		// The embedded line break is spelled as an explicit escape because a raw
		// line terminator inside a string literal is not valid Java.
		// NOTE(review): assumes the break is a plain newline - confirm against the original file.
		String questionMarkSample = "lennon???????: ??????????? ????????? lennon??????? ???????????????????????? ????????????????????? 07/23??? | top july 24, 2011 ??????????? ???????????????????? ??????? ????????????? ???????????? ??????? ????????????? ????????? ??? ??????????? ????????????? ????? ???????????? ?????????? ???????????? ???????????? ???????? ??????????? ????????? 07/23??? ?????????????iii??? ?????????? 07/17 win5?? ?????????????? posted by lennon at 07:25&nbsp; ? comments(0) ? trackback(0) | ?? | | ??blog????? ? ???? ? ?????????????url ????????????????????????????? ????????????????????????? ????????????????????????? ??: ???: url: ????: ?????: ????????????????????? ?????????????? ?? &nbsp; ??? ?????????????????????? &nbsp; &nbsp; ????????? july,2011 >> s m t w t f s &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; profile lennon ?????????? ????? ????????????????????????? ??????????????????????????????? ??????????????????????????? ????????? ???????????????? written by ????? recent entries ??????????? 07/23??? ????????????? ?????????? ?????? recent comments ????????? &nbsp;(??) ????!! &nbsp;(???) ????????!! &nbsp;(??) ????????? &nbsp;(?????) ????????? &nbsp;(???) recent trackbacks ??????????????????????????????????????????????? &nbsp;(???????????????????????????????????????????????) &nbsp; ??????????????? ????????????????????????????? &nbsp;(?????????????????????????????) &nbsp; ?13? ??????????(????)?? ??? ??? ?? - daiyu???? &nbsp;(???????148%???) &nbsp; ??????????????? ??6 &nbsp;(??????????) &nbsp; ?????????????? ??????????????????????????????????????????????? &nbsp;(???????????????????????????????????????????????) &nbsp; 06/12 ?????? \n"
				+ "archives july 2011 june 2011 may 2011 april 2011 march 2011 february 2011 january 2011 december 2010 november 2010 october 2010 september 2010 august 2010 july 2010 june 2010 may 2010 april 2010 march 2010 february 2010 january 2010 december 2009 november 2009 october 2009 september 2009 august 2009 july 2009 june 2009 may 2009 april 2009 march 2009 february 2009 january 2009 december 2008 november 2008 categories ??&nbsp;(9) ??&nbsp;(868) smart phone&nbsp;(2) ?????&nbsp;(20) ??&nbsp;(28) pc&nbsp;(23) jack-in-the-box&nbsp;(7) ??&nbsp;(1) ?????&nbsp;(6) ?&nbsp;(29) ???????&nbsp;(33) ??&nbsp;(429) ???&nbsp;(85) my favorite things&nbsp;(21) music&nbsp;(17) ??&nbsp;(30) ??&nbsp;(1) ???&nbsp;(8) search in blog favorite blogs ?????? links jra big?toto????????? ???????? ????? lennon??????? rem?digitalcaravan ??????? rem?digitalcaravan ????823????????????? au??????windows phone 7 ?? is1? windows phone 7 ??? ???vol.20?8/6?? windows phone 7???????????? yahoo!?????????? - ??? news!! ???????32? ???? ?????????????? ??????? ??80??? ??????? ?????? ??1?? ??????? -pr- ?????? ??????????! ??????????! javascript?on???????? ????????? ???? bloggame www.blogdeco.jp qrcord ?????blog???";
		System.out.println(degarbler.degarble(latin1Sample));
		System.out.println(degarbler.degarble(questionMarkSample));
		System.out.println(new NullDegarbler(0.8).isGarbled(questionMarkSample));
	}

}
