package edu.stanford.snap.spinn3rhadoop;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import org.apache.commons.lang.StringEscapeUtils;

import edu.stanford.snap.spinn3rhadoop.Spinn3rDoc.Spinn3rVersion;

/**
 * Base class for readers that parse {@link Spinn3rDoc} records from a character stream.
 *
 * <p>Two concrete layouts are provided: {@link SingleLineReader} (one document per line,
 * tab-separated {@code TYPE:value} columns) and {@link MultiLineReader} (one field per line,
 * documents separated by an empty line). Every raw line is passed through the configured
 * {@link UnicodeDegarbler} before it is parsed.
 */
public abstract class Spinn3rDocumentReader {

  protected BufferedReader bufferedReader;
  protected Spinn3rDoc.Spinn3rVersion version;
  protected UnicodeDegarbler degarbler;

  /**
   * Wraps the given stream in a buffered, charset-decoding reader.
   *
   * @param in           stream to read documents from; this class never closes it
   * @param version      version tag stamped onto every document produced by {@link #read()}
   * @param charEncoding name of the charset used to decode {@code in}
   * @param degarbler    applied to every non-null line before parsing
   * @throws UnsupportedEncodingException if {@code charEncoding} is not a supported charset
   */
  public Spinn3rDocumentReader(InputStream in, Spinn3rDoc.Spinn3rVersion version,
      String charEncoding, UnicodeDegarbler degarbler) throws UnsupportedEncodingException {
    bufferedReader = new BufferedReader(new InputStreamReader(in, charEncoding));
    this.version = version;
    this.degarbler = degarbler;
  }

  /**
   * Reads the next document from the underlying stream.
   *
   * @return the next document, or {@code null} when there is no more data to be read
   * @throws IOException if the stream cannot be read or a line cannot be parsed
   */
  public abstract Spinn3rDoc read() throws IOException;

  /**
   * Builds the exception thrown for a line that could not be parsed. The message keeps the
   * historical format (exception class, message, offending line) and the original exception
   * is attached as the cause so its stack trace is not lost.
   */
  private static IOException malformedLine(Exception cause, String line) {
    return new IOException(
        cause.getClass().getName() + " : " + cause.getMessage() + ": LINE:" + line, cause);
  }

  /**
   * Reader for the layout in which each line holds one complete document, encoded as
   * tab-separated columns of the form {@code TYPE:value}.
   *
   * <p>Recognised column types: {@code U} (url), {@code D} (date), {@code T} (title,
   * HTML-unescaped), {@code F} (raw title), {@code C} (content, HTML-unescaped), {@code H}
   * (raw content), {@code L} (link, {@code int:int:text}) and {@code Q} (quote,
   * {@code int:int:text}). Columns of any other type are ignored.
   */
  public static class SingleLineReader extends Spinn3rDocumentReader {

    public SingleLineReader(InputStream in, Spinn3rDoc.Spinn3rVersion version, String charEncoding,
        UnicodeDegarbler degarbler) throws UnsupportedEncodingException {
      super(in, version, charEncoding, degarbler);
    }

    /**
     * Parses the next line into a document.
     *
     * @return the parsed document, or {@code null} at end of input
     * @throws IOException if reading fails or a column is malformed (for example a column
     *         without a {@code ':'} separator, or a non-numeric link/quote offset)
     */
    @Override
    public Spinn3rDoc read() throws IOException {
      String line = bufferedReader.readLine();
      // End of input must be detected BEFORE degarbling: the degarbler is not expected to
      // accept a null line.
      if (line == null) {
        return null;
      }
      // Degarble the line.
      line = degarbler.degarble(line);
      if (line == null) {
        // Preserved from the original control flow: a null result from the degarbler is
        // treated like end of input.
        return null;
      }

      Spinn3rDoc doc = new Spinn3rDoc();
      doc.version = this.version;
      String[] columns = line.split("\t");
      for (String col : columns) {
        // Limit of 2 so that ':' characters inside the value (e.g. in URLs) are kept.
        String[] tokens = col.split(":", 2);
        try {
          String colType = tokens[0];
          String value = tokens[1];
          if (colType.equals("U")) {
            doc.url = value;
          } else if (colType.equals("D")) {
            // Get date into a standard format: 2014-05-23T21:00:01Z -> 2014-05-23 21:00:01
            value = value.replaceFirst("T", " ").replaceFirst("Z.*", "");
            doc.date = value;
          } else if (colType.equals("T")) {
            doc.title = StringEscapeUtils.unescapeHtml(value);
          } else if (colType.equals("F")) {
            doc.title_raw = value;
          } else if (colType.equals("C")) {
            doc.content = StringEscapeUtils.unescapeHtml(value);
          } else if (colType.equals("H")) {
            doc.content_raw = value;
          } else if (colType.equals("L")) {
            // Limit of 3 so that ':' characters inside the trailing text are kept.
            String[] linkTokens = value.split(":", 3);
            doc.links.add(new Spinn3rDoc.Link(Integer.parseInt(linkTokens[0]),
                Integer.parseInt(linkTokens[1]), linkTokens[2]));
          } else if (colType.equals("Q")) {
            String[] quoteTokens = value.split(":", 3);
            doc.quotes.add(new Spinn3rDoc.Quote(Integer.parseInt(quoteTokens[0]),
                Integer.parseInt(quoteTokens[1]), quoteTokens[2]));
          }
        } catch (Exception e) {
          throw malformedLine(e, line);
        }
      }
      return doc;
    }

  }

  /**
   * Reader for the layout in which each line holds one field as {@code TYPE<TAB>value} and an
   * empty line terminates the current document.
   *
   * <p>Recognised line types: {@code U} (url), {@code D} (date), {@code T} (title,
   * HTML-unescaped), {@code C} (content, HTML-unescaped), {@code L} (link,
   * {@code int<TAB>text}) and {@code Q} (quote, {@code int<TAB>int<TAB>text}). Lines of any
   * other type are ignored.
   */
  public static class MultiLineReader extends Spinn3rDocumentReader {

    public MultiLineReader(InputStream in, Spinn3rDoc.Spinn3rVersion version, String charEncoding,
        UnicodeDegarbler degarbler) throws UnsupportedEncodingException {
      super(in, version, charEncoding, degarbler);
    }

    /**
     * Accumulates field lines into a document until an empty line or end of input is reached.
     *
     * @return the document read; or {@code null} if end of input was reached before any field
     *         line was read. A final document that is not followed by an empty line is still
     *         returned rather than dropped.
     * @throws IOException if reading fails or a field line is malformed (for example a line
     *         without a tab separator, or a non-numeric link/quote offset)
     */
    @Override
    public Spinn3rDoc read() throws IOException {
      Spinn3rDoc doc = new Spinn3rDoc();
      doc.version = this.version;
      // Tracks whether any field line was consumed for this document, so that a final record
      // lacking its terminating empty line is not lost at end of input.
      boolean sawFieldLine = false;
      while (true) {
        String line = bufferedReader.readLine();
        // At end of input the degarbler can NOT handle the null line, therefore we have to
        // exit before the degarble call.
        if (line == null) {
          return sawFieldLine ? doc : null;
        }
        // Degarble the line.
        line = degarbler.degarble(line);
        if (line.isEmpty()) {
          // Empty line: document separator.
          return doc;
        }
        String[] tokens = line.split("\t", 2);
        try {
          String lineType = tokens[0];
          String value = tokens[1];
          if (lineType.equals("U")) {
            doc.url = value;
          } else if (lineType.equals("D")) {
            doc.date = value;
          } else if (lineType.equals("T")) {
            doc.title = StringEscapeUtils.unescapeHtml(value);
          } else if (lineType.equals("C")) {
            doc.content = StringEscapeUtils.unescapeHtml(value);
          } else if (lineType.equals("L")) {
            String[] linkTokens = value.split("\t", 2);
            doc.links.add(new Spinn3rDoc.Link(Integer.parseInt(linkTokens[0]), linkTokens[1]));
          } else if (lineType.equals("Q")) {
            String[] quoteTokens = value.split("\t", 3);
            doc.quotes.add(new Spinn3rDoc.Quote(Integer.parseInt(quoteTokens[0]),
                Integer.parseInt(quoteTokens[1]), quoteTokens[2]));
          }
        } catch (Exception e) {
          throw malformedLine(e, line);
        }
        sawFieldLine = true;
      }
    }
  }

  /**
   * Manual smoke test: calls {@code read()} 100 times on a hard-coded multi-line file and
   * prints each result (including {@code null} once the input is exhausted).
   *
   * @param args unused
   */
  public static void main(String[] args) throws Exception {
    InputStream in = new FileInputStream("/tmp/web-2011-07-23T23-00-00Z.txt");
    try {
      Spinn3rDocumentReader reader = new MultiLineReader(in, Spinn3rVersion.B, "UTF-8",
          new UnicodeDegarbler.NullDegarbler(0.8));
      for (int i = 0; i < 100; ++i) {
        System.out.println(reader.read());
      }
    } finally {
      // Release the file handle even if parsing fails.
      in.close();
    }
  }

}
