package edu.stanford.snap.spinn3rhadoop;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * A single document record together with its links, quotes and detected languages.
 *
 * <p>{@link #toString()} serializes the record into a tab-separated, one-field-per-line text
 * format in which every line starts with a one-letter tag. All numbers are formatted
 * independently of the JVM's default locale so the output is stable across machines.
 */
public class Spinn3rDoc {
  /** Document identifier; required by {@link #toString()}. */
  public String docId = null;
  /** Document URL; required by {@link #toString()}. */
  public String url = null;
  /** Document date as text; required by {@link #toString()}. */
  public String date = null;
  /** Optional title; omitted from the output when {@code null}. */
  public String title = null;
  /** Optional raw title; omitted from the output when {@code null}. */
  public String title_raw = null;
  /** Optional content; omitted from the output when {@code null}. */
  public String content = null;
  /** Optional raw content; omitted from the output when {@code null}. */
  public String content_raw = null;
  /** Links of this document, written one per {@code L} line. */
  public List<Link> links = new ArrayList<Link>();
  /** Quotes of this document, written one per {@code Q} line. */
  public List<Quote> quotes = new ArrayList<Quote>();
  /** Languages of this document, written one per {@code S} line. */
  public List<Lang> langs = new ArrayList<Lang>();
  /** Format version; required by {@link #toString()}. */
  public Spinn3rVersion version = null;
  /** Garbled-text flag, written as the first value of the {@code G} line. */
  public boolean isGarbled;
  /** Non-garbage fraction, written as the second value of the {@code G} line. */
  public double nonGarbageFraction;

  /** Known format versions. */
  public enum Spinn3rVersion {
    A, B, C, D, E;
  }

  /** Content source; {@link #toString()} yields a one-letter code. */
  public enum ContentType {
    WEB, TWITTER, FACEBOOK;

    /**
     * Returns the one-letter code of this content type: {@code "W"}, {@code "T"} or {@code "F"}.
     */
    @Override
    public String toString() {
      switch (this) {
      case WEB:
        return "W";
      case TWITTER:
        return "T";
      case FACEBOOK:
        return "F";
      default:
        throw new IllegalArgumentException();
      }
    }
  }

  /**
   * Replaces every line feed in {@code in} by the literal text {@code " &#10; "} so that the value
   * fits on a single output line.
   *
   * <p>NOTE(review): carriage returns are passed through unchanged; if a downstream reader also
   * splits lines on {@code '\r'}, those would need escaping too -- confirm against the consumers.
   *
   * @throws NullPointerException if {@code in} is {@code null}
   */
  private static String escapeNewLines(String in) {
    // Literal replacement; no regular expression is needed for a fixed character.
    return in.replace("\n", " &#10; ");
  }

  /** Appends one {@code <tag> TAB <value> LF} line to {@code out}. */
  private static void appendLine(StringBuilder out, String tag, String value) {
    out.append(tag).append('\t').append(value).append('\n');
  }

  /**
   * Prints the document in the "full5" multi-line format.
   * We also add languages and the information about garbage text.
   *
   * <p>Lines are emitted in this order: {@code I} (docId), {@code V} (version), {@code S} (one per
   * language), {@code G} (garbled flag and non-garbage fraction, each followed by a tab),
   * {@code U} (url), {@code D} (date), then the optional {@code T} (title), {@code F} (raw title),
   * {@code C} (content), {@code H} (raw content), followed by one {@code L} line per link and one
   * {@code Q} line per quote. Line feeds inside url, date, titles and contents are escaped.
   *
   * @throws IllegalArgumentException if docId, version, url or date is {@code null}
   */
  @Override
  public String toString() {
    // A local buffer is never shared between threads, so the unsynchronized builder suffices.
    StringBuilder str = new StringBuilder();

    if (docId == null) {
      throw new IllegalArgumentException("Document has no docId");
    }
    appendLine(str, "I", docId);

    if (version == null) {
      throw new IllegalArgumentException("Document has no version");
    }
    appendLine(str, "V", version.toString());

    for (Lang l : langs) {
      appendLine(str, "S", l.toString());
    }

    // The trailing tab after the fraction is part of the existing format and is kept as is.
    str.append("G\t").append(isGarbled).append('\t').append(nonGarbageFraction).append('\t')
        .append('\n');

    if (url == null) {
      throw new IllegalArgumentException("Document has no URL");
    }
    appendLine(str, "U", escapeNewLines(url));

    if (date == null) {
      throw new IllegalArgumentException("Document has no date");
    }
    appendLine(str, "D", escapeNewLines(date));

    if (title != null) {
      appendLine(str, "T", escapeNewLines(title));
    }
    if (title_raw != null) {
      appendLine(str, "F", escapeNewLines(title_raw));
    }
    if (content != null) {
      appendLine(str, "C", escapeNewLines(content));
    }
    if (content_raw != null) {
      appendLine(str, "H", escapeNewLines(content_raw));
    }

    for (Link l : links) {
      appendLine(str, "L", l.toString());
    }
    for (Quote q : quotes) {
      appendLine(str, "Q", q.toString());
    }
    return str.toString();
  }

  /** Adds a language with the given probability to {@link #langs}. */
  public void appendLang(String lang, double prob) {
    this.langs.add(new Lang(lang, prob));
  }

  /** A hyperlink found in a document. */
  public static class Link {
    // A value of -1 for startPos/length means that the link appears in the title, not in the
    // content.
    public int startPos;
    // For the older versions (up to, and incl., full5), this is empty, i.e., null.
    public Integer length;
    public String url;

    /** Creates a link without a length ({@code length} is {@code null}). */
    public Link(int startPos, String url) {
      this.startPos = startPos;
      this.length = null;
      this.url = url;
    }

    /** Creates a link with an explicit length. */
    public Link(int startPos, int length, String url) {
      this.startPos = startPos;
      this.length = length;
      this.url = url;
    }

    /**
     * Returns {@code startPos TAB length TAB url}; the length column is empty when
     * {@code length} is {@code null}, and line feeds in the URL are escaped.
     */
    @Override
    public String toString() {
      String lengthText = (length == null) ? "" : length.toString();
      // Locale.ROOT keeps the digits independent of the JVM's default locale.
      return String.format(Locale.ROOT, "%d\t%s\t%s", startPos, lengthText, escapeNewLines(url));
    }
  }

  /** A quotation found in a document. */
  public static class Quote {
    // A value of -1 for startPos/length presumably means that the quote appears in the title,
    // not in the content (the original comment was copied from Link and said "link").
    public int startPos;
    public int length;
    public String text;

    public Quote(int startPos, int length, String text) {
      this.startPos = startPos;
      this.length = length;
      this.text = text;
    }

    /** Returns {@code startPos TAB length TAB text} with line feeds in the text escaped. */
    @Override
    public String toString() {
      // Locale.ROOT keeps the digits independent of the JVM's default locale.
      return String.format(Locale.ROOT, "%d\t%d\t%s", startPos, length, escapeNewLines(text));
    }
  }

  // Class for storing languages about this record.
  // NOTE(review): this is a non-static inner class and therefore holds a reference to its
  // enclosing Spinn3rDoc; it is left that way so existing "doc.new Lang(..)" callers keep compiling.
  public class Lang {
    public String lang;
    public double prob;

    public Lang(String lang, double prob) {
      this.lang = lang;
      this.prob = prob;
    }

    /** Returns {@code lang TAB prob} with the probability printed with six decimals. */
    @Override
    public String toString() {
      // Without an explicit locale, "%f" would use the default locale's decimal separator
      // (e.g. "0,500000" under de_DE), making the serialized record machine dependent.
      return String.format(Locale.ROOT, "%s\t%f", lang, prob);
    }
  }
}
