/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.snap.spinn3rhadoop;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import edu.stanford.snap.spinn3rhadoop.Spinn3rDoc;
import edu.stanford.snap.spinn3rhadoop.Spinn3rDocumentReader;
import edu.stanford.snap.spinn3rhadoop.UnicodeDegarbler;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class Spinn3rToHadoopWriter {
    private Spinn3rDocumentReader docReader;
    private String hour;
    private Spinn3rDoc.ContentType contentType;
    private BufferedWriter bufferedWriter;
    private UnicodeDegarbler degarbler;
    private static final Logger LOGGER = Logger.getLogger(Spinn3rToHadoopWriter.class.getName());
    private static final double MIN_ASCII_RATIO = 0.8;
    private static final double MIN_NON_QUESTIONMARK_RATIO = 0.8;

    public Spinn3rToHadoopWriter(InputStream in, OutputStream out, String hour, Spinn3rDoc.ContentType contentType, Spinn3rDoc.Spinn3rVersion spinn3rVersion, String langProfileBaseDir, String unicodeDegarblerTable) throws LangDetectException, IOException {
        String profiles;
        try {
            this.bufferedWriter = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
        }
        catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        this.hour = hour;
        this.contentType = contentType;
        switch (contentType) {
            case TWITTER: 
            case FACEBOOK: {
                profiles = "profiles.sm";
                break;
            }
            default: {
                profiles = "profiles";
            }
        }
        DetectorFactory.loadProfile((String)(String.valueOf(langProfileBaseDir) + "/" + profiles));
        switch (spinn3rVersion) {
            case A: {
                this.degarbler = new UnicodeDegarbler.Latin1ToUtf8Degarbler(unicodeDegarblerTable, 0.8);
                this.docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "UTF-8", this.degarbler);
                break;
            }
            case B: {
                this.degarbler = new UnicodeDegarbler.NullDegarbler(0.8);
                this.docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "ISO-8859-1", this.degarbler);
                break;
            }
            case C: {
                this.degarbler = new UnicodeDegarbler.NullDegarbler(0.8);
                this.docReader = new Spinn3rDocumentReader.MultiLineReader(in, spinn3rVersion, "UTF-8", this.degarbler);
                break;
            }
            case D: 
            case E: {
                this.degarbler = new UnicodeDegarbler.NullDegarbler(0.8);
                this.docReader = new Spinn3rDocumentReader.SingleLineReader(in, spinn3rVersion, "UTF-8", this.degarbler);
            }
        }
    }

    public long write() throws IOException, IllegalArgumentException {
        long id = 0L;
        long numBytes = 0L;
        int numDuplicates = 0;
        int numDocuments = 0;
        HashSet<String> seenUrls = new HashSet<String>();
        HashSet<Integer> seenContentHashes = new HashSet<Integer>();
        while (true) {
            Spinn3rDoc doc;
            if ((doc = this.docReader.read()) == null) break;
            ++numDocuments;
            String hashableContent = String.valueOf(doc.url) + "\t" + (doc.content == null ? "" : doc.content);
            if (seenUrls.contains(doc.url) && seenContentHashes.contains(hashableContent.hashCode())) {
                ++numDuplicates;
                continue;
            }
            seenUrls.add(doc.url);
            seenContentHashes.add(hashableContent.hashCode());
            doc.docId = String.format("%s_%08d_%s", new Object[]{this.hour, id, this.contentType});
            ++id;
            try {
                String text = String.valueOf(doc.title == null ? "" : doc.title) + " " + (doc.content == null ? "" : doc.content);
                doc.isGarbled = this.degarbler.isGarbled(text);
                doc.nonGarbageFraction = this.degarbler.getNonGarbageFraction(text);
                List<Language> lngs = this.detectLang(text);
                for (Language l : lngs) {
                    doc.appendLang(l.lang, l.prob);
                }
            }
            catch (LangDetectException e) {
                if (!e.getMessage().contains("no features in text")) {
                    LOGGER.severe(String.valueOf(((Object)((Object)e)).getClass().getName()) + ": " + e.getMessage());
                }
            }
            catch (IllegalArgumentException e) {
                LOGGER.severe(String.valueOf(e.getClass().getName()) + ": " + e.getMessage());
            }
            String toWrite = String.valueOf(doc.toString()) + "\n";
            this.bufferedWriter.write(toWrite);
            numBytes += (long)toWrite.getBytes("UTF-8").length;
        }
        this.bufferedWriter.close();
        DecimalFormat df = new DecimalFormat("###.##");
        LOGGER.log(Level.INFO, "Number of duplicate documents: " + numDuplicates + ". Number of all documents: " + numDocuments + ". Duplicates percentage: " + df.format((float)numDuplicates / (float)numDocuments * 100.0f) + "%.");
        return numBytes;
    }

    public List<Language> detectLang(String text) throws LangDetectException, IllegalArgumentException {
        ArrayList langs;
        Detector detector = DetectorFactory.create();
        detector.append(text);
        try {
            langs = detector.getProbabilities();
        }
        catch (LangDetectException e) {
            throw new LangDetectException(e.getCode(), String.valueOf(e.getMessage()) + ": " + text);
        }
        if (langs.isEmpty()) {
            throw new IllegalArgumentException("No valid language detected: " + text);
        }
        return langs;
    }

    private static OutputStream getHdfsOutputStream(String fs, String dir, String file) throws IOException, URISyntaxException {
        Path filePath;
        Path dirPath;
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get((URI)new URI(fs), (Configuration)conf);
        if (!hdfs.exists(dirPath = new Path(String.valueOf(fs) + dir))) {
            hdfs.mkdirs(dirPath);
        }
        if (hdfs.exists(filePath = new Path(String.valueOf(fs) + dir + file))) {
            hdfs.delete(filePath, true);
        }
        return hdfs.create(filePath);
    }

    private static void printUsage() {
    }

    private static void safeSystemErr(String s) {
        System.err.format(s, new Object[0]);
        System.err.flush();
    }

    /*
     * WARNING - void declaration
     * Enabled force condition propagation
     * Lifted jumps to return sites
     */
    public static void main(String[] args) {
        boolean DEBUG = true;
        if (DEBUG && args.length == 0) {
            args = new String[]{"/Users/Niko/Documents/workspace/spinn3rhadoop_java/datasets/web/2010-08/web-2010-08-01T00-00-00Z.txt", "/Users/Niko/Documents/workspace/spinn3rhadoop_java/web-2010-brokenLines.out", "2010080100", "web", "C", "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/langdetect", "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/unicode_error_table.tsv", "/Users/Niko/Documents/workspace/spinn3rhadoop_java/exportv2/include/logging.properties"};
        }
        String host = "unknown";
        try {
            host = InetAddress.getLocalHost().getHostName();
        }
        catch (UnknownHostException unknownHostException) {
            // empty catch block
        }
        try {
            void var3_19;
            Spinn3rDoc.ContentType contentType;
            String arg_in = args[0];
            String arg_out = args[1];
            String arg_hour = args[2];
            String arg_contentType = args[3];
            String arg_spinn3rVersion = args[4];
            String arg_langProfileBaseDir = args[5];
            String arg_unicodeDegarblerTable = args[6];
            String arg_loggingPropertiesFile = args[7];
            LogManager.getLogManager().readConfiguration(new FileInputStream(arg_loggingPropertiesFile));
            LOGGER.log(Level.INFO, "\n*\nCurrent input file is: '" + args[0] + "'.\n*");
            InputStream in = arg_in.equals("System.in") ? System.in : new FileInputStream(arg_in);
            if (arg_out.equals("System.out")) {
                PrintStream printStream = System.out;
            } else if (arg_out.startsWith("hdfs://")) {
                Pattern p = Pattern.compile("(hdfs://[^:]+:[0-9]+)(/.*/)(.*)");
                Matcher m = p.matcher(arg_out);
                if (!m.matches()) throw new IllegalArgumentException("Illegal HDFS location");
                OutputStream outputStream = Spinn3rToHadoopWriter.getHdfsOutputStream(m.group(1), m.group(2), m.group(3));
            } else {
                FileOutputStream fileOutputStream = new FileOutputStream(arg_out);
            }
            if (!arg_hour.matches("\\d{10}")) {
                throw new IllegalArgumentException("Illegal hour specification; must be YYYYMMDDHH");
            }
            String hour = arg_hour;
            if (arg_contentType.toLowerCase().equals("twitter")) {
                contentType = Spinn3rDoc.ContentType.TWITTER;
            } else if (arg_contentType.toLowerCase().equals("facebook")) {
                contentType = Spinn3rDoc.ContentType.FACEBOOK;
            } else {
                if (!arg_contentType.toLowerCase().equals("web")) throw new IllegalArgumentException("Illegal content type; must be web, twitter, or facebook");
                contentType = Spinn3rDoc.ContentType.WEB;
            }
            if (!arg_spinn3rVersion.matches("^[ABCDE]$")) {
                throw new IllegalArgumentException("Illegal Spinn3r version; must be A, B, C, D, or E");
            }
            Spinn3rDoc.Spinn3rVersion spinn3rVersion = Spinn3rDoc.Spinn3rVersion.valueOf(arg_spinn3rVersion);
            String unicodeDegarblerTable = arg_unicodeDegarblerTable;
            String langProfileBaseDir = arg_langProfileBaseDir;
            long before = System.currentTimeMillis();
            Spinn3rToHadoopWriter writer = new Spinn3rToHadoopWriter(in, (OutputStream)var3_19, hour, contentType, spinn3rVersion, langProfileBaseDir, unicodeDegarblerTable);
            long numBytes = writer.write();
            File arg_in_f = new File(arg_in);
            String message = String.format("__results__\t__SUCCESS__\t%s\t__%s__\t%.1fMB\t%.1fs\n", arg_in_f.getName(), host, (double)numBytes / 1024.0 / 1024.0, (double)(System.currentTimeMillis() - before) / 1000.0);
            Spinn3rToHadoopWriter.safeSystemErr(message);
            return;
        }
        catch (Exception e) {
            File arg_in_f = new File(args[0]);
            String message = String.format("__results__\t__ERROR__\t%s\t__%s__\n", arg_in_f.getName(), host);
            Spinn3rToHadoopWriter.safeSystemErr(message);
            LOGGER.log(Level.SEVERE, "ERROR while pocessing file '" + args[0] + "'.");
            LOGGER.log(Level.SEVERE, e.getMessage());
            LOGGER.log(Level.SEVERE, Arrays.toString(e.getStackTrace()));
            Spinn3rToHadoopWriter.printUsage();
            return;
        }
    }
}

