/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParseOutputFormat
implements OutputFormat<Text, Parse> {
    private static final Logger LOG = LoggerFactory.getLogger(ParseOutputFormat.class);
    private URLFilters filters;
    private URLNormalizers normalizers;
    private ScoringFilters scfilters;

    public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
        Path out = FileOutputFormat.getOutputPath((JobConf)job);
        if (out == null && job.getNumReduceTasks() != 0) {
            throw new InvalidJobConfException("Output directory not set in JobConf.");
        }
        if (fs == null) {
            fs = out.getFileSystem((Configuration)job);
        }
        if (fs.exists(new Path(out, "crawl_parse"))) {
            throw new IOException("Segment already parsed!");
        }
    }

    public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException {
        if (job.getBoolean("parse.filter.urls", true)) {
            this.filters = new URLFilters((Configuration)job);
        }
        if (job.getBoolean("parse.normalize.urls", true)) {
            this.normalizers = new URLNormalizers((Configuration)job, "outlink");
        }
        this.scfilters = new ScoringFilters((Configuration)job);
        final int interval = job.getInt("db.fetch.interval.default", 2592000);
        final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
        int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
        final boolean isParsing = job.getBoolean("fetcher.parse", true);
        final int maxOutlinks = maxOutlinksPerPage < 0 ? Integer.MAX_VALUE : maxOutlinksPerPage;
        SequenceFile.CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType((JobConf)job);
        Path out = FileOutputFormat.getOutputPath((JobConf)job);
        Path text = new Path(new Path(out, "parse_text"), name);
        Path data = new Path(new Path(out, "parse_data"), name);
        Path crawl = new Path(new Path(out, "crawl_parse"), name);
        final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");
        final MapFile.Writer textOut = new MapFile.Writer((Configuration)job, fs, text.toString(), Text.class, ParseText.class, SequenceFile.CompressionType.RECORD, progress);
        final MapFile.Writer dataOut = new MapFile.Writer((Configuration)job, fs, data.toString(), Text.class, ParseData.class, compType, progress);
        final SequenceFile.Writer crawlOut = SequenceFile.createWriter((FileSystem)fs, (Configuration)job, (Path)crawl, Text.class, CrawlDatum.class, (SequenceFile.CompressionType)compType, (Progressable)progress);
        return new RecordWriter<Text, Parse>(){

            /*
             * WARNING - void declaration
             */
            public void write(Text key, Parse parse) throws IOException {
                ParseData parseData;
                String fromHost;
                String fromUrl;
                block27: {
                    byte[] signature;
                    fromUrl = key.toString();
                    fromHost = null;
                    textOut.append((WritableComparable)key, (Writable)new ParseText(parse.getText()));
                    parseData = parse.getData();
                    String sig = parseData.getContentMeta().get("nutch.content.digest");
                    if (sig != null && (signature = StringUtil.fromHexString(sig)) != null) {
                        CrawlDatum d = new CrawlDatum(65, 0);
                        d.setSignature(signature);
                        crawlOut.append((Writable)key, (Writable)d);
                    }
                    CrawlDatum parseMDCrawlDatum = null;
                    for (String mdname : parseMDtoCrawlDB) {
                        String mdvalue = parse.getData().getParseMeta().get(mdname);
                        if (mdvalue == null) continue;
                        if (parseMDCrawlDatum == null) {
                            parseMDCrawlDatum = new CrawlDatum(68, 0);
                        }
                        parseMDCrawlDatum.getMetaData().put((Writable)new Text(mdname), (Writable)new Text(mdvalue));
                    }
                    if (parseMDCrawlDatum != null) {
                        crawlOut.append((Writable)key, parseMDCrawlDatum);
                    }
                    try {
                        ParseStatus pstatus = parseData.getStatus();
                        if (pstatus == null || !pstatus.isSuccess() || pstatus.getMinorCode() != 100) break block27;
                        String newUrl = pstatus.getMessage();
                        int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                        try {
                            if (ParseOutputFormat.this.normalizers != null) {
                                newUrl = ParseOutputFormat.this.normalizers.normalize(newUrl, "fetcher");
                            }
                        }
                        catch (MalformedURLException mfue) {
                            newUrl = null;
                        }
                        if (ParseOutputFormat.this.filters != null && newUrl != null) {
                            newUrl = ParseOutputFormat.this.filters.filter(newUrl);
                        }
                        String url = key.toString();
                        if (newUrl != null && !newUrl.equals(url)) {
                            String reprUrl = URLUtil.chooseRepr(url, newUrl, refreshTime < 5);
                            CrawlDatum newDatum = new CrawlDatum();
                            newDatum.setStatus(67);
                            if (reprUrl != null && !reprUrl.equals(newUrl)) {
                                newDatum.getMetaData().put((Writable)Nutch.WRITABLE_REPR_URL_KEY, (Writable)new Text(reprUrl));
                            }
                            crawlOut.append((Writable)new Text(newUrl), (Writable)newDatum);
                        }
                    }
                    catch (URLFilterException e) {
                        // empty catch block
                    }
                }
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks) {
                    try {
                        fromHost = new URL(fromUrl).getHost().toLowerCase();
                    }
                    catch (MalformedURLException e) {
                        fromHost = null;
                    }
                } else {
                    fromHost = null;
                }
                int validCount = 0;
                CrawlDatum adjust = null;
                ArrayList<Map.Entry<Text, CrawlDatum>> targets = new ArrayList<Map.Entry<Text, CrawlDatum>>(outlinksToStore);
                ArrayList<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; ++i) {
                    void var15_23;
                    String string;
                    String string2 = links[i].getToUrl();
                    if (!isParsing && (string = ParseOutputFormat.filterNormalize(fromUrl, string2, fromHost, ignoreExternalLinks, ParseOutputFormat.this.filters, ParseOutputFormat.this.normalizers)) == null) continue;
                    CrawlDatum target = new CrawlDatum(67, interval);
                    Text targetUrl = new Text((String)var15_23);
                    MapWritable outlinkMD = links[i].getMetadata();
                    if (outlinkMD != null) {
                        target.getMetaData().putAll((Map)outlinkMD);
                    }
                    try {
                        ParseOutputFormat.this.scfilters.initialScore(targetUrl, target);
                    }
                    catch (ScoringFilterException e) {
                        LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                        target.setScore(0.0f);
                    }
                    targets.add(new SimpleEntry(targetUrl, target));
                    links[i].setUrl((String)var15_23);
                    outlinkList.add(links[i]);
                    ++validCount;
                }
                try {
                    adjust = ParseOutputFormat.this.scfilters.distributeScoreToOutlinks(key, parseData, targets, null, links.length);
                }
                catch (ScoringFilterException e) {
                    LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
                }
                for (Map.Entry entry : targets) {
                    crawlOut.append((Writable)entry.getKey(), (Writable)entry.getValue());
                }
                if (adjust != null) {
                    crawlOut.append((Writable)key, (Writable)adjust);
                }
                Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
                parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
                dataOut.append((WritableComparable)key, (Writable)parseData);
                if (!parse.isCanonical()) {
                    CrawlDatum crawlDatum = new CrawlDatum();
                    crawlDatum.setStatus(33);
                    String timeString = parse.getData().getContentMeta().get("_ftk_");
                    try {
                        crawlDatum.setFetchTime(Long.parseLong(timeString));
                    }
                    catch (Exception e) {
                        LOG.warn("Can't read fetch time for: " + key);
                        crawlDatum.setFetchTime(System.currentTimeMillis());
                    }
                    crawlOut.append((Writable)key, (Writable)crawlDatum);
                }
            }

            public void close(Reporter reporter) throws IOException {
                textOut.close();
                dataOut.close();
                crawlOut.close();
            }
        };
    }

    public static String filterNormalize(String fromUrl, String toUrl, String fromHost, boolean ignoreExternalLinks, URLFilters filters, URLNormalizers normalizers) {
        if (fromUrl.equals(toUrl)) {
            return null;
        }
        if (ignoreExternalLinks) {
            String toHost;
            try {
                toHost = new URL(toUrl).getHost().toLowerCase();
            }
            catch (MalformedURLException e) {
                toHost = null;
            }
            if (toHost == null || !toHost.equals(fromHost)) {
                return null;
            }
        }
        try {
            if (normalizers != null) {
                toUrl = normalizers.normalize(toUrl, "outlink");
            }
            if (filters != null) {
                toUrl = filters.filter(toUrl);
            }
            if (toUrl == null) {
                return null;
            }
        }
        catch (Exception e) {
            return null;
        }
        return toUrl;
    }

    private static class SimpleEntry
    implements Map.Entry<Text, CrawlDatum> {
        private Text key;
        private CrawlDatum value;

        public SimpleEntry(Text key, CrawlDatum value) {
            this.key = key;
            this.value = value;
        }

        @Override
        public Text getKey() {
            return this.key;
        }

        @Override
        public CrawlDatum getValue() {
            return this.value;
        }

        @Override
        public CrawlDatum setValue(CrawlDatum value) {
            this.value = value;
            return this.value;
        }
    }
}

