/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParseSegment
extends Configured
implements Tool,
Mapper<WritableComparable<?>, Content, Text, ParseImpl>,
Reducer<Text, Writable, Text, Writable> {
    public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
    public static final String SKIP_TRUNCATED = "parser.skip.truncated";
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private boolean skipTruncated;
    private Text newKey = new Text();

    public ParseSegment() {
        this(null);
    }

    public ParseSegment(Configuration conf) {
        super(conf);
    }

    public void configure(JobConf job) {
        this.setConf((Configuration)job);
        this.scfilters = new ScoringFilters((Configuration)job);
        this.skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
    }

    public void close() {
    }

    public void map(WritableComparable<?> key, Content content, OutputCollector<Text, ParseImpl> output, Reporter reporter) throws IOException {
        int status;
        if (key instanceof Text) {
            this.newKey.set(key.toString());
            key = this.newKey;
        }
        if ((status = Integer.parseInt(content.getMetadata().get("_fst_"))) != 33) {
            LOG.debug("Skipping " + key + " as content is not fetched successfully");
            return;
        }
        if (this.skipTruncated && ParseSegment.isTruncated(content)) {
            return;
        }
        ParseResult parseResult = null;
        try {
            if (this.parseUtil == null) {
                this.parseUtil = new ParseUtil(this.getConf());
            }
            parseResult = this.parseUtil.parse(content);
        }
        catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException((Throwable)e));
            return;
        }
        for (Map.Entry<Text, Parse> entry : parseResult) {
            long start;
            Parse parse;
            Text url;
            block10: {
                url = entry.getKey();
                parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                start = System.currentTimeMillis();
                reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1L);
                if (!parseStatus.isSuccess()) {
                    LOG.warn("Error parsing: " + key + ": " + parseStatus);
                    parse = parseStatus.getEmptyParse(this.getConf());
                }
                parse.getData().getContentMeta().set("nutch.segment.name", this.getConf().get("nutch.segment.name"));
                byte[] signature = SignatureFactory.getSignature(this.getConf()).calculate(content, parse);
                parse.getData().getContentMeta().set("nutch.content.digest", StringUtil.toHexString(signature));
                try {
                    this.scfilters.passScoreAfterParsing(url, content, parse);
                }
                catch (ScoringFilterException e) {
                    if (!LOG.isWarnEnabled()) break block10;
                    LOG.warn("Error passing score: " + url + ": " + e.getMessage());
                }
            }
            long end = System.currentTimeMillis();
            LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url);
            output.collect((Object)url, (Object)new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical()));
        }
    }

    public static boolean isTruncated(Content content) {
        int inHeaderSize;
        byte[] contentBytes = content.getContent();
        if (contentBytes == null) {
            return false;
        }
        Metadata metadata = content.getMetadata();
        if (metadata == null) {
            return false;
        }
        String lengthStr = metadata.get("Content-Length");
        if (lengthStr != null) {
            lengthStr = lengthStr.trim();
        }
        if (StringUtil.isEmpty(lengthStr)) {
            return false;
        }
        String url = content.getUrl();
        try {
            inHeaderSize = Integer.parseInt(lengthStr);
        }
        catch (NumberFormatException e) {
            LOG.warn("Wrong contentlength format for " + url, (Throwable)e);
            return false;
        }
        int actualSize = contentBytes.length;
        if (inHeaderSize > actualSize) {
            LOG.info(url + " skipped. Content of size " + inHeaderSize + " was truncated to " + actualSize);
            return true;
        }
        if (LOG.isDebugEnabled()) {
            LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
        }
        return false;
    }

    public void reduce(Text key, Iterator<Writable> values, OutputCollector<Text, Writable> output, Reporter reporter) throws IOException {
        output.collect((Object)key, (Object)values.next());
    }

    public void parse(Path segment) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ParseSegment: starting at " + sdf.format(start));
            LOG.info("ParseSegment: segment: " + segment);
        }
        NutchJob job = new NutchJob(this.getConf());
        job.setJobName("parse " + segment);
        FileInputFormat.addInputPath((JobConf)job, (Path)new Path(segment, "content"));
        job.set("nutch.segment.name", segment.getName());
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(ParseSegment.class);
        job.setReducerClass(ParseSegment.class);
        FileOutputFormat.setOutputPath((JobConf)job, (Path)segment);
        job.setOutputFormat(ParseOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ParseImpl.class);
        JobClient.runJob((JobConf)job);
        long end = System.currentTimeMillis();
        LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run((Configuration)NutchConfiguration.create(), (Tool)new ParseSegment(), (String[])args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {
        String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
        if (args.length == 0) {
            System.err.println(usage);
            System.exit(-1);
        }
        if (args.length > 1) {
            for (int i = 1; i < args.length; ++i) {
                String param = args[i];
                if ("-nofilter".equalsIgnoreCase(param)) {
                    this.getConf().setBoolean("parse.filter.urls", false);
                    continue;
                }
                if (!"-nonormalize".equalsIgnoreCase(param)) continue;
                this.getConf().setBoolean("parse.normalize.urls", false);
            }
        }
        Path segment = new Path(args[0]);
        this.parse(segment);
        return 0;
    }
}

