/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools.arc;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.tools.arc.ArcInputFormat;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool and map task that reads arc records (via
 * {@link ArcInputFormat}) and writes them out as a segment using
 * {@link FetcherOutputFormat}.
 *
 * <p>The same class serves as the {@link Tool} that configures and submits the
 * job ({@link #run(String[])} / {@link #createSegments(Path, Path)}) and as the
 * {@link Mapper} that converts each record ({@link #map}).
 */
public class ArcSegmentCreator
extends Configured
implements Tool,
Mapper<Text, BytesWritable, Text, NutchWritable> {
    public static final Logger LOG = LoggerFactory.getLogger(ArcSegmentCreator.class);
    public static final String URL_VERSION = "arc.url.version";

    /** Configuration / metadata key carrying the name of the segment being written. */
    private static final String SEGMENT_NAME_KEY = "nutch.segment.name";
    /** Parse metadata key under which the hex-encoded content signature is stored. */
    private static final String SIGNATURE_KEY = "nutch.content.digest";
    /** Parse metadata key under which the datum's fetch time is stored. */
    private static final String FETCH_TIME_KEY = "_ftk_";
    /** Scope name handed to the URL normalizers. */
    private static final String NORMALIZER_SCOPE = "fetcher";
    /** Number of whitespace-separated header fields {@link #map} needs (it reads indices 0, 2 and 3). */
    private static final int MIN_HEADER_FIELDS = 4;

    // NOTE(review): the numeric status codes below were inlined by the decompiler;
    // they presumably mirror CrawlDatum.STATUS_* constants - confirm before renaming.
    /** Status the datum is created with before conversion. */
    private static final int INITIAL_DATUM_STATUS = 2;
    /** Status written when a record was converted without an exception. */
    private static final int CONVERSION_OK_STATUS = 33;
    /** Status written when converting a record threw. */
    private static final int CONVERSION_FAILED_STATUS = 34;

    private JobConf jobConf;
    private URLFilters urlFilters;
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private URLNormalizers normalizers;
    private int interval;

    /**
     * Formatter for segment names. {@link SimpleDateFormat} is not thread-safe;
     * it is only touched from the synchronized {@link #generateSegmentName()}.
     */
    private static final SimpleDateFormat SEGMENT_NAME_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss");

    public ArcSegmentCreator() {
    }

    /**
     * Creates an instance bound to the given configuration.
     *
     * @param conf configuration to store via {@link #setConf(Configuration)}
     */
    public ArcSegmentCreator(Configuration conf) {
        this.setConf(conf);
    }

    /**
     * Generates a segment name from the current time in
     * {@code yyyyMMddHHmmss} format.
     *
     * <p>The method sleeps for one second first; combined with being
     * synchronized and the one-second resolution of the format, successive
     * calls within one JVM yield different names. If the sleep is interrupted
     * the interrupt flag is restored and a name is still returned.
     *
     * @return the generated segment name
     */
    public static synchronized String generateSegmentName() {
        try {
            Thread.sleep(1000L);
        }
        catch (InterruptedException e) {
            // Do not swallow the interruption: let the caller observe it.
            Thread.currentThread().interrupt();
        }
        return SEGMENT_NAME_FORMAT.format(new Date(System.currentTimeMillis()));
    }

    /**
     * Initializes the URL filters, scoring filters, parser and normalizers
     * from the job configuration and reads the default fetch interval
     * ({@code db.fetch.interval.default}, falling back to 2592000).
     *
     * @param job the job configuration supplied by the framework
     */
    @Override
    public void configure(JobConf job) {
        this.jobConf = job;
        this.urlFilters = new URLFilters(this.jobConf);
        this.scfilters = new ScoringFilters(this.jobConf);
        this.parseUtil = new ParseUtil(this.jobConf);
        this.normalizers = new URLNormalizers(this.jobConf, NORMALIZER_SCOPE);
        this.interval = this.jobConf.getInt("db.fetch.interval.default", 2592000);
    }

    /** Nothing to release. */
    @Override
    public void close() {
    }

    /**
     * Updates the datum and, when content is available, parses it and emits
     * the datum, the content and every parse produced for it.
     *
     * <p>Failures of the scoring filters and of the parser are logged and do
     * not abort the record. An {@link IOException} raised while collecting is
     * logged as an error and collection stops for this record.
     *
     * <p>NOTE(review): when {@code content} is {@code null} nothing is
     * collected - only the datum's status, fetch time and protocol status are
     * updated. Confirm this is intended for the failure path in {@link #map}.
     *
     * @param output      collector receiving the emitted records
     * @param segmentName segment name recorded in content and parse metadata
     * @param key         URL of the record, used as output key
     * @param datum       crawl datum to update and emit
     * @param content     record content, or {@code null} if none could be built
     * @param pstatus     protocol status to attach to the datum, may be {@code null}
     * @param status      status code to set on the datum
     * @return the parse status of the parse stored under the content's URL, or
     *         {@code null} when there is no content or no such parse
     */
    private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName, Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status) {
        datum.setStatus(status);
        datum.setFetchTime(System.currentTimeMillis());
        if (pstatus != null) {
            datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
        }
        if (content == null) {
            return null;
        }

        Metadata metadata = content.getMetadata();
        metadata.set(SEGMENT_NAME_KEY, segmentName);
        try {
            this.scfilters.passScoreBeforeParsing(key, datum, content);
        }
        catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
        }

        ParseResult parseResult = null;
        try {
            parseResult = this.parseUtil.parse(content);
        }
        catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
        }

        if (parseResult == null) {
            // No parse available: sign the content against an empty parse instead.
            byte[] signature = SignatureFactory.getSignature(this.getConf()).calculate(content, new ParseStatus().getEmptyParse(this.getConf()));
            datum.setSignature(signature);
        }

        try {
            output.collect(key, new NutchWritable(datum));
            output.collect(key, new NutchWritable(content));
            if (parseResult != null) {
                for (Map.Entry<Text, Parse> entry : parseResult) {
                    Text url = entry.getKey();
                    Parse parse = entry.getValue();
                    ParseStatus parseStatus = parse.getData().getStatus();
                    if (!parseStatus.isSuccess()) {
                        LOG.warn("Error parsing: " + key + ": " + parseStatus);
                        parse = parseStatus.getEmptyParse(this.getConf());
                    }
                    byte[] signature = SignatureFactory.getSignature(this.getConf()).calculate(content, parse);
                    Metadata parseMeta = parse.getData().getContentMeta();
                    parseMeta.set(SEGMENT_NAME_KEY, segmentName);
                    parseMeta.set(SIGNATURE_KEY, StringUtil.toHexString(signature));
                    parseMeta.set(FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                    if (url.equals(key)) {
                        // Only the parse of the record's own URL determines the datum signature.
                        // The datum was already collected above, so this update relies on
                        // how NutchWritable/the collector handle the wrapped instance.
                        datum.setSignature(signature);
                    }
                    try {
                        this.scfilters.passScoreAfterParsing(url, content, parse);
                    }
                    catch (Exception e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                        }
                    }
                    output.collect(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical())));
                }
            }
        }
        catch (IOException e) {
            if (LOG.isErrorEnabled()) {
                LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
            }
        }

        if (parseResult != null && !parseResult.isEmpty()) {
            Parse p = parseResult.get(content.getUrl());
            if (p != null) {
                return p.getData().getStatus();
            }
        }
        return null;
    }

    /**
     * Logs a failed record conversion at info level.
     *
     * @param url URL of the record that failed
     * @param t   the failure
     */
    private void logError(Text url, Throwable t) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Conversion of " + url + " failed with: " + StringUtils.stringifyException(t));
        }
    }

    /**
     * Converts one arc record into segment entries.
     *
     * <p>The key is split on whitespace; field 0 is used as the URL, field 2
     * as the version and field 3 as the content type. Records whose URL starts
     * with {@code filedesc://} are ignored, as are records whose header has
     * fewer than four fields and records whose URL is rejected by the
     * normalizers or filters.
     *
     * @param key      the record header line
     * @param bytes    the raw record content
     * @param output   collector receiving the emitted records
     * @param reporter used to report progress
     * @throws IOException declared by the {@link Mapper} contract
     */
    @Override
    public void map(Text key, BytesWritable bytes, OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
        String[] headers = key.toString().split("\\s+");
        if (headers.length < MIN_HEADER_FIELDS) {
            // A short or blank header would otherwise fail the whole task with
            // an ArrayIndexOutOfBoundsException.
            if (LOG.isWarnEnabled()) {
                LOG.warn("Skipping malformed arc record header: " + key);
            }
            return;
        }
        String urlStr = headers[0];
        String version = headers[2];
        String contentType = headers[3];
        if (urlStr.startsWith("filedesc://")) {
            LOG.info("Ignoring file header: " + urlStr);
            return;
        }
        LOG.info("Processing: " + urlStr);

        Text url = new Text();
        CrawlDatum datum = new CrawlDatum(INITIAL_DATUM_STATUS, this.interval, 1.0f);
        String segmentName = this.getConf().get(SEGMENT_NAME_KEY);
        try {
            urlStr = this.normalizers.normalize(urlStr, NORMALIZER_SCOPE);
            urlStr = this.urlFilters.filter(urlStr);
        }
        catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                // 'url' has not been populated yet, so log the string form.
                LOG.warn("Skipping " + urlStr + ":" + e);
            }
            urlStr = null;
        }
        if (urlStr == null) {
            return;
        }

        url.set(urlStr);
        try {
            // getBytes() exposes the backing buffer, which may be longer than the
            // valid data; copy only the first getLength() bytes.
            byte[] raw = Arrays.copyOf(bytes.getBytes(), bytes.getLength());
            Content content = new Content(urlStr, urlStr, raw, contentType, new Metadata(), this.getConf());
            content.getMetadata().set(URL_VERSION, version);
            this.output(output, segmentName, url, datum, content, ProtocolStatus.STATUS_SUCCESS, CONVERSION_OK_STATUS);
            reporter.progress();
        }
        catch (Throwable t) {
            // Deliberately broad: a single bad record must not fail the task.
            this.logError(url, t);
            this.output(output, segmentName, url, datum, null, null, CONVERSION_FAILED_STATUS);
        }
    }

    /**
     * Configures and runs the job that converts the given arc files into a new
     * segment directory below {@code segmentsOutDir}. The directory name is
     * produced by {@link #generateSegmentName()}.
     *
     * @param arcFiles       input path containing the arc files
     * @param segmentsOutDir parent directory in which the segment is created
     * @throws IOException if submitting or running the job fails
     */
    public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ArcSegmentCreator: starting at " + timestampFormat.format(start));
            LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
        }

        NutchJob job = new NutchJob(this.getConf());
        job.setJobName("ArcSegmentCreator " + arcFiles);
        String segName = ArcSegmentCreator.generateSegmentName();
        job.set(SEGMENT_NAME_KEY, segName);

        FileInputFormat.addInputPath(job, arcFiles);
        job.setInputFormat(ArcInputFormat.class);
        job.setMapperClass(ArcSegmentCreator.class);
        FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
        job.setOutputFormat(FetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NutchWritable.class);

        JobClient.runJob(job);

        long end = System.currentTimeMillis();
        LOG.info("ArcSegmentCreator: finished at " + timestampFormat.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /**
     * Command-line entry point; exits the JVM with the tool's return code.
     *
     * @param args {@code <arcFiles> <segmentsOutDir>}
     * @throws Exception if {@link ToolRunner} fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
        System.exit(res);
    }

    /**
     * Parses the command line and runs {@link #createSegments(Path, Path)}.
     *
     * @param args {@code <arcFiles> <segmentsOutDir>}
     * @return {@code 0} on success; {@code -1} on wrong usage or when segment
     *         creation throws (the exception is logged, not propagated)
     */
    @Override
    public int run(String[] args) throws Exception {
        String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>";
        if (args.length < 2) {
            System.err.println(usage);
            return -1;
        }
        Path arcFiles = new Path(args[0]);
        Path segmentsOutDir = new Path(args[1]);
        try {
            this.createSegments(arcFiles, segmentsOutDir);
            return 0;
        }
        catch (Exception e) {
            LOG.error("ArcSegmentCreator: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}

