/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DeduplicationJob
extends Configured
implements Tool {
    /** Logger shared by the job driver and its nested mapper/reducer classes. */
    public static final Logger LOG = LoggerFactory.getLogger(DeduplicationJob.class);
    /**
     * Temporary metadata key under which {@link DBFilter} stores a record's URL
     * so that {@link DedupReducer} can restore it as the output key.
     */
    private static final Text urlKey = new Text("_URLTEMPKEY_");

    /**
     * Runs the two-step de-duplication against a crawl db.
     *
     * <ol>
     * <li>A first job reads {@code <crawldb>/current}, groups entries by signature
     * ({@link DBFilter} / {@link DedupReducer}) and writes the entries flagged as
     * duplicates into a temporary directory.</li>
     * <li>A second job, obtained from {@code CrawlDb.createJob}, reads the crawl db
     * together with that temporary directory and reduces them with
     * {@link StatusUpdateReducer}; its result is then handed to
     * {@code CrawlDb.install}.</li>
     * </ol>
     *
     * The temporary directory is removed on success and, best-effort, on failure
     * of either job.
     *
     * @param args command line arguments; {@code args[0]} is the crawl db path
     * @return {@code 0} on success, {@code 1} when the crawl db argument is
     *         missing, {@code -1} when one of the jobs failed
     * @throws IOException if installing the merged db or removing the temporary
     *         directory after a successful run fails
     */
    public int run(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: DeduplicationJob <crawldb>");
            return 1;
        }
        String crawldb = args[0];
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("DeduplicationJob: starting at " + sdf.format(start));

        Path tempDir = new Path(this.getConf().get("mapred.temp.dir", ".") + "/dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // Step 1: find duplicates, output goes to tempDir.
        NutchJob job = new NutchJob(this.getConf());
        job.setJobName("Deduplication on " + crawldb);
        FileInputFormat.addInputPath(job, new Path(crawldb, "current"));
        job.setInputFormat(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(CrawlDatum.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.setMapperClass(DBFilter.class);
        job.setReducerClass(DedupReducer.class);
        try {
            RunningJob rj = JobClient.runJob(job);
            Counters.Group g = rj.getCounters().getGroup("DeduplicationJobStatus");
            if (g != null) {
                // Log the counter as a long; narrowing it to int would misreport large values.
                long dups = g.getCounter("Documents marked as duplicate");
                LOG.info("Deduplication: " + dups + " documents marked as duplicates");
            }
        }
        catch (Exception e) {
            LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
            // Do not leave partial job output behind.
            this.removeTempDirQuietly(tempDir);
            return -1;
        }

        // Step 2: merge the duplicate markers back into the crawl db.
        LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
        Path dbPath = new Path(crawldb);
        JobConf mergeJob = CrawlDb.createJob(this.getConf(), dbPath);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        mergeJob.setReducerClass(StatusUpdateReducer.class);
        try {
            JobClient.runJob(mergeJob);
        }
        catch (Exception e) {
            LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
            this.removeTempDirQuietly(tempDir);
            return -1;
        }
        CrawlDb.install(mergeJob, dbPath);

        FileSystem fs = FileSystem.get(this.getConf());
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
        return 0;
    }

    /**
     * Best-effort removal of the temporary duplicate directory after a failed
     * job. A failure to clean up is only logged so that it cannot replace the
     * {@code -1} result that reports the original job failure.
     *
     * @param tempDir the temporary output directory of the first job
     */
    private void removeTempDirQuietly(Path tempDir) {
        try {
            FileSystem.get(this.getConf()).delete(tempDir, true);
        }
        catch (IOException e) {
            LOG.warn("DeduplicationJob: could not remove temporary directory " + tempDir + ": " + StringUtils.stringifyException(e));
        }
    }

    /**
     * Command line entry point: runs this tool through {@link ToolRunner} with a
     * configuration from {@code NutchConfiguration.create()} and exits the JVM
     * with the tool's return code.
     *
     * @param args command line arguments, passed through to {@link #run(String[])}
     * @throws Exception if {@code ToolRunner.run} throws
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Tool dedup = new DeduplicationJob();
        System.exit(ToolRunner.run(conf, dedup, args));
    }

    /**
     * Reducer of the merge job: for every URL it emits exactly one datum,
     * preferring an entry with status 7 (the status set by
     * {@code DedupReducer.writeOutAsDuplicate}) over any other entry.
     */
    public static class StatusUpdateReducer
    implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        // Both holders are reused across reduce() calls; values are copied into them.
        private CrawlDatum old = new CrawlDatum();
        private CrawlDatum duplicate = new CrawlDatum();

        /** No configuration is needed. */
        public void configure(JobConf job) {
        }

        /** Nothing to release. */
        public void close() {
        }

        /**
         * Emits the duplicate-flagged datum for {@code key} if one is present,
         * otherwise the last other datum seen.
         *
         * @param key URL of the crawl db entry
         * @param values all datums collected for that URL
         * @param output collector receiving the single resulting datum
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            boolean sawDuplicate = false;
            while (values.hasNext()) {
                CrawlDatum datum = values.next();
                if (datum.getStatus() != 7) {
                    this.old.set(datum);
                } else {
                    this.duplicate.set(datum);
                    sawDuplicate = true;
                }
            }
            // A duplicate marker wins over the regular entry.
            CrawlDatum result = sawDuplicate ? this.duplicate : this.old;
            output.collect(key, result);
        }
    }

    /**
     * Reducer of the first job. All datums sharing one signature arrive in a
     * single {@code reduce} call; one of them is kept and every other one is
     * emitted, keyed by its URL, with status 7 (duplicate).
     *
     * <p>The document to keep is chosen by, in order: higher score, later fetch
     * time, shorter URL, and finally the lexicographically smaller URL so that
     * the decision is always deterministic.
     */
    public static class DedupReducer
    implements Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {
        /**
         * Flags {@code datum} as duplicate, bumps the duplicate counter and emits
         * the datum under its URL.
         *
         * @param datum the datum to flag; its status and metadata are modified
         * @param output collector receiving the flagged datum
         * @param reporter used to increment the duplicate counter
         * @throws IOException if the collector fails
         */
        private void writeOutAsDuplicate(CrawlDatum datum, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            datum.setStatus(7);
            // The URL was stashed in the metadata by DBFilter; take it back out
            // so the temporary key is not written with the datum.
            Text key = (Text)datum.getMetaData().remove(urlKey);
            reporter.incrCounter("DeduplicationJobStatus", "Documents marked as duplicate", 1L);
            output.collect(key, datum);
        }

        /**
         * Decides which of two datums with the same signature is the duplicate.
         *
         * @param existingDoc the datum kept so far
         * @param newDoc the datum just read from the iterator
         * @return the datum to flag as duplicate, or {@code null} if both carry
         *         the very same URL and no decision is possible
         */
        private static CrawlDatum selectDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
            // Higher score wins.
            if (existingDoc.getScore() < newDoc.getScore()) {
                return existingDoc;
            }
            if (existingDoc.getScore() > newDoc.getScore()) {
                return newDoc;
            }
            // Same score: the more recently fetched one wins.
            if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
                return newDoc;
            }
            if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
                return existingDoc;
            }
            // Same fetch time: the shorter URL wins.
            String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
            String urlNewDoc = newDoc.getMetaData().get(urlKey).toString();
            if (urlExisting.length() < urlNewDoc.length()) {
                return newDoc;
            }
            if (urlExisting.length() > urlNewDoc.length()) {
                return existingDoc;
            }
            // Same length: fall back to lexicographic order. Without this
            // tie-break both documents would silently stay non-duplicates.
            int order = urlExisting.compareTo(urlNewDoc);
            if (order == 0) {
                return null;
            }
            return order < 0 ? newDoc : existingDoc;
        }

        /**
         * Emits every datum of the signature group except the preferred one as a
         * duplicate.
         *
         * @param key the shared signature
         * @param values datums carrying that signature, each with its URL stored
         *        under {@code urlKey} in the metadata
         * @param output collector receiving the datums flagged as duplicates
         * @param reporter used to count duplicates
         * @throws IOException if the collector fails
         */
        public void reduce(BytesWritable key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            CrawlDatum existingDoc = null;
            while (values.hasNext()) {
                CrawlDatum newDoc = values.next();
                if (existingDoc == null) {
                    // Keep a private copy; the iterator's value object is not
                    // retained (presumably Hadoop reuses it between next() calls).
                    existingDoc = new CrawlDatum();
                    existingDoc.set(newDoc);
                    continue;
                }
                CrawlDatum duplicate = selectDuplicate(existingDoc, newDoc);
                if (duplicate == null) {
                    continue;
                }
                this.writeOutAsDuplicate(duplicate, output, reporter);
                if (duplicate == existingDoc) {
                    // The newcomer becomes the document to beat.
                    existingDoc = new CrawlDatum();
                    existingDoc.set(newDoc);
                }
            }
        }

        /** No configuration is needed. */
        public void configure(JobConf arg0) {
        }

        /** Nothing to release. */
        public void close() throws IOException {
        }
    }

    /**
     * Mapper of the first job: re-keys selected crawl db entries by their
     * signature so that entries with the same signature meet in one reduce call.
     */
    public static class DBFilter
    implements Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
        /** No configuration is needed. */
        public void configure(JobConf arg0) {
        }

        /** Nothing to release. */
        public void close() throws IOException {
        }

        /**
         * Emits {@code value} keyed by its signature, with the URL stored in the
         * datum's metadata under {@code urlKey}. Entries whose status is neither
         * 2 nor 6, or that carry no signature, are dropped.
         *
         * @param key URL of the crawl db entry
         * @param value the crawl db entry; its metadata is modified
         * @param output collector receiving (signature, datum) pairs
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        public void map(Text key, CrawlDatum value, OutputCollector<BytesWritable, CrawlDatum> output, Reporter reporter) throws IOException {
            // NOTE(review): 2 and 6 are presumably the "fetched" and "not modified"
            // db statuses - confirm against the CrawlDatum status constants.
            int status = value.getStatus();
            if (status != 2 && status != 6) {
                return;
            }
            byte[] signature = value.getSignature();
            if (signature == null) {
                return;
            }
            // Remember the URL so the reducer can restore it as the output key.
            value.getMetaData().put(urlKey, key);
            output.collect(new BytesWritable(signature), value);
        }
    }
}

