/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.URLPartitioner;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that turns plain-text URL lists into the
 * {@code crawl_generate} part of a freshly named segment.
 *
 * <p>Input is a directory of text files with one URL per line; output is
 * written as a sequence file of {@code Text -> CrawlDatum} pairs under
 * {@code <segmentsDir>/<generated segment name>/crawl_generate}. URLs may
 * optionally be normalized and/or filtered before being emitted.
 */
public class FreeGenerator extends Configured implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(FreeGenerator.class);
    /** Job property telling the map side whether to run URL filters. */
    private static final String FILTER_KEY = "free.generator.filter";
    /** Job property telling the map side whether to run URL normalizers. */
    private static final String NORMALIZE_KEY = "free.generator.normalize";

    /**
     * Parses the command line, configures the job and runs it.
     *
     * @param args {@code <inputDir> <segmentsDir> [-filter] [-normalize]}
     * @return {@code 0} on success, {@code -1} on bad usage, an unknown
     *         option, or a failed job
     * @throws Exception declared by {@link Tool#run(String[])}
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            printUsage();
            return -1;
        }
        boolean filter = false;
        boolean normalize = false;
        // Everything after the two positional arguments must be a known flag.
        for (int i = 2; i < args.length; ++i) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: {}, exiting ...", args[i]);
                return -1;
            }
        }

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("FreeGenerator: starting at {}", sdf.format(start));

        NutchJob job = new NutchJob(getConf());
        // Hand the command-line switches to the tasks through the job config.
        job.setBoolean(FILTER_KEY, filter);
        job.setBoolean(NORMALIZE_KEY, normalize);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormat(TextInputFormat.class);
        job.setMapperClass(FG.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Generator.SelectorEntry.class);
        job.setPartitionerClass(URLPartitioner.class);
        job.setReducerClass(FG.class);

        String segName = Generator.generateSegmentName();
        // One reduce task per map task.
        job.setNumReduceTasks(job.getNumMapTasks());
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.setOutputKeyComparatorClass(Generator.HashComparator.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, "crawl_generate")));

        try {
            JobClient.runJob(job);
        } catch (Exception e) {
            LOG.error("FAILED: {}", StringUtils.stringifyException(e));
            return -1;
        }

        long end = System.currentTimeMillis();
        LOG.info("FreeGenerator: finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end));
        return 0;
    }

    /** Prints the command-line synopsis to standard error. */
    private static void printUsage() {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    }

    /**
     * Entry point: runs the tool with a Nutch configuration and exits the JVM
     * with the tool's return code.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     * @throws Exception if {@link ToolRunner} fails
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args);
        System.exit(res);
    }

    /**
     * Combined mapper and reducer for the job.
     *
     * <p>The map side turns each input line into a
     * {@link Generator.SelectorEntry} keyed by URL; the reduce side emits each
     * distinct URL of a key group once, together with its {@link CrawlDatum}.
     */
    public static class FG
            extends MapReduceBase
            implements Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>,
            Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
        /** Non-null only when normalization was requested for the job. */
        private URLNormalizers normalizers = null;
        /** Non-null only when filtering was requested for the job. */
        private URLFilters filters = null;
        private ScoringFilters scfilters;
        // The three objects below are reused for every map() call.
        private final CrawlDatum datum = new CrawlDatum();
        private final Text url = new Text();
        private int defaultInterval = 0;
        Generator.SelectorEntry entry = new Generator.SelectorEntry();

        /**
         * Reads the default fetch interval and creates the scoring filters,
         * plus URL filters / normalizers if the job enabled them.
         *
         * @param job the job configuration
         */
        @Override
        public void configure(JobConf job) {
            super.configure(job);
            this.defaultInterval = job.getInt("db.fetch.interval.default", 0);
            this.scfilters = new ScoringFilters(job);
            if (job.getBoolean(FILTER_KEY, false)) {
                this.filters = new URLFilters(job);
            }
            if (job.getBoolean(NORMALIZE_KEY, false)) {
                this.normalizers = new URLNormalizers(job, "inject");
            }
        }

        /**
         * Processes one input line as a URL: optionally normalizes and filters
         * it, lets the scoring filters set the injected score, and emits it.
         * Lines that are rejected (null after normalizing/filtering) or that
         * cause an exception are skipped.
         *
         * @param key      input key, unused
         * @param value    one line of input, treated as a URL
         * @param output   receives {@code url -> entry}
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void map(WritableComparable<?> key, Text value, OutputCollector<Text, Generator.SelectorEntry> output, Reporter reporter) throws IOException {
            String urlString = value.toString();
            try {
                if (this.normalizers != null) {
                    urlString = this.normalizers.normalize(urlString, "inject");
                }
                if (urlString != null && this.filters != null) {
                    urlString = this.filters.filter(urlString);
                }
                if (urlString != null) {
                    this.url.set(urlString);
                    this.scfilters.injectedScore(this.url, this.datum);
                }
            } catch (Exception e) {
                // Best effort: a bad line must not fail the whole task.
                LOG.warn("Error adding url '{}', skipping: {}", value, StringUtils.stringifyException(e));
                return;
            }
            if (urlString == null) {
                LOG.debug("- skipping {}", value);
                return;
            }
            this.entry.datum = this.datum;
            this.entry.url = this.url;
            // Apply the configured default fetch interval to every emitted datum.
            this.entry.datum.setFetchInterval(this.defaultInterval);
            output.collect(this.url, this.entry);
        }

        /**
         * Emits every distinct URL found among the values of one key group.
         * The reduce key itself is ignored; the URL carried inside each value
         * is used instead.
         *
         * @param key      group key, unused
         * @param values   entries grouped under {@code key}
         * @param output   receives {@code url -> datum}
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(Text key, Iterator<Generator.SelectorEntry> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            Map<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>();
            while (values.hasNext()) {
                Generator.SelectorEntry selected = values.next();
                // Hadoop documents that value objects handed to reduce() are
                // reused between iterations, so the url/datum seen here may be
                // overwritten by the next value. Storing them directly would
                // leave mutated keys inside the map; keep independent copies.
                CrawlDatum datumCopy = new CrawlDatum();
                datumCopy.set(selected.datum);
                unique.put(new Text(selected.url), datumCopy);
            }
            for (Map.Entry<Text, CrawlDatum> e : unique.entrySet()) {
                output.collect(e.getKey(), e.getValue());
            }
        }
    }
}

