/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Injector
extends Configured
implements Tool {
    /** Logger shared by the injector tool and by its nested mapper and reducer classes. */
    public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
    /** Seed-line metadata name whose value InjectMapper parses as a custom score for the URL. */
    public static String nutchScoreMDName = "nutch.score";
    /** Seed-line metadata name whose value InjectMapper parses as a custom fetch interval for the URL. */
    public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
    /**
     * Seed-line metadata name whose value InjectMapper parses as a fixed fetch interval; when present it
     * takes precedence over the custom interval and is also recorded in the datum's metadata.
     */
    public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";

    /**
     * Creates an injector without a configuration; one is expected to be supplied
     * later through {@code setConf} (for example by {@code ToolRunner}).
     */
    public Injector() {
        super();
    }

    /**
     * Creates an injector bound to the given configuration.
     *
     * @param conf configuration handed to {@code setConf}
     */
    public Injector(Configuration conf) {
        setConf(conf);
    }

    /**
     * Injects the seed URLs found under {@code urlDir} into the crawl db at {@code crawlDb}.
     *
     * <p>Two jobs are run in sequence: the first maps the seed lines to crawl db entries
     * (via {@link InjectMapper}) and writes them to a randomly named temporary directory;
     * the second merges that directory into the existing crawl db (via {@link InjectReducer})
     * and installs the result with {@code CrawlDb.install}.
     *
     * <p>The temporary directory is removed when the method finishes. If any step fails, the
     * directory is still removed on a best-effort basis and the original failure is rethrown.
     *
     * @param crawlDb location of the crawl db to merge into
     * @param urlDir  directory containing the seed URL files
     * @throws IOException if a job fails, the crawl db cannot be installed, or the temporary
     *                     directory cannot be removed after a successful run
     */
    public void inject(Path crawlDb, Path urlDir) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at {}", sdf.format(start));
            LOG.info("Injector: crawlDb: {}", crawlDb);
            LOG.info("Injector: urlDir: {}", urlDir);
        }

        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        // Obtained up front so the temporary directory can be cleaned up on every exit path.
        FileSystem fs = FileSystem.get(getConf());

        boolean succeeded = false;
        try {
            // Job 1: turn the seed list into crawl db entries stored under tempDir.
            LOG.info("Injector: Converting injected urls to crawl db entries.");
            NutchJob sortJob = new NutchJob(getConf());
            sortJob.setJobName("inject " + urlDir);
            FileInputFormat.addInputPath(sortJob, urlDir);
            sortJob.setMapperClass(InjectMapper.class);
            FileOutputFormat.setOutputPath(sortJob, tempDir);
            sortJob.setOutputFormat(SequenceFileOutputFormat.class);
            sortJob.setOutputKeyClass(Text.class);
            sortJob.setOutputValueClass(CrawlDatum.class);
            // Shared timestamp so every map task stamps its entries with the same fetch time.
            sortJob.setLong("injector.current.time", System.currentTimeMillis());
            RunningJob mapJob = JobClient.runJob(sortJob);

            long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
            long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
            LOG.info("Injector: total number of urls rejected by filters: {}", urlsFiltered);
            LOG.info("Injector: total number of urls injected after normalization and filtering: {}", urlsInjected);

            // Job 2: merge the freshly injected entries with the existing crawl db.
            LOG.info("Injector: Merging injected urls into crawl db.");
            JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
            FileInputFormat.addInputPath(mergeJob, tempDir);
            mergeJob.setReducerClass(InjectReducer.class);
            JobClient.runJob(mergeJob);
            CrawlDb.install(mergeJob, crawlDb);
            succeeded = true;
        } finally {
            if (!succeeded) {
                // Best-effort cleanup: never let a cleanup problem hide the original failure.
                try {
                    fs.delete(tempDir, true);
                } catch (IOException cleanupFailure) {
                    LOG.warn("Injector: could not remove temporary directory " + tempDir + ": " + cleanupFailure);
                }
            }
        }

        // Success path: a failure to remove the temporary directory is reported to the caller.
        fs.delete(tempDir, true);
        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end));
    }

    /**
     * Command-line entry point: runs a new {@code Injector} through {@code ToolRunner}
     * with a configuration from {@code NutchConfiguration.create()} and exits the JVM
     * with the resulting status code.
     *
     * @param args command-line arguments forwarded to {@code ToolRunner.run}
     * @throws Exception propagated from {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Injector tool = new Injector();
        int exitCode = ToolRunner.run(conf, tool, args);
        System.exit(exitCode);
    }

    /**
     * Runs the injection for the given command-line arguments.
     *
     * @param args {@code args[0]} is the crawl db path, {@code args[1]} the seed URL directory
     * @return {@code 0} on success; {@code -1} when fewer than two arguments are given
     *         (after printing the usage line) or when the injection throws
     * @throws Exception declared by the signature; failures of the injection itself are
     *                   logged and reported through the return value instead
     */
    public int run(String[] args) throws Exception {
        boolean enoughArgs = args.length >= 2;
        if (!enoughArgs) {
            System.err.println("Usage: Injector <crawldb> <url_dir>");
            return -1;
        }

        int status;
        try {
            // Path construction stays inside the try so malformed paths are logged too.
            Path crawlDb = new Path(args[0]);
            Path urlDir = new Path(args[1]);
            inject(crawlDb, urlDir);
            status = 0;
        } catch (Exception failure) {
            LOG.error("Injector: " + StringUtils.stringifyException(failure));
            status = -1;
        }
        return status;
    }

    /**
     * Merges newly injected entries with entries already present in the crawl db.
     *
     * <p>For each URL the reducer sees at most two kinds of values: an injected entry
     * (status {@code CrawlDatum.STATUS_INJECTED}) and an existing entry (any other status).
     * Which one is emitted depends on the {@code db.injector.overwrite} and
     * {@code db.injector.update} settings; see {@link #reduce}.
     */
    public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        private int interval;
        private float scoreInjected;
        private boolean overwrite = false;
        private boolean update = false;
        // Reused across reduce() calls; each is fully overwritten via set(...) before it is read.
        private CrawlDatum old = new CrawlDatum();
        private CrawlDatum injected = new CrawlDatum();

        /**
         * Reads the default fetch interval, the default injected score and the
         * overwrite/update switches from the job configuration.
         *
         * @param job job configuration
         */
        public void configure(JobConf job) {
            interval = job.getInt("db.fetch.interval.default", 2592000);
            scoreInjected = job.getFloat("db.score.injected", 1.0f);
            overwrite = job.getBoolean("db.injector.overwrite", false);
            update = job.getBoolean("db.injector.update", false);
            LOG.info("Injector: overwrite: " + overwrite);
            LOG.info("Injector: update: " + update);
        }

        /** Nothing to release. */
        public void close() {
        }

        /**
         * Emits exactly one entry for {@code key}:
         * <ul>
         *   <li>only an existing entry: the existing entry, unchanged;</li>
         *   <li>only an injected entry: the injected entry, with status switched to
         *       {@code CrawlDatum.STATUS_DB_UNFETCHED};</li>
         *   <li>both, overwrite enabled: the injected entry replaces the existing one;</li>
         *   <li>both, update enabled (and overwrite disabled): the existing entry, with the
         *       injected metadata added and with score / fetch interval replaced when the
         *       injected values differ from the configured defaults;</li>
         *   <li>both, neither enabled: the existing entry, unchanged.</li>
         * </ul>
         *
         * @param key      URL
         * @param values   entries collected for the URL
         * @param output   receives the single resulting entry
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            boolean oldSet = false;
            boolean injectedSet = false;
            while (values.hasNext()) {
                CrawlDatum val = values.next();
                if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
                    injected.set(val);
                    injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                    injectedSet = true;
                } else {
                    old.set(val);
                    oldSet = true;
                }
            }

            CrawlDatum res;
            if (!injectedSet) {
                res = old;
            } else if (!oldSet || overwrite) {
                // Brand-new URL, or the injected entry is configured to win.
                res = injected;
            } else {
                if (update) {
                    old.putAllMetaData(injected);
                    // An injected value equal to the configured default is treated as
                    // "not customised", so the existing value is kept in that case.
                    old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
                    old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval());
                }
                res = old;
            }
            output.collect(key, res);
        }
    }

    /**
     * Turns each line of the seed list into a crawl db entry with status
     * {@code CrawlDatum.STATUS_INJECTED}.
     *
     * <p>A line is a URL optionally followed by tab-separated {@code name=value} pairs.
     * The names held in {@link Injector#nutchScoreMDName}, {@link Injector#nutchFetchIntervalMDName}
     * and {@link Injector#nutchFixedFetchIntervalMDName} set the score and fetch interval;
     * every other pair is stored as metadata on the entry. Empty lines and lines starting
     * with {@code #} are skipped.
     */
    public static class InjectMapper implements Mapper<WritableComparable<?>, Text, Text, CrawlDatum> {
        private URLNormalizers urlNormalizers;
        private int interval;
        private float scoreInjected;
        private JobConf jobConf;
        private URLFilters filters;
        private ScoringFilters scfilters;
        private long curTime;

        /**
         * Builds the normalizers, URL filters and scoring filters from the job configuration
         * and reads the default fetch interval, default injected score and injection time.
         *
         * @param job job configuration
         */
        public void configure(JobConf job) {
            jobConf = job;
            urlNormalizers = new URLNormalizers(job, "inject");
            interval = jobConf.getInt("db.fetch.interval.default", 2592000);
            filters = new URLFilters(jobConf);
            scfilters = new ScoringFilters(jobConf);
            scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
            // Falls back to the task's own clock when the driver did not set a timestamp.
            curTime = job.getLong("injector.current.time", System.currentTimeMillis());
        }

        /** Nothing to release. */
        public void close() {
        }

        /**
         * Parses one seed line, normalizes and filters its URL, and emits the resulting entry.
         *
         * <p>URLs rejected by normalization or filtering (or for which either step throws)
         * increment the {@code injector/urls_filtered} counter; emitted URLs increment
         * {@code injector/urls_injected}. Score or interval values that are not valid
         * numbers are ignored with a warning and the defaults are used instead.
         *
         * @param key      input key, unused
         * @param value    the seed line; reused as the output key after being set to the final URL
         * @param output   receives the URL and its injected entry
         * @param reporter used for the injector counters
         * @throws IOException if the collector fails
         */
        public void map(WritableComparable<?> key, Text value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            String url = value.toString().trim();
            if (url.length() == 0 || url.startsWith("#")) {
                return;
            }

            // -1 marks "no custom score given" / "no fixed interval given".
            float customScore = -1.0f;
            int customInterval = interval;
            int fixedInterval = -1;
            TreeMap<String, String> metadata = new TreeMap<String, String>();
            if (url.indexOf("\t") != -1) {
                String[] splits = url.split("\t");
                url = splits[0];
                for (int s = 1; s < splits.length; ++s) {
                    int indexEquals = splits[s].indexOf("=");
                    if (indexEquals == -1) {
                        // Not a name=value pair; skip it.
                        continue;
                    }
                    String metaname = splits[s].substring(0, indexEquals);
                    String metavalue = splits[s].substring(indexEquals + 1);
                    if (metaname.equals(nutchScoreMDName)) {
                        try {
                            customScore = Float.parseFloat(metavalue);
                        } catch (NumberFormatException nfe) {
                            LOG.warn("Injector: ignoring invalid " + metaname + " value '" + metavalue + "' for url " + url);
                        }
                    } else if (metaname.equals(nutchFetchIntervalMDName)) {
                        try {
                            customInterval = Integer.parseInt(metavalue);
                        } catch (NumberFormatException nfe) {
                            LOG.warn("Injector: ignoring invalid " + metaname + " value '" + metavalue + "' for url " + url);
                        }
                    } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
                        try {
                            fixedInterval = Integer.parseInt(metavalue);
                        } catch (NumberFormatException nfe) {
                            LOG.warn("Injector: ignoring invalid " + metaname + " value '" + metavalue + "' for url " + url);
                        }
                    } else {
                        metadata.put(metaname, metavalue);
                    }
                }
            }

            try {
                url = urlNormalizers.normalize(url, "inject");
                url = filters.filter(url);
            } catch (Exception e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Skipping " + url + ":" + e);
                }
                url = null;
            }
            if (url == null) {
                reporter.getCounter("injector", "urls_filtered").increment(1L);
                return;
            }

            value.set(url);
            CrawlDatum datum = new CrawlDatum();
            datum.setStatus(CrawlDatum.STATUS_INJECTED);
            if (fixedInterval > -1) {
                // A fixed interval wins over the custom one and is also recorded in the metadata.
                datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable((float) fixedInterval));
                datum.setFetchInterval(fixedInterval);
            } else {
                datum.setFetchInterval(customInterval);
            }
            datum.setFetchTime(curTime);
            for (String keymd : metadata.keySet()) {
                String valuemd = metadata.get(keymd);
                datum.getMetaData().put(new Text(keymd), new Text(valuemd));
            }
            if (customScore != -1.0f) {
                datum.setScore(customScore);
            } else {
                datum.setScore(scoreInjected);
            }
            try {
                scfilters.injectedScore(value, datum);
            } catch (ScoringFilterException e) {
                // The entry is still emitted with the score assigned above.
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")");
                }
            }
            reporter.getCounter("injector", "urls_injected").increment(1L);
            output.collect(value, datum);
        }
    }
}

