/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.indexer;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.indexer.IndexerOutputFormat;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilters;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchIndexAction;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Mapper and reducer (Hadoop {@code org.apache.hadoop.mapred} API) of the indexing job.
 *
 * <p>The map side optionally normalizes and filters each record's URL key and wraps the value
 * in a {@link NutchWritable}. The reduce side gathers all records that share a URL (inlinks,
 * crawl datums, parse data, parse text) and emits at most one {@link NutchIndexAction} per URL:
 * either a delete action or an add action carrying the assembled {@link NutchDocument}.
 *
 * <p>NOTE(review): the numeric status codes compared against {@code CrawlDatum.getStatus()} and
 * the numeric action codes passed to {@code NutchIndexAction} appear as bare literals in this
 * file. They are presumably named constants that were inlined at compile time; confirm against
 * {@code CrawlDatum} / {@code NutchIndexAction} and replace them with the named constants.
 */
public class IndexerMapReduce extends Configured
        implements Mapper<Text, Writable, Text, NutchWritable>,
                   Reducer<Text, NutchWritable, Text, NutchIndexAction> {
    public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);

    /** Configuration key; not read by this class. */
    public static final String INDEXER_PARAMS = "indexer.additional.params";
    /** Boolean configuration key enabling the delete branches of {@code reduce}. */
    public static final String INDEXER_DELETE = "indexer.delete";
    /** Boolean configuration key: emit a delete when the "robots" meta contains "noindex". */
    public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
    /** Boolean configuration key: skip records whose db status is 6 (counted as "Skipped"). */
    public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
    /** Boolean configuration key enabling URL filtering. */
    public static final String URL_FILTERING = "indexer.url.filters";
    /** Boolean configuration key enabling URL normalization. */
    public static final String URL_NORMALIZING = "indexer.url.normalizers";

    /** Counter group shared by every counter this class increments. */
    private static final String COUNTER_GROUP = "IndexerStatus";

    private boolean skip = false;
    private boolean delete = false;
    private boolean deleteRobotsNoIndex = false;
    private IndexingFilters filters;
    private ScoringFilters scfilters;
    private boolean normalize = false;
    private boolean filter = false;
    private URLNormalizers urlNormalizers;
    private URLFilters urlFilters;

    /**
     * Stores the job configuration and reads the boolean switches that drive
     * {@code map} and {@code reduce}. The URL normalizers and URL filters are
     * only instantiated when their respective switch is enabled.
     *
     * @param job the job configuration
     */
    public void configure(JobConf job) {
        this.setConf(job);
        this.filters = new IndexingFilters(this.getConf());
        this.scfilters = new ScoringFilters(this.getConf());
        this.delete = job.getBoolean(INDEXER_DELETE, false);
        this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false);
        this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
        this.normalize = job.getBoolean(URL_NORMALIZING, false);
        this.filter = job.getBoolean(URL_FILTERING, false);
        if (this.normalize) {
            this.urlNormalizers = new URLNormalizers(this.getConf(), "indexer");
        }
        if (this.filter) {
            this.urlFilters = new URLFilters(this.getConf());
        }
    }

    /**
     * Normalizes {@code url} with the "indexer" scope when normalization is enabled.
     *
     * @param url the URL to normalize
     * @return {@code url} unchanged when normalization is disabled; otherwise the trimmed
     *         normalized URL, or {@code null} when no normalizers are available, the
     *         normalizers return {@code null}, or normalization throws
     */
    private String normalizeUrl(String url) {
        if (!this.normalize) {
            return url;
        }
        if (this.urlNormalizers == null) {
            return null;
        }
        try {
            final String normalized = this.urlNormalizers.normalize(url, "indexer");
            if (normalized == null) {
                // Handle the null result explicitly instead of relying on a caught
                // NullPointerException from trim().
                LOG.warn("Skipping " + url + ": normalizer returned null");
                return null;
            }
            return normalized.trim();
        }
        catch (Exception e) {
            LOG.warn("Skipping " + url + ":" + e);
            return null;
        }
    }

    /**
     * Passes {@code url} through the URL filters when filtering is enabled.
     *
     * @param url the URL to filter; may be {@code null} (e.g. when normalization failed)
     * @return {@code url} unchanged when filtering is disabled; otherwise the filter result,
     *         or {@code null} when {@code url} is {@code null} or filtering throws
     */
    private String filterUrl(String url) {
        if (!this.filter) {
            return url;
        }
        if (url == null) {
            // Nothing to filter; avoids handing null to the filters and swallowing the fallout.
            return null;
        }
        try {
            return this.urlFilters.filter(url);
        }
        catch (Exception e) {
            // Best effort: a failing filter drops the URL, but no longer silently.
            LOG.warn("Skipping " + url + ":" + e);
            return null;
        }
    }

    /**
     * Normalizes and filters the key, then emits the value wrapped in a {@link NutchWritable}
     * under the resulting URL. Records whose URL is rejected are dropped.
     *
     * @param key      the URL of the record; overwritten in place with the processed URL
     * @param value    the record to forward
     * @param output   collector receiving the (URL, wrapped value) pair
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    public void map(Text key, Writable value, OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
        final String urlString = this.filterUrl(this.normalizeUrl(key.toString()));
        if (urlString == null) {
            return;
        }
        key.set(urlString);
        output.collect(key, new NutchWritable(value));
    }

    /**
     * Combines all records collected for one URL and emits at most one index action.
     *
     * <p>Order of decisions: robots "noindex" delete (while scanning the values), deletes for
     * the status combinations handled when deletion is enabled, bail-out when any of fetch
     * datum, db datum, parse text or parse data is missing, duplicate delete, not-modified
     * skip, bail-out on unsuccessful parse or fetch status other than 33, and finally
     * indexing filters and scoring before the add action is emitted.
     *
     * @param key      the URL shared by all values
     * @param values   the wrapped records for this URL
     * @param output   collector receiving the resulting action, if any
     * @param reporter receives the "IndexerStatus" counters
     * @throws IOException      if the collector fails
     * @throws RuntimeException if a crawl datum carries a status that is neither a db status,
     *                          a fetch status, nor one of 65, 67, 68
     */
    public void reduce(Text key, Iterator<NutchWritable> values, OutputCollector<Text, NutchIndexAction> output, Reporter reporter) throws IOException {
        Inlinks inlinks = null;
        CrawlDatum dbDatum = null;
        CrawlDatum fetchDatum = null;
        ParseData parseData = null;
        ParseText parseText = null;

        while (values.hasNext()) {
            final Writable value = values.next().get();
            if (value instanceof Inlinks) {
                inlinks = (Inlinks) value;
            } else if (value instanceof CrawlDatum) {
                final CrawlDatum datum = (CrawlDatum) value;
                if (CrawlDatum.hasDbStatus(datum)) {
                    dbDatum = datum;
                } else if (CrawlDatum.hasFetchStatus(datum)) {
                    // Fetch datums with status 38 are ignored
                    // (presumably "fetch not modified" — TODO confirm).
                    if (datum.getStatus() != 38) {
                        fetchDatum = datum;
                    }
                } else if (67 != datum.getStatus() && 65 != datum.getStatus() && 68 != datum.getStatus()) {
                    // 67, 65 and 68 are tolerated and ignored; anything else is unexpected.
                    throw new RuntimeException("Unexpected status: " + datum.getStatus());
                }
            } else if (value instanceof ParseData) {
                parseData = (ParseData) value;
                if (this.deleteRobotsNoIndex && hasRobotsNoIndex(parseData)) {
                    emitDelete(key, output);
                    return;
                }
            } else if (value instanceof ParseText) {
                parseText = (ParseText) value;
            } else if (LOG.isWarnEnabled()) {
                LOG.warn("Unrecognized type: " + value.getClass());
            }
        }

        if (this.delete && fetchDatum != null && dbDatum != null) {
            // Fetch status 37 / db status 3 are counted as "Documents deleted".
            if (fetchDatum.getStatus() == 37 || dbDatum.getStatus() == 3) {
                reporter.incrCounter(COUNTER_GROUP, "Documents deleted", 1L);
                emitDelete(key, output);
                return;
            }
            // Fetch status 36/35 and db status 5/4 are counted as redirects.
            // NOTE(review): both counters are incremented for every match — confirm intended.
            if (fetchDatum.getStatus() == 36 || fetchDatum.getStatus() == 35 || dbDatum.getStatus() == 5 || dbDatum.getStatus() == 4) {
                reporter.incrCounter(COUNTER_GROUP, "Deleted redirects", 1L);
                reporter.incrCounter(COUNTER_GROUP, "Perm redirects deleted", 1L);
                emitDelete(key, output);
                return;
            }
        }

        // Everything below needs all four pieces.
        if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
            return;
        }
        if (this.delete && dbDatum.getStatus() == 7) {
            reporter.incrCounter(COUNTER_GROUP, "Duplicates deleted", 1L);
            emitDelete(key, output);
            return;
        }
        if (this.skip && dbDatum.getStatus() == 6) {
            reporter.incrCounter(COUNTER_GROUP, "Skipped", 1L);
            return;
        }
        // Only successfully parsed pages with fetch status 33 are indexed.
        if (!parseData.getStatus().isSuccess() || fetchDatum.getStatus() != 33) {
            return;
        }

        NutchDocument doc = new NutchDocument();
        final Metadata metadata = parseData.getContentMeta();
        doc.add("segment", metadata.get("nutch.segment.name"));
        doc.add("digest", metadata.get("nutch.content.digest"));
        final ParseImpl parse = new ParseImpl(parseText, parseData);
        try {
            // Carry the (normalized, filtered) representative URL from the db datum over to
            // the fetch datum before the indexing filters run.
            final Text url = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (url != null) {
                final String urlString = this.filterUrl(this.normalizeUrl(url.toString()));
                if (urlString != null) {
                    url.set(urlString);
                    fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
                }
            }
            doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
        }
        catch (IndexingException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error indexing " + key + ": " + e);
            }
            reporter.incrCounter(COUNTER_GROUP, "Errors", 1L);
            return;
        }
        if (doc == null) {
            reporter.incrCounter(COUNTER_GROUP, "Skipped by filters", 1L);
            return;
        }

        float boost = 1.0f;
        try {
            boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, inlinks, boost);
        }
        catch (ScoringFilterException e) {
            // NOTE(review): unlike an indexing failure, no counter is incremented here.
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error calculating score " + key + ": " + e);
            }
            return;
        }
        doc.setWeight(boost);
        doc.add("boost", Float.toString(boost));
        reporter.incrCounter(COUNTER_GROUP, "Documents added", 1L);
        // Action code 0 is used for the "Documents added" path.
        output.collect(key, new NutchIndexAction(doc, 0));
    }

    /**
     * Tells whether the "robots" meta value of {@code parseData} contains "noindex",
     * ignoring case. The comparison uses {@link Locale#ROOT} so the result does not
     * depend on the JVM's default locale.
     */
    private static boolean hasRobotsNoIndex(ParseData parseData) {
        final String robotsMeta = parseData.getMeta("robots");
        return robotsMeta != null && robotsMeta.toLowerCase(Locale.ROOT).contains("noindex");
    }

    /**
     * Emits an action without a document and with action code 1 for {@code key};
     * this is the action used on every delete path of {@code reduce}.
     */
    private static void emitDelete(Text key, OutputCollector<Text, NutchIndexAction> output) throws IOException {
        output.collect(key, new NutchIndexAction(null, 1));
    }

    /** Nothing to release. */
    public void close() throws IOException {
    }

    /**
     * Wires the inputs and the mapper/reducer/format classes of an indexing job.
     *
     * <p>Adds the {@code crawl_fetch}, {@code crawl_parse}, {@code parse_data} and
     * {@code parse_text} directories of every segment, the {@code current} directory of the
     * crawl db and, when given, the {@code current} directory of the link db as inputs.
     *
     * @param crawlDb  crawl db root
     * @param linkDb   link db root, or {@code null} to index without link db input
     * @param segments segments to index
     * @param job      the job configuration to populate
     */
    public static void initMRJob(Path crawlDb, Path linkDb, Collection<Path> segments, JobConf job) {
        LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
        if (linkDb != null) {
            LOG.info("IndexerMapReduce: linkdb: " + linkDb);
        }
        for (Path segment : segments) {
            LOG.info("IndexerMapReduces: adding segment: " + segment);
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_fetch"));
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_parse"));
            FileInputFormat.addInputPath(job, new Path(segment, "parse_data"));
            FileInputFormat.addInputPath(job, new Path(segment, "parse_text"));
        }
        FileInputFormat.addInputPath(job, new Path(crawlDb, "current"));
        if (linkDb != null) {
            FileInputFormat.addInputPath(job, new Path(linkDb, "current"));
        }
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(IndexerMapReduce.class);
        job.setReducerClass(IndexerMapReduce.class);
        job.setOutputFormat(IndexerOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NutchWritable.class);
        // NOTE(review): the reducer is declared to emit NutchIndexAction values while the
        // job's output value class is set to NutchWritable — confirm this is intentional.
        job.setOutputValueClass(NutchWritable.class);
    }
}

