/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.indexer;

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilters;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class IndexingFiltersChecker
extends Configured
implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(IndexingFiltersChecker.class);
    Configuration conf;

    public int run(String[] args) throws Exception {
        CrawlDatum datum;
        String contentType = null;
        String url = null;
        String usage = "Usage: IndexingFiltersChecker <url>";
        if (args.length != 1) {
            System.err.println(usage);
            return -1;
        }
        url = URLUtil.toASCII(args[0]);
        if (LOG.isInfoEnabled()) {
            LOG.info("fetching: " + url);
        }
        IndexingFilters indexers = new IndexingFilters(this.conf);
        ProtocolFactory factory = new ProtocolFactory(this.conf);
        Protocol protocol = factory.getProtocol(url);
        ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum = new CrawlDatum());
        if (!output.getStatus().isSuccess()) {
            System.out.println("Fetch failed with protocol status: " + output.getStatus());
            return 0;
        }
        Content content = output.getContent();
        if (content == null) {
            System.out.println("No content for " + url);
            return 0;
        }
        contentType = content.getContentType();
        if (contentType == null) {
            return -1;
        }
        datum.getMetaData().put((Writable)new Text("Content-Type"), (Writable)new Text(contentType));
        if (ParseSegment.isTruncated(content)) {
            LOG.warn("Content is truncated, parse may fail!");
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("parsing: " + url);
            LOG.info("contentType: " + contentType);
        }
        ParseResult parseResult = new ParseUtil(this.conf).parse(content);
        NutchDocument doc = new NutchDocument();
        Text urlText = new Text(url);
        Inlinks inlinks = null;
        Parse parse = parseResult.get(urlText);
        try {
            doc = indexers.filter(doc, parse, urlText, datum, inlinks);
        }
        catch (IndexingException e) {
            e.printStackTrace();
        }
        if (doc == null) {
            System.out.println("Document discarded by indexing filter");
            return 0;
        }
        for (String fname : doc.getFieldNames()) {
            List<Object> values = doc.getField(fname).getValues();
            if (values == null) continue;
            for (Object value : values) {
                String str = value.toString();
                int minText = Math.min(100, str.length());
                System.out.println(fname + " :\t" + str.substring(0, minText));
            }
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run((Configuration)NutchConfiguration.create(), (Tool)new IndexingFiltersChecker(), (String[])args);
        System.exit(res);
    }

    public Configuration getConf() {
        return this.conf;
    }

    public void setConf(Configuration arg0) {
        this.conf = arg0;
    }
}

