/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse.tika;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.tika.DOMBuilder;
import org.apache.nutch.parse.tika.DOMContentUtils;
import org.apache.nutch.parse.tika.HTMLMetaProcessor;
import org.apache.nutch.protocol.Content;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.ContentHandler;

/**
 * Nutch {@link Parser} that delegates document parsing to Apache Tika.
 *
 * <p>The Tika parser matching the content's mime-type builds a DOM fragment
 * (through {@link DOMBuilder}); text, title, outlinks and meta tags are then
 * extracted from that fragment, Tika's metadata is copied into the Nutch parse
 * metadata, and the configured {@link HtmlParseFilters} are applied to the
 * result.
 *
 * <p>{@link #setConf(Configuration)} must be called before
 * {@link #getParse(Content)}; it initialises the Tika configuration and the
 * helper objects used during parsing.
 */
public class TikaParser implements Parser {
    public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);
    private Configuration conf;
    private TikaConfig tikaConfig = null;
    private DOMContentUtils utils;
    private HtmlParseFilters htmlParseFilters;
    private String cachingPolicy;

    /**
     * Parses the given content with the Tika parser registered for its mime-type.
     *
     * @param content the fetched content to parse
     * @return the filtered parse result; an empty parse result carrying a failure
     *         status when the base URL is malformed, no Tika configuration or
     *         parser is available, or the Tika parser throws
     */
    public ParseResult getParse(Content content) {
        String mimeType = content.getContentType();

        URL base;
        try {
            base = new URL(content.getBaseUrl());
        }
        catch (MalformedURLException e) {
            return new ParseStatus(e).getEmptyParseResult(content.getUrl(), this.getConf());
        }

        // setConf() logs configuration problems but cannot throw; without this
        // guard a failed configuration load surfaced here as a NullPointerException.
        if (this.tikaConfig == null) {
            String message = "No Tika configuration available, can't parse mime-type " + mimeType;
            LOG.error(message);
            return this.failedParse(content, message);
        }

        org.apache.tika.parser.Parser parser = this.tikaConfig.getParser(MediaType.parse(mimeType));
        byte[] raw = content.getContent();
        if (parser == null) {
            String message = "Can't retrieve Tika parser for mime-type " + mimeType;
            LOG.error(message);
            return this.failedParse(content, message);
        }
        LOG.debug("Using Tika parser {} for mime-type {}", parser.getClass().getName(), mimeType);

        // Let Tika stream SAX events into a DOM fragment that the Nutch DOM
        // utilities can walk afterwards.
        Metadata tikamd = new Metadata();
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        DOMBuilder domhandler = new DOMBuilder(doc, root);
        ParseContext context = new ParseContext();
        tikamd.set("Content-Type", mimeType);
        try {
            parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
        }
        catch (Exception e) {
            LOG.error("Error parsing " + content.getUrl(), e);
            return this.failedParse(content, e.getMessage());
        }

        HTMLMetaTags metaTags = new HTMLMetaTags();
        String text = "";
        String title = "";
        Outlink[] outlinks = new Outlink[0];
        org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

        HTMLMetaProcessor.getMetaTags(metaTags, root, base);
        LOG.trace("Meta tags for {}: {}", base, metaTags);

        // Text and title are only extracted when the page does not forbid indexing.
        if (!metaTags.getNoIndex()) {
            StringBuffer sb = new StringBuffer();
            LOG.trace("Getting text...");
            this.utils.getText(sb, root);
            text = sb.toString();
            sb.setLength(0);
            LOG.trace("Getting title...");
            this.utils.getTitle(sb, root);
            title = sb.toString().trim();
        }

        // Links are only extracted when the page does not forbid following them.
        if (!metaTags.getNoFollow()) {
            ArrayList<Outlink> links = new ArrayList<Outlink>();
            URL baseTag = this.utils.getBase(root);
            LOG.trace("Getting links...");
            // A base element found in the document takes precedence over the content's base URL.
            this.utils.getOutlinks(baseTag != null ? baseTag : base, links, root);
            outlinks = links.toArray(new Outlink[links.size()]);
            LOG.trace("found {} outlinks in {}", outlinks.length, content.getUrl());
        }

        // Copy Tika's metadata into the Nutch parse metadata, keeping every value
        // of multi-valued fields. The title is skipped here because it is passed
        // to ParseData separately below.
        for (String name : tikamd.names()) {
            if (name.equalsIgnoreCase("title")) {
                continue;
            }
            for (String value : tikamd.getValues(name)) {
                nutchMetadata.add(name, value);
            }
        }

        // No outlinks from the markup: fall back to scanning the plain text.
        // NOTE(review): this also runs when nofollow left the array empty -
        // confirm whether text-derived links are wanted in that case.
        if (outlinks.length == 0) {
            outlinks = OutlinkExtractor.getOutlinks(text, this.getConf());
        }

        ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
        if (metaTags.getRefresh()) {
            status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
            status.setArgs(new String[]{
                metaTags.getRefreshHref().toString(),
                Integer.toString(metaTags.getRefreshTime())});
        }

        ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
        ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
        ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);

        // Record the configured caching policy on every parse when the page asked not to be cached.
        if (metaTags.getNoCache()) {
            for (Map.Entry<?, Parse> entry : filteredParse) {
                entry.getValue().getData().getParseMeta().set("caching.forbidden", this.cachingPolicy);
            }
        }
        return filteredParse;
    }

    /**
     * Stores the configuration and (re)initialises the Tika configuration, the
     * HTML parse filters, the DOM utilities and the caching policy.
     *
     * @param conf the configuration to use; the optional property
     *             {@code tika.config.file} names a custom Tika configuration
     *             resource, and {@code parser.caching.forbidden.policy}
     *             (default {@code "content"}) sets the caching policy
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.tikaConfig = this.loadTikaConfig(conf);
        this.htmlParseFilters = new HtmlParseFilters(this.getConf());
        this.utils = new DOMContentUtils(conf);
        this.cachingPolicy = this.getConf().get("parser.caching.forbidden.policy", "content");
    }

    /**
     * Returns the configuration last passed to {@link #setConf(Configuration)}.
     *
     * @return the current configuration, or {@code null} if none has been set
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Loads the custom Tika configuration named by {@code tika.config.file} if
     * present, falling back to the default configuration when it is absent,
     * cannot be found or fails to load. Returns {@code null} only if the default
     * configuration fails to load as well; problems are logged, never thrown.
     */
    private TikaConfig loadTikaConfig(Configuration conf) {
        TikaConfig loaded = null;
        String customConfFile = conf.get("tika.config.file");
        if (customConfFile != null) {
            try {
                URL customTikaConfig = conf.getResource(customConfFile);
                if (customTikaConfig != null) {
                    loaded = new TikaConfig(customTikaConfig);
                } else {
                    LOG.warn("Custom Tika configuration {} not found, using the default configuration", customConfFile);
                }
            }
            catch (Exception e1) {
                String message = "Problem loading custom Tika configuration from " + customConfFile;
                LOG.error(message, e1);
            }
        }
        if (loaded == null) {
            try {
                loaded = new TikaConfig(this.getClass().getClassLoader());
            }
            catch (Exception e2) {
                String message = "Problem loading default Tika configuration";
                LOG.error(message, e2);
            }
        }
        return loaded;
    }

    /** Builds the empty parse result for {@code content} with a failed status carrying {@code message}. */
    private ParseResult failedParse(Content content, String message) {
        return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(content.getUrl(), this.getConf());
    }
}

