/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse.html;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.html.DOMBuilder;
import org.apache.nutch.parse.html.DOMContentUtils;
import org.apache.nutch.parse.html.HTMLMetaProcessor;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * HTML {@link Parser} implementation that builds a DOM from fetched content
 * (using either NekoHTML or TagSoup, selected by the
 * {@code parser.html.impl} configuration property) and extracts text, title,
 * outlinks and meta-tag directives from it.
 *
 * <p>{@link #setConf(Configuration)} must be called before
 * {@link #getParse(Content)}; it initialises all collaborators used here.
 */
public class HtmlParser
implements Parser {
    public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.html");

    /** Maximum number of leading bytes inspected when sniffing the charset. */
    private static final int CHUNK_SIZE = 2000;

    /** Charset used to decode the sniffed prefix; only ASCII markup matters there. */
    private static final Charset ASCII = Charset.forName("ASCII");

    /** Matches a {@code <meta ... http-equiv=content-type ...>} tag; group 1 is the attribute list. */
    private static final Pattern metaPattern = Pattern.compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", Pattern.CASE_INSENSITIVE);

    /** Matches {@code charset=<name>}; group 1 is the charset name. */
    private static final Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

    private String parserImpl;
    private String defaultCharEncoding;
    private Configuration conf;
    private DOMContentUtils utils;
    private HtmlParseFilters htmlParseFilters;
    private String cachingPolicy;

    /**
     * Looks for a charset declaration inside a
     * {@code <meta http-equiv="content-type">} tag within the first
     * {@link #CHUNK_SIZE} bytes of the content.
     *
     * @param content raw page bytes
     * @return the declared charset name exactly as written in the page, or
     *         {@code null} if no such declaration was found in the prefix
     */
    private static String sniffCharacterEncoding(byte[] content) {
        int length = Math.min(content.length, CHUNK_SIZE);
        // Decode as ASCII: we only need to recognise the meta tag itself, and
        // the real encoding is not known yet.
        String prefix = new String(content, 0, length, ASCII);
        Matcher metaMatcher = metaPattern.matcher(prefix);
        if (!metaMatcher.find()) {
            return null;
        }
        Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
        return charsetMatcher.find() ? charsetMatcher.group(1) : null;
    }

    /**
     * Parses the given content as HTML.
     *
     * <p>The character encoding is guessed by an {@link EncodingDetector}
     * fed with its own auto-detected clues plus the charset sniffed from the
     * page's meta tag. Text and title are extracted unless the page's meta
     * tags request no-index; outlinks are extracted unless they request
     * no-follow. The result is then passed through the configured
     * {@link HtmlParseFilters}.
     *
     * @param content the fetched content to parse
     * @return the filtered parse result, or an empty parse result carrying
     *         the failure status if the base URL is malformed or parsing
     *         throws
     */
    public ParseResult getParse(Content content) {
        DocumentFragment root;
        URL base;
        HTMLMetaTags metaTags = new HTMLMetaTags();
        try {
            base = new URL(content.getBaseUrl());
        }
        catch (MalformedURLException e) {
            return this.failedParse(e, content);
        }
        String text = "";
        String title = "";
        Outlink[] outlinks = new Outlink[]{};
        Metadata metadata = new Metadata();
        try {
            byte[] contentInOctets = content.getContent();
            InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
            EncodingDetector detector = new EncodingDetector(this.conf);
            detector.autoDetectClues(content, true);
            detector.addClue(HtmlParser.sniffCharacterEncoding(contentInOctets), "sniffed");
            String encoding = detector.guessEncoding(content, this.defaultCharEncoding);
            // The same guessed encoding is recorded under both metadata keys.
            metadata.set("OriginalCharEncoding", encoding);
            metadata.set("CharEncodingForConversion", encoding);
            input.setEncoding(encoding);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Parsing...");
            }
            root = this.parse(input);
        }
        catch (IOException e) {
            return this.failedParse(e, content);
        }
        catch (DOMException e) {
            return this.failedParse(e, content);
        }
        catch (SAXException e) {
            return this.failedParse(e, content);
        }
        catch (Exception e) {
            // Anything else is unexpected, so it is logged as well as reported.
            LOG.error("Error: ", e);
            return this.failedParse(e, content);
        }
        HTMLMetaProcessor.getMetaTags(metaTags, root, base);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
        }
        if (!metaTags.getNoIndex()) {
            StringBuffer sb = new StringBuffer();
            if (LOG.isTraceEnabled()) {
                LOG.trace("Getting text...");
            }
            this.utils.getText(sb, root);
            text = sb.toString();
            // Reuse the same buffer for the title.
            sb.setLength(0);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Getting title...");
            }
            this.utils.getTitle(sb, root);
            title = sb.toString().trim();
        }
        if (!metaTags.getNoFollow()) {
            ArrayList<Outlink> l = new ArrayList<Outlink>();
            URL baseTag = this.utils.getBase(root);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Getting links...");
            }
            // A base URL found in the document takes precedence over the content's base URL.
            this.utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
            outlinks = l.toArray(new Outlink[l.size()]);
            if (LOG.isTraceEnabled()) {
                LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
            }
        }
        // NOTE(review): major code 1 / minor code 100 are presumably
        // ParseStatus.SUCCESS / SUCCESS_REDIRECT - confirm against ParseStatus.
        ParseStatus status = new ParseStatus(1);
        if (metaTags.getRefresh()) {
            status.setMinorCode((short)100);
            status.setArgs(new String[]{metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())});
        }
        ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata);
        ParseResult parseResult = ParseResult.createParseResult((String)content.getUrl(), (Parse)new ParseImpl(text, parseData));
        ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
        if (metaTags.getNoCache()) {
            // Propagate the no-cache directive to every parse in the result.
            for (Map.Entry entry : filteredParse) {
                ((Parse)entry.getValue()).getData().getParseMeta().set("caching.forbidden", this.cachingPolicy);
            }
        }
        return filteredParse;
    }

    /** Builds the empty parse result that reports {@code cause} as the failure for {@code content}. */
    private ParseResult failedParse(Throwable cause, Content content) {
        return new ParseStatus(cause).getEmptyParseResult(content.getUrl(), this.getConf());
    }

    /**
     * Parses the input with the configured implementation: TagSoup when
     * {@code parserImpl} equals "tagsoup" (ignoring case), NekoHTML otherwise.
     *
     * @param input the source to parse
     * @return the root fragment of the parsed document
     * @throws Exception if the underlying parser fails
     */
    private DocumentFragment parse(InputSource input) throws Exception {
        // Constant-first comparison so an unset parserImpl falls through to Neko
        // instead of throwing a NullPointerException.
        if ("tagsoup".equalsIgnoreCase(this.parserImpl)) {
            return this.parseTagSoup(input);
        }
        return this.parseNeko(input);
    }

    /**
     * Parses the input with TagSoup, feeding SAX events into a
     * {@link DOMBuilder} that populates a new document fragment.
     *
     * @param input the source to parse
     * @return the fragment populated by the builder
     * @throws Exception if TagSoup configuration or parsing fails
     */
    private DocumentFragment parseTagSoup(InputSource input) throws Exception {
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        DocumentFragment frag = doc.createDocumentFragment();
        DOMBuilder builder = new DOMBuilder((Document)doc, frag);
        org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
        reader.setContentHandler(builder);
        reader.setFeature("http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons", true);
        reader.setFeature("http://www.ccil.org/~cowan/tagsoup/features/bogons-empty", false);
        // The builder also receives lexical events.
        reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
        reader.parse(input);
        return frag;
    }

    /**
     * Parses the input with NekoHTML's {@link DOMFragmentParser}.
     *
     * <p>After the first parse, the parser is invoked repeatedly on the same
     * input; every non-empty fragment it yields is appended to the result.
     * An exception during these follow-up parses is logged and the nodes
     * collected so far are returned.
     *
     * @param input the source to parse
     * @return a fragment containing all parsed nodes
     * @throws Exception if the initial parse fails
     */
    private DocumentFragment parseNeko(InputSource input) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        try {
            parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true);
            parser.setFeature("http://cyberneko.org/html/features/augmentations", true);
            parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.defaultCharEncoding);
            parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
            parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
            parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
            parser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled());
        }
        catch (SAXException e) {
            // Configuration is best-effort: parsing continues with whatever
            // settings were applied, but the failure is no longer hidden.
            LOG.warn("Could not fully configure the Neko HTML parser; continuing with partial settings", e);
        }
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment res = doc.createDocumentFragment();
        DocumentFragment frag = doc.createDocumentFragment();
        parser.parse(input, frag);
        res.appendChild(frag);
        try {
            // Keep parsing until the parser yields an empty fragment.
            while (true) {
                frag = doc.createDocumentFragment();
                parser.parse(input, frag);
                if (!frag.hasChildNodes()) {
                    break;
                }
                if (LOG.isInfoEnabled()) {
                    LOG.info(" - new frag, " + frag.getChildNodes().getLength() + " nodes.");
                }
                res.appendChild(frag);
            }
        }
        catch (Exception e) {
            LOG.error("Error: ", e);
        }
        return res;
    }

    /**
     * Command-line entry point: parses the local file named by the first
     * argument as {@code text/html} and prints the parse data and text.
     *
     * @param args {@code args[0]} is the path of the file to parse
     * @throws Exception if the file cannot be read or parsing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: HtmlParser <file>");
            System.exit(-1);
        }
        String name = args[0];
        String url = "file:" + name;
        File file = new File(name);
        byte[] bytes = new byte[(int)file.length()];
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        try {
            in.readFully(bytes);
        }
        finally {
            // Always release the file handle, even if reading fails.
            in.close();
        }
        Configuration conf = NutchConfiguration.create();
        HtmlParser parser = new HtmlParser();
        parser.setConf(conf);
        Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
        System.out.println("data: " + parse.getData());
        System.out.println("text: " + parse.getText());
    }

    /**
     * Stores the configuration and (re)initialises everything derived from
     * it: the parse filters, the parser implementation name
     * ({@code parser.html.impl}, default "neko"), the default character
     * encoding ({@code parser.character.encoding.default}, default
     * "windows-1252"), the DOM utilities and the caching policy
     * ({@code parser.caching.forbidden.policy}, default "content").
     *
     * @param conf the configuration to use
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.htmlParseFilters = new HtmlParseFilters(this.getConf());
        this.parserImpl = this.getConf().get("parser.html.impl", "neko");
        this.defaultCharEncoding = this.getConf().get("parser.character.encoding.default", "windows-1252");
        this.utils = new DOMContentUtils(conf);
        this.cachingPolicy = this.getConf().get("parser.caching.forbidden.policy", "content");
    }

    /**
     * @return the configuration last passed to {@link #setConf(Configuration)},
     *         or {@code null} if it was never called
     */
    public Configuration getConf() {
        return this.conf;
    }
}

