/*
 * Decompiled with CFR 0.152.
 */
package org.creativecommons.nutch;

import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * HTML parse filter that looks for a Creative Commons license in a parsed page and records
 * what it finds in the page's parse metadata.
 *
 * <p>The DOM is walked once. License candidates are collected from RDF embedded in comments
 * and from anchors pointing at {@code creativecommons.org/licenses/...}; see {@link Walker}
 * for the precedence rules and the metadata keys that are written.
 */
public class CCParseFilter
implements HtmlParseFilter {
    public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);

    /** Maps DCMI type URIs to the short work-type names stored under {@code Work-Type}. */
    private static final Map<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();

    private Configuration conf;

    static {
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
        WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
    }

    /**
     * Scans the parsed document for license information and adds it to the metadata of the
     * parse registered under the content's URL.
     *
     * <p>If the content's base URL is malformed, or the walker raises a
     * {@link ParseException}, the entry for the content's URL is replaced by the empty parse
     * derived from that failure.
     *
     * @param content     fetched content; supplies the URL and the base URL
     * @param parseResult parse result to annotate; the same instance is returned
     * @param metaTags    HTML meta tags of the page (not used by this filter)
     * @param doc         DOM of the parsed page
     * @return {@code parseResult}
     */
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        Parse parse = parseResult.get(content.getUrl());
        URL base;
        try {
            base = new URL(content.getBaseUrl());
        }
        catch (MalformedURLException e) {
            return this.replaceWithEmptyParse(content, parseResult, e);
        }
        try {
            Walker.walk(doc, base, parse.getData().getParseMeta(), this.getConf());
        }
        catch (ParseException e) {
            return this.replaceWithEmptyParse(content, parseResult, e);
        }
        return parseResult;
    }

    /** Overwrites the parse stored for the content's URL with the empty parse built from {@code cause}. */
    private ParseResult replaceWithEmptyParse(Content content, ParseResult parseResult, Exception cause) {
        Parse emptyParse = new ParseStatus(cause).getEmptyParse(this.getConf());
        parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), emptyParse.getData());
        return parseResult;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Single-use DOM visitor that collects license candidates and a work type.
     *
     * <p>When reporting, the license found in RDF wins over an anchor with
     * {@code rel="license"}, which wins over any other anchor pointing at a license URL.
     */
    public static class Walker {
        private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory.newInstance();
        private static final String CC_NS = "http://web.resource.org/cc/";
        private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
        private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
        private static final String LICENSE_PATH_PREFIX = "/licenses/";

        private URL base;
        private String rdfLicense;
        private URL relLicense;
        private URL anchorLicense;
        private String workType;

        static {
            FACTORY.setNamespaceAware(true);
            // The XML handed to this factory comes from comments in crawled pages, so it must
            // not be allowed to pull in external entities or DTDs (XXE).
            Walker.setFeatureQuietly("http://javax.xml.XMLConstants/feature/secure-processing", true);
            Walker.setFeatureQuietly("http://xml.org/sax/features/external-general-entities", false);
            Walker.setFeatureQuietly("http://xml.org/sax/features/external-parameter-entities", false);
            Walker.setFeatureQuietly("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        }

        private Walker(URL base) {
            this.base = base;
        }

        /** Applies a parser feature; an implementation that rejects it is logged, not fatal. */
        private static void setFeatureQuietly(String feature, boolean enabled) {
            try {
                FACTORY.setFeature(feature, enabled);
            }
            catch (ParserConfigurationException e) {
                LOG.warn("CC: XML parser does not support feature {}: {}", feature, e.toString());
            }
        }

        /**
         * Walks {@code doc} and records the findings in {@code metadata}.
         *
         * <p>Adds {@code License-Url} and {@code License-Location} ({@code "rdf"},
         * {@code "rel"} or {@code "a"}) when a license was found, and {@code Work-Type} when
         * a work type was found.
         *
         * @param doc      root node to walk
         * @param base     base URL used to resolve relative anchors
         * @param metadata metadata receiving the findings
         * @param conf     configuration; {@code creativecommons.exclude.unlicensed} is read
         * @throws ParseException if no license was found and
         *         {@code creativecommons.exclude.unlicensed} is {@code true}
         */
        public static void walk(Node doc, URL base, Metadata metadata, Configuration conf) throws ParseException {
            Walker walker = new Walker(base);
            walker.walk(doc);

            String licenseUrl = null;
            String licenseLocation = null;
            if (walker.rdfLicense != null) {
                licenseLocation = "rdf";
                licenseUrl = walker.rdfLicense;
            } else if (walker.relLicense != null) {
                licenseLocation = "rel";
                licenseUrl = walker.relLicense.toString();
            } else if (walker.anchorLicense != null) {
                licenseLocation = "a";
                licenseUrl = walker.anchorLicense.toString();
            } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
                throw new ParseException("No CC license.  Excluding.");
            }

            if (licenseUrl != null) {
                LOG.info("CC: found {} in {} of {}", licenseUrl, licenseLocation, base);
                metadata.add("License-Url", licenseUrl);
                metadata.add("License-Location", licenseLocation);
            }
            if (walker.workType != null) {
                LOG.info("CC: found {} in {}", walker.workType, base);
                metadata.add("Work-Type", walker.workType);
            }
        }

        /** Visits {@code node} and then, recursively, each of its children. */
        private void walk(Node node) {
            if (node instanceof Element) {
                this.findLicenseUrl((Element)node);
            }
            if (node instanceof Comment) {
                this.findRdf(((Comment)node).getData());
            }
            NodeList children = node.getChildNodes();
            for (int i = 0; children != null && i < children.getLength(); ++i) {
                this.walk(children.item(i));
            }
        }

        /**
         * Records {@code element} as a license link if it is an anchor whose {@code href}
         * resolves to a Creative Commons license URL. Only the first {@code rel="license"}
         * anchor and the first other anchor are kept.
         */
        private void findLicenseUrl(Element element) {
            if (!"a".equalsIgnoreCase(element.getTagName())) {
                return;
            }
            // getAttribute yields "" for an absent attribute, which would resolve to the base
            // URL itself, so presence is tested explicitly.
            if (!element.hasAttribute("href")) {
                return;
            }
            String href = element.getAttribute("href");
            if (href == null) {
                return;
            }
            URL url;
            try {
                url = new URL(this.base, href);
            }
            catch (MalformedURLException ignored) {
                // An unresolvable link cannot be a license link; keep walking.
                return;
            }
            if (!Walker.isLicenseUrl(url)) {
                return;
            }
            if ("license".equals(element.getAttribute("rel")) && this.relLicense == null) {
                this.relLicense = url;
            } else if (this.anchorLicense == null) {
                this.anchorLicense = url;
            }
        }

        /**
         * Tells whether {@code url} is an http or https URL on {@code creativecommons.org}
         * whose path starts with {@code /licenses/} and continues beyond that prefix.
         */
        private static boolean isLicenseUrl(URL url) {
            String protocol = url.getProtocol();
            boolean webScheme = "http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol);
            String path = url.getPath();
            return webScheme
                && "creativecommons.org".equalsIgnoreCase(url.getHost())
                && path != null
                && path.startsWith(LICENSE_PATH_PREFIX)
                && path.length() > LICENSE_PATH_PREFIX.length();
        }

        /**
         * Parses {@code comment} as XML if it mentions both {@code RDF} and the CC namespace,
         * then extracts the license URI and the work type from it.
         *
         * <p>Text that fails to parse, or that does not contain exactly one {@code rdf:RDF}
         * element, is logged at warn level and ignored.
         */
        private void findRdf(String comment) {
            // Cheap textual pre-check so that ordinary comments never reach the XML parser.
            if (comment == null || comment.indexOf("RDF") < 0 || comment.indexOf(CC_NS) < 0) {
                return;
            }
            Document doc;
            try {
                DocumentBuilder parser = FACTORY.newDocumentBuilder();
                doc = parser.parse(new InputSource(new StringReader(comment)));
            }
            catch (Exception e) {
                LOG.warn("CC: Failed to parse RDF in {}: {}", this.base, e.toString());
                return;
            }
            NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
            if (roots.getLength() != 1) {
                LOG.warn("CC: No RDF root in {}", this.base);
                return;
            }
            Element rdf = (Element)roots.item(0);

            // The last cc:License carrying an rdf:about wins.
            NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
            for (int i = 0; i < licenses.getLength(); ++i) {
                String about = Walker.attributeOrNull((Element)licenses.item(i), RDF_NS, "about");
                if (about != null) {
                    this.rdfLicense = about;
                }
            }

            // dc:type is looked up across the whole RDF root, but only when at least one
            // cc:Work is present. The last dc:type carrying an rdf:resource decides; a URI
            // missing from WORK_TYPE_NAMES yields null.
            if (rdf.getElementsByTagNameNS(CC_NS, "Work").getLength() > 0) {
                NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
                for (int i = 0; i < types.getLength(); ++i) {
                    String workUri = Walker.attributeOrNull((Element)types.item(i), RDF_NS, "resource");
                    if (workUri != null) {
                        this.workType = WORK_TYPE_NAMES.get(workUri);
                    }
                }
            }
        }

        /** Returns the value of the namespaced attribute, or {@code null} if the element lacks it. */
        private static String attributeOrNull(Element element, String namespace, String localName) {
            return element.hasAttributeNS(namespace, localName)
                ? element.getAttributeNS(namespace, localName)
                : null;
        }
    }
}

