/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Attributes;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.FormControl;
import au.id.jericho.lib.html.FormControlType;
import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.FormFields;
import au.id.jericho.lib.html.Segment;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTagType;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.modules.ProcessorURI;
import org.archive.modules.extractor.ExtractorCSS;
import org.archive.modules.extractor.ExtractorHTML;
import org.archive.modules.extractor.HTMLLinkContext;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.net.RobotsHonoringPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * HTML link extractor built on the Jericho HTML parser
 * ({@code au.id.jericho.lib.html}).
 *
 * <p>{@link #extract(ProcessorURI, CharSequence)} parses the document into
 * elements and dispatches each one to a tag-specific handler. Discovered URIs
 * are reported through {@code processLink}, {@code processEmbed} and
 * {@code processScriptCode}, which this class invokes on itself but does not
 * define (they come from {@link ExtractorHTML}); CSS content is handed to
 * {@link ExtractorCSS}.
 */
public class JerichoExtractorHTML
extends ExtractorHTML {
    private static final long serialVersionUID = 1684681316546343615L;
    private static final Logger logger = Logger.getLogger(JerichoExtractorHTML.class.getName());

    /** Attributes whose value is reported as an embed of the owning element. */
    private static final String[] EMBED_ATTRIBUTES = {
        "src", "lowsrc", "background", "cite", "longdesc", "usemap", "profile", "datasrc"
    };

    /** Attributes whose value is queued as a resource, to be resolved against {@code codebase}. */
    private static final String[] OBJECT_RESOURCE_ATTRIBUTES = {"classid", "data"};

    /** Shape a {@code value} attribute must have before it is speculatively treated as a link. */
    private static final String LIKELY_URI_VALUE =
        "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";

    /** Number of {@code form} elements from which a query URL has been built. */
    protected long numberOfFormsProcessed = 0L;

    /**
     * Collects the attributes whose key starts with {@code "on"}.
     *
     * @param attributes attributes of one element
     * @return the matching attributes in document order; empty if none
     */
    private static List<Attribute> findOnAttributes(Attributes attributes) {
        List<Attribute> result = new ArrayList<Attribute>();
        // Iterate as Object so this compiles whether or not Attributes is generic.
        for (Object candidate : attributes) {
            Attribute attr = (Attribute)candidate;
            if (attr.getKey().startsWith("on")) {
                result.add(attr);
            }
        }
        return result;
    }

    /**
     * Looks up an attribute and returns its value.
     *
     * @param attributes attributes of one element
     * @param name attribute name to look up
     * @return the value, or {@code null} when the attribute is absent or has no value
     */
    private static String attributeValue(Attributes attributes, String name) {
        Attribute attr = attributes.get(name);
        return attr == null ? null : attr.getValue();
    }

    /**
     * Appends {@code item} to {@code list}, creating the list on first use.
     *
     * @param list list to append to, or {@code null} if nothing was collected yet
     * @param item entry to append
     * @return the (possibly newly created) list
     */
    private static List<String> appendResource(List<String> list, String item) {
        List<String> target = list == null ? new ArrayList<String>() : list;
        target.add(item);
        return target;
    }

    /**
     * Extracts links from the attributes of an element that has no dedicated
     * handler.
     *
     * <p>Handled, in this order: {@code href}, {@code action}, {@code on*}
     * handlers (as script code), the embed attributes ({@code src},
     * {@code lowsrc}, ...), {@code codebase}, the resource attributes
     * ({@code classid}, {@code data}, {@code archive}, {@code code}),
     * {@code value} (only when {@code EXTRACT_VALUE_ATTRIBUTES} is set) and
     * {@code style}. Resources are resolved against {@code codebase} last.
     *
     * @param curi URI being processed; receives links, base URI and failures
     * @param element element the attributes belong to
     * @param attributes attributes of {@code element}
     */
    protected void processGeneralTag(ProcessorURI curi, Element element, Attributes attributes) {
        String elementName = element.getName();
        boolean framesAsEmbeds = (Boolean)curi.get(this, TREAT_FRAMES_AS_EMBED_LINKS);
        boolean ignoreFormActions = (Boolean)curi.get(this, IGNORE_FORM_ACTION_URLS);
        boolean overlyEagerLinkDetection = (Boolean)curi.get(this, EXTRACT_VALUE_ATTRIBUTES);

        // HREF: <link> targets are embeds, everything else is a navigation link.
        Attribute attr = attributes.get("href");
        String value = attr == null ? null : attr.getValue();
        if (value != null) {
            CharSequence context = elementContext(elementName, attr.getKey());
            if ("link".equals(elementName)) {
                this.processEmbed(curi, value, context);
            } else {
                this.processLink(curi, value, context);
            }
            if ("base".equals(elementName)) {
                try {
                    curi.setBaseURI(UURIFactory.getInstance(value));
                }
                catch (URIException e) {
                    this.logUriError(e, curi, value);
                }
            }
        }

        // ACTION
        if (!ignoreFormActions) {
            attr = attributes.get("action");
            value = attr == null ? null : attr.getValue();
            if (value != null) {
                this.processLink(curi, value, elementContext(elementName, attr.getKey()));
            }
        }

        // ON* event handlers carry script code.
        for (Attribute handler : findOnAttributes(attributes)) {
            Segment code = handler.getValueSegment();
            if (code != null) {
                this.processScriptCode(curi, (CharSequence)code);
            }
        }

        // SRC and friends. Every attribute is examined on its own so that an
        // element carrying several of them (e.g. src + lowsrc) yields all of
        // its URIs, and a valueless one does not mask the others.
        boolean isFrame = "frame".equals(elementName) || "iframe".equals(elementName);
        Hop embedHop = !framesAsEmbeds && isFrame ? Hop.NAVLINK : Hop.EMBED;
        for (String name : EMBED_ATTRIBUTES) {
            attr = attributes.get(name);
            value = attr == null ? null : attr.getValue();
            if (value != null) {
                this.processEmbed(curi, value, elementContext(elementName, attr.getKey()), embedHop);
            }
        }

        // CODEBASE
        String codebase = null;
        attr = attributes.get("codebase");
        value = attr == null ? null : attr.getValue();
        if (value != null) {
            codebase = StringEscapeUtils.unescapeHtml(value);
            this.processEmbed(curi, codebase, elementContext(elementName, attr.getKey()));
        }

        // CLASSID, DATA, ARCHIVE, CODE (resolved after VALUE and STYLE below).
        List<String> resources = collectResources(elementName, attributes);

        // VALUE: test the cheap flag before running the regular expression.
        if (overlyEagerLinkDetection) {
            attr = attributes.get("value");
            value = attr == null ? null : attr.getValue();
            if (value != null && TextUtils.matches(LIKELY_URI_VALUE, (CharSequence)value)) {
                this.processLink(curi, value, elementContext(elementName, attr.getKey()));
            }
        }

        // STYLE
        String style = attributeValue(attributes, "style");
        if (style != null) {
            this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(this.uriErrors, curi, style);
        }

        if (resources != null) {
            processResources(curi, element, codebase, resources);
        }
    }

    /**
     * Gathers the values of {@code classid}, {@code data}, {@code archive}
     * and {@code code}.
     *
     * @param elementName name of the element owning the attributes
     * @param attributes attributes of that element
     * @return the raw resource strings, or {@code null} when none of the
     *         attributes carries a value
     */
    private static List<String> collectResources(String elementName, Attributes attributes) {
        List<String> resources = null;

        // classid and data are independent: <object classid=... data=...> has both.
        for (String name : OBJECT_RESOURCE_ATTRIBUTES) {
            String value = attributeValue(attributes, name);
            if (value != null) {
                resources = appendResource(resources, value);
            }
        }

        String archive = attributeValue(attributes, "archive");
        if (archive != null) {
            if (resources == null) {
                resources = new ArrayList<String>();
            }
            // The attribute may list several entries separated by whitespace.
            String[] entries = TextUtils.split("\\s", (CharSequence)archive);
            for (int i = 0; i < entries.length; ++i) {
                resources.add(entries[i]);
            }
        }

        String code = attributeValue(attributes, "code");
        if (code != null) {
            // An applet's code attribute may omit the .class suffix.
            boolean needsSuffix = "applet".equals(elementName) && !code.endsWith(".class");
            resources = appendResource(resources, needsSuffix ? code + ".class" : code);
        }
        return resources;
    }

    /**
     * Reports each collected resource as an embed, resolving it against
     * {@code codebase} when one was given.
     *
     * <p>A {@link URIException} is recorded in the URI's non-fatal failures
     * and an {@link IllegalArgumentException} is logged; either one ends the
     * processing of the remaining resources.
     *
     * @param curi URI being processed
     * @param element element the resources came from; passed as link context
     * @param codebase unescaped {@code codebase} value, or {@code null}
     * @param resources raw resource strings to report
     */
    private void processResources(ProcessorURI curi, Element element, String codebase,
            List<String> resources) {
        UURI codebaseURI = null;
        String res = null;
        try {
            if (codebase != null) {
                codebaseURI = UURIFactory.getInstance((UURI)curi.getUURI(), codebase);
            }
            for (String raw : resources) {
                // Assigned step by step so the catch block reports the candidate in flight.
                res = raw;
                res = StringEscapeUtils.unescapeHtml(res);
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                this.processEmbed(curi, res, (CharSequence)element);
            }
        }
        catch (URIException e) {
            curi.getNonFatalFailures().add(e);
        }
        catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\ncodebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e);
        }
    }

    /**
     * Handles a {@code meta} element.
     *
     * <p>For {@code name="robots"} the content is stored under the
     * {@code "meta-robots"} data key; unless the honoring policy is of type
     * {@code IGNORE} or {@code CUSTOM}, a content containing {@code nofollow}
     * or {@code none} asks the caller to stop. For {@code http-equiv="refresh"}
     * the text after the first {@code '='} of the content is added as a link.
     * Both the name and the http-equiv value are matched case-insensitively.
     *
     * @param curi URI being processed
     * @param element the {@code meta} element
     * @return {@code true} if extraction of this document should stop
     */
    protected boolean processMeta(ProcessorURI curi, Element element) {
        String name = element.getAttributeValue("name");
        String httpEquiv = element.getAttributeValue("http-equiv");
        String content = element.getAttributeValue("content");
        if (content == null) {
            return false;
        }

        if ("robots".equalsIgnoreCase(name)) {
            curi.getData().put("meta-robots", content);
            RobotsHonoringPolicy policy = this.honoringPolicy;
            boolean directivesIgnored = policy != null
                && (policy.isType(curi, RobotsHonoringPolicy.Type.IGNORE)
                    || policy.isType(curi, RobotsHonoringPolicy.Type.CUSTOM));
            if (!directivesIgnored) {
                String contentLower = content.toLowerCase();
                if (contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString());
                    }
                    return true;
                }
            }
        }

        if ("refresh".equalsIgnoreCase(httpEquiv)) {
            // A refresh without '=' (e.g. content="5") only reloads the page
            // and carries no target URI.
            int urlIndex = content.indexOf("=") + 1;
            if (urlIndex > 0) {
                String refreshUri = content.substring(urlIndex);
                try {
                    int max = this.uriErrors.getMaxOutlinks(curi);
                    Link.addRelativeToBase(curi, max, refreshUri, HTMLLinkContext.META, Hop.REFER);
                }
                catch (URIException e) {
                    this.logUriError(e, curi, refreshUri);
                }
            }
        }
        return false;
    }

    /**
     * Handles a {@code script} element: its attributes are processed like
     * those of any other tag, then its content is processed as script code.
     *
     * @param curi URI being processed
     * @param element the {@code script} element
     */
    protected void processScript(ProcessorURI curi, Element element) {
        this.processGeneralTag(curi, element, element.getAttributes());
        this.processScriptCode(curi, (CharSequence)element.getContent());
    }

    /**
     * Handles a {@code style} element: its attributes are processed like
     * those of any other tag, then its content is handed to
     * {@link ExtractorCSS} and the result is added to the link counter.
     *
     * @param curi URI being processed
     * @param element the {@code style} element
     */
    protected void processStyle(ProcessorURI curi, Element element) {
        this.processGeneralTag(curi, element, element.getAttributes());
        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(this.uriErrors, curi, (CharSequence)element.getContent());
    }

    /**
     * Handles a {@code form} element by building a query URL from its action
     * and its controls and reporting that URL as a link.
     *
     * <p>Nothing is done when {@code IGNORE_FORM_ACTION_URLS} is set, or when
     * {@code EXTRACT_ONLY_FORM_GETS} is set and the method (default
     * {@code GET}) is something else. Submit controls contribute their
     * predefined values, all other controls their current values; a control
     * without values contributes {@code name=}.
     *
     * <p>NOTE(review): control names and values are appended verbatim, without
     * URL-encoding -- confirm that downstream URI handling copes with that.
     *
     * @param curi URI being processed
     * @param element the {@code form} element
     */
    protected void processForm(ProcessorURI curi, Element element) {
        boolean ignoreFormActions = (Boolean)curi.get(this, IGNORE_FORM_ACTION_URLS);
        if (ignoreFormActions) {
            return;
        }
        String method = StringUtils.defaultIfEmpty(element.getAttributeValue("method"), "GET");
        if (((Boolean)curi.get(this, EXTRACT_ONLY_FORM_GETS)).booleanValue() && !"GET".equalsIgnoreCase(method)) {
            return;
        }
        ++this.numberOfFormsProcessed;

        // Every pair is written as "&name=value"; the leading '&' is turned
        // into '?' afterwards when the action has no query part of its own.
        StringBuilder query = new StringBuilder();
        for (Object fieldObject : element.findFormFields()) {
            FormField formField = (FormField)fieldObject;
            for (Object controlObject : formField.getFormControls()) {
                FormControl formControl = (FormControl)controlObject;
                String controlName = formControl.getName();
                Collection<?> controlValues = formControl.getFormControlType() != FormControlType.SUBMIT
                    ? formControl.getValues()
                    : formControl.getPredefinedValues();
                if (controlValues.isEmpty()) {
                    query.append('&').append(controlName).append('=');
                    continue;
                }
                for (Object controlValue : controlValues) {
                    query.append('&').append(controlName).append('=').append(controlValue);
                }
            }
        }

        String action = element.getAttributeValue("action");
        boolean actionHasQuery = action != null && action.contains("?");
        if (!actionHasQuery && query.length() > 0) {
            query.setCharAt(0, '?');
        }
        String queryURL = action == null ? query.toString() : action + query;

        String name = element.getAttributeValue("name");
        CharSequence context = elementContext(element.getName(), "name=" + name);
        this.processLink(curi, queryURL, context);
    }

    /**
     * Parses {@code cs} and dispatches every normal start-tag element to its
     * handler: {@code meta}, {@code script}, {@code style} and {@code form}
     * have dedicated handlers; any other element is passed to
     * {@link #processGeneralTag} if it has at least one attribute.
     * Processing stops at the first {@code meta} element for which
     * {@link #processMeta} returns {@code true}.
     *
     * @param curi URI being processed
     * @param cs document content
     */
    @Override
    void extract(ProcessorURI curi, CharSequence cs) {
        Source source = new Source(cs);
        List<?> elements = source.findAllElements(StartTagType.NORMAL);
        for (Object candidate : elements) {
            Element element = (Element)candidate;
            String elementName = element.getName();
            if (elementName.equals("meta")) {
                if (this.processMeta(curi, element)) {
                    // The robots meta-tag asked us not to follow links.
                    break;
                }
            } else if (elementName.equals("script")) {
                this.processScript(curi, element);
            } else if (elementName.equals("style")) {
                this.processStyle(curi, element);
            } else if (elementName.equals("form")) {
                this.processForm(curi, element);
            } else {
                Attributes attributes = element.getAttributes();
                if (!attributes.isEmpty()) {
                    this.processGeneralTag(curi, element, attributes);
                }
            }
        }
    }

    /**
     * Builds the textual report of this processor's counters.
     *
     * @return report listing URIs handled, forms processed and links extracted
     */
    @Override
    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
        ret.append("  Function:          Link extraction on HTML documents\n");
        ret.append("  ProcessorURIs handled: " + this.numberOfCURIsHandled + "\n");
        ret.append("  Forms processed:   " + this.numberOfFormsProcessed + "\n");
        ret.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }
}

