/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.ProcessorURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.ExtractorCSS;
import org.archive.modules.extractor.ExtractorJS;
import org.archive.modules.extractor.HTMLLinkContext;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.net.RobotsHonoringPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.Expert;
import org.archive.state.Immutable;
import org.archive.state.Initializable;
import org.archive.state.Key;
import org.archive.state.KeyManager;
import org.archive.state.StateProvider;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

public class ExtractorHTML
extends ContentExtractor
implements Initializable {
    private static final long serialVersionUID = 2L;
    private static Logger logger = Logger.getLogger(ExtractorHTML.class.getName());

    // Placeholder tokens embedded in the regex templates below. initialTasks()
    // replaces each token with the configured numeric limit before compiling.
    private static final String MAX_ELEMENT_REPLACE = "MAX_ELEMENT";
    private static final String MAX_ATTR_NAME_REPLACE = "MAX_ATTR_NAME";
    private static final String MAX_ATTR_VAL_REPLACE = "MAX_ATTR_VAL";

    /** Key under which processMeta() stores the content of a robots meta tag in the URI's data map. */
    public static final String A_META_ROBOTS = "meta-robots";

    /** Upper bound substituted for MAX_ELEMENT in the tag regex (made with 1024 — presumably the default; confirm against Key). */
    @Immutable
    @Expert
    public static final Key<Integer> MAX_ELEMENT_LENGTH = Key.make((int)1024);

    /*
     * Template for the tag-level pattern. Capturing groups, as used by extract():
     *   1: whole script element (open tag through "</script")
     *   2: script open tag
     *   3: whole style element (open tag through "</style")
     *   4: style open tag
     *   5: any other tag that has attributes (element name plus attribute text)
     *   6: element name of group 5
     *   7: set only when that element name is "meta"
     *   8: HTML comment
     */
    static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>.*?</style)|(((meta)|(?:\\w{1,MAX_ELEMENT}))\\s+[^>]*+)|(!--.*?--))>";

    /** Upper bound substituted for MAX_ATTR_NAME in the attribute regex (made with 1024). */
    @Immutable
    @Expert
    public static final Key<Integer> MAX_ATTR_NAME_LENGTH = Key.make((int)1024);

    /** Upper bound substituted for MAX_ATTR_VAL in the attribute regex (made with 16384). */
    @Immutable
    @Expert
    public static final Key<Integer> MAX_ATTR_VAL_LENGTH = Key.make((int)16384);

    /*
     * Template for the attribute-level pattern. Capturing groups:
     *   1: attribute name (whichever alternative matched)
     *   2: href            3: action          4: on* (event handlers)
     *   5: src, lowsrc, background, cite, longdesc, usemap, profile, datasrc
     *   6: codebase        7: classid or data 8: archive
     *   9: code           10: value          11: style
     *  12: method         13: any other attribute name
     *  14: double-quoted value, 15: single-quoted value, 16: unquoted value
     */
    static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\b((href)|(action)|(on\\w*)|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc))|(codebase)|((?:classid)|(?:data))|(archive)|(code)|(value)|(style)|(method)|([-\\w]{1,MAX_ATTR_NAME}))\\s*=\\s*(?:(?:\"(.{0,MAX_ATTR_VAL}?)(?:\"|$))|(?:'(.{0,MAX_ATTR_VAL}?)(?:'|$))|(\\S{1,MAX_ATTR_VAL}))";

    // A "value" attribute is only treated as a link when it matches this pattern (see processGeneralTag).
    static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
    // Separator regex used to split the "archive" attribute into several resources.
    static final String WHITESPACE = "\\s";
    // Suffix appended to an applet's "code" attribute when it does not already end with it.
    static final String CLASSEXT = ".class";
    // Element names compared case-insensitively against the current tag in processGeneralTag.
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";
    static final String FRAME = "frame";
    static final String IFRAME = "iframe";

    /** When false, the src of frame/iframe elements is extracted with Hop.NAVLINK instead of Hop.EMBED. */
    @Expert
    public static final Key<Boolean> TREAT_FRAMES_AS_EMBED_LINKS = Key.make((boolean)true);
    /** When true, "action" attributes are skipped entirely. */
    @Expert
    public static final Key<Boolean> IGNORE_FORM_ACTION_URLS = Key.make((boolean)false);
    /** When true, an action is only extracted if the tag has no method attribute or its method is GET. */
    @Expert
    public static final Key<Boolean> EXTRACT_ONLY_FORM_GETS = Key.make((boolean)true);
    /** Gates processScriptCode(): script text is only handed to ExtractorJS when true. */
    @Expert
    public static final Key<Boolean> EXTRACT_JAVASCRIPT = Key.make((boolean)true);
    /** When true, "value" attributes that look like URI paths are extracted as links. */
    @Expert
    public static final Key<Boolean> EXTRACT_VALUE_ATTRIBUTES = Key.make((boolean)true);
    /** When true, shouldExtract() rejects URIs whose path extension suggests non-HTML content. */
    @Expert
    public static final Key<Boolean> IGNORE_UNEXPECTED_HTML = Key.make((boolean)true);
    /** Key for the robots policy; read once in initialTasks() and cached in honoringPolicy. */
    public static final Key<RobotsHonoringPolicy> ROBOTS_HONORING_POLICY = Key.makeAuto(RobotsHonoringPolicy.class);

    // Counters surfaced by report(). They are plain (unsynchronized) longs.
    protected long numberOfCURIsHandled = 0L;
    protected long numberOfLinksExtracted = 0L;
    // Set in initialTasks(); consulted by processMeta() when a robots meta tag is found.
    RobotsHonoringPolicy honoringPolicy;
    // Compiled in initialTasks() from the templates above, with the length placeholders filled in.
    private Pattern relevantTagExtractor;
    private Pattern eachAttributeExtractor;
    // Matched against link values in processLink() to detect "javascript:" pseudo-URIs.
    static final String JAVASCRIPT = "(?i)^javascript:.*";
    // Path extensions for which isHtmlExpectedHere() answers false.
    static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Reads the robots policy and the three length limits from the given
     * provider, then compiles the tag and attribute patterns with the
     * limits substituted for their placeholder tokens.
     *
     * @param global provider the settings are read from
     */
    public void initialTasks(StateProvider global) {
        super.initialTasks(global);
        this.honoringPolicy = (RobotsHonoringPolicy) global.get((Object) this, ROBOTS_HONORING_POLICY);

        // Read every limit up front, before any pattern is compiled.
        final int elementLimit = (Integer) global.get((Object) this, MAX_ELEMENT_LENGTH);
        final int attrNameLimit = (Integer) global.get((Object) this, MAX_ATTR_NAME_LENGTH);
        final int attrValueLimit = (Integer) global.get((Object) this, MAX_ATTR_VAL_LENGTH);

        final String tagRegex = RELEVANT_TAG_EXTRACTOR
                .replace(MAX_ELEMENT_REPLACE, String.valueOf(elementLimit));
        this.relevantTagExtractor = Pattern.compile(tagRegex);

        final String attributeRegex = EACH_ATTRIBUTE_EXTRACTOR
                .replace(MAX_ATTR_NAME_REPLACE, String.valueOf(attrNameLimit))
                .replace(MAX_ATTR_VAL_REPLACE, String.valueOf(attrValueLimit));
        this.eachAttributeExtractor = Pattern.compile(attributeRegex);
    }

    /**
     * Walks every attribute of one tag and extracts links from the ones that
     * can carry a URI. Which branch handles an attribute is decided by which
     * capturing group of the attribute pattern matched its name.
     *
     * Some attributes cannot be acted on immediately: "codebase" affects how
     * classid/data/archive/code resources are resolved, and "method" decides
     * whether a form "action" is extracted. Those are collected during the
     * loop and processed after it.
     *
     * @param curi    URI being processed; receives extracted links and errors
     * @param element element name of the tag (used for contexts and tag-specific rules)
     * @param cs      text of the tag whose attributes are scanned
     */
    protected void processGeneralTag(ProcessorURI curi, CharSequence element, CharSequence cs) {
        Matcher attr = this.eachAttributeExtractor.matcher(cs);
        // State deferred until all attributes have been seen.
        String codebase = null;
        ArrayList<String> resources = null;
        CharSequence action = null;
        CharSequence actionContext = null;
        Object method = null;
        boolean framesAsEmbeds = (Boolean)curi.get(this, TREAT_FRAMES_AS_EMBED_LINKS);
        boolean ignoreFormActions = (Boolean)curi.get(this, IGNORE_FORM_ACTION_URLS);
        boolean extractValueAttributes = (Boolean)curi.get(this, EXTRACT_VALUE_ATTRIBUTES);
        String elementStr = ((Object)element).toString();
        while (attr.find()) {
            CharSequence context;
            // The value is in group 14 (double-quoted), 15 (single-quoted) or 16 (unquoted).
            int valueGroup = attr.start(14) > -1 ? 14 : (attr.start(15) > -1 ? 15 : 16);
            int start = attr.start(valueGroup);
            int end = attr.end(valueGroup);
            assert (start >= 0) : "Start is: " + start + ", " + curi;
            assert (end >= 0) : "End is :" + end + ", " + curi;
            CharSequence value = cs.subSequence(start, end);
            value = TextUtils.unescapeHtml((CharSequence)value);
            // Group 2: href.
            if (attr.start(2) > -1) {
                context = ExtractorHTML.elementContext(element, attr.group(2));
                // href on a <link> element is an embed; on anything else it is a navigation link.
                if (elementStr.equalsIgnoreCase(LINK)) {
                    this.processEmbed(curi, value, context);
                } else {
                    this.processLink(curi, value, context);
                }
                // href on <base> additionally becomes the base URI of the document.
                if (!elementStr.equalsIgnoreCase(BASE)) continue;
                try {
                    UURI base = UURIFactory.getInstance((String)((Object)value).toString());
                    curi.setBaseURI(base);
                }
                catch (URIException e) {
                    this.logUriError(e, curi, value);
                }
                continue;
            }
            // Group 3: action. Remembered only; whether it is extracted depends on "method" (see end of method).
            if (attr.start(3) > -1) {
                if (ignoreFormActions) continue;
                action = value;
                actionContext = ExtractorHTML.elementContext(element, attr.group(3));
                continue;
            }
            // Group 4: on* event handler; its value is treated as script code.
            if (attr.start(4) > -1) {
                this.processScriptCode(curi, value);
                continue;
            }
            // Group 5: src and the other embed-style attributes.
            if (attr.start(5) > -1) {
                context = ExtractorHTML.elementContext(element, attr.group(5));
                // frame/iframe sources become navigation links only when frames are not treated as embeds.
                Hop hop = !framesAsEmbeds && (elementStr.equalsIgnoreCase(FRAME) || elementStr.equalsIgnoreCase(IFRAME)) ? Hop.NAVLINK : Hop.EMBED;
                this.processEmbed(curi, value, context, hop);
                continue;
            }
            // Group 6: codebase. Extracted as an embed and kept for resolving the collected resources.
            if (attr.start(6) > -1) {
                codebase = value instanceof String ? (String)value : ((Object)value).toString();
                context = ExtractorHTML.elementContext(element, attr.group(6));
                this.processEmbed(curi, codebase, context);
                continue;
            }
            // Group 7: classid or data; a single resource.
            if (attr.start(7) > -1) {
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                resources.add(((Object)value).toString());
                continue;
            }
            // Group 8: archive; split on whitespace into several resources.
            if (attr.start(8) > -1) {
                if (resources == null) {
                    resources = new ArrayList();
                }
                String[] multi = TextUtils.split((String)WHITESPACE, (CharSequence)value);
                for (int i = 0; i < multi.length; ++i) {
                    resources.add(multi[i]);
                }
                continue;
            }
            // Group 9: code; on <applet> the ".class" suffix is appended when missing.
            if (attr.start(9) > -1) {
                if (resources == null) {
                    resources = new ArrayList();
                }
                if (elementStr.equalsIgnoreCase(APPLET) && !((Object)value).toString().toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(((Object)value).toString() + CLASSEXT);
                    continue;
                }
                resources.add(((Object)value).toString());
                continue;
            }
            // Group 10: value; only extracted when enabled and the text looks like a URI path.
            if (attr.start(10) > -1) {
                if (!extractValueAttributes || !TextUtils.matches((String)LIKELY_URI_PATH, (CharSequence)value)) continue;
                context = ExtractorHTML.elementContext(element, attr.group(10));
                this.processLink(curi, value, context);
                continue;
            }
            // Group 11: inline style; handed to the CSS extractor, which returns a count added to the total.
            if (attr.start(11) > -1) {
                this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(this.uriErrors, curi, value);
                continue;
            }
            // Group 12: method; remembered for the form-action decision below.
            if (attr.start(12) > -1) {
                method = value;
                continue;
            }
            // Group 13: any other attribute; nothing is done with it (this statement has no effect).
            if (attr.start(13) <= -1) continue;
        }
        TextUtils.recycleMatcher((Matcher)attr);
        // Deferred resources (classid/data/archive/code), resolved against codebase when one was given.
        if (resources != null) {
            Iterator iter = resources.iterator();
            UURI codebaseURI = null;
            String res = null;
            try {
                if (codebase != null) {
                    codebaseURI = UURIFactory.getInstance((UURI)curi.getUURI(), codebase);
                }
                while (iter.hasNext()) {
                    res = iter.next().toString();
                    res = (String)TextUtils.unescapeHtml((CharSequence)res);
                    if (codebaseURI != null) {
                        res = codebaseURI.resolve(res).toString();
                    }
                    // The context passed here is the bare element name, without an attribute.
                    this.processEmbed(curi, res, element);
                }
            }
            // NOTE: the try wraps the whole loop, so the first failure skips all remaining resources.
            catch (URIException e) {
                curi.getNonFatalFailures().add(e);
            }
            catch (IllegalArgumentException e) {
                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\ncodebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e);
            }
        }
        // Deferred form action: extracted when no method was given, the method is GET,
        // or the GET-only restriction is switched off.
        if (action != null && (method == null || "GET".equalsIgnoreCase(method.toString()) || !((Boolean)curi.get(this, EXTRACT_ONLY_FORM_GETS)).booleanValue())) {
            this.processLink(curi, action, actionContext);
        }
    }

    /**
     * Hands a piece of script text to the JavaScript extractor and adds the
     * count it returns to the extracted-links total. Does nothing when
     * EXTRACT_JAVASCRIPT is false for this URI.
     *
     * @param curi URI being processed
     * @param cs   script text to examine
     */
    protected void processScriptCode(ProcessorURI curi, CharSequence cs) {
        final boolean javascriptEnabled = ((Boolean) curi.get(this, EXTRACT_JAVASCRIPT)).booleanValue();
        if (!javascriptEnabled) {
            return;
        }
        this.numberOfLinksExtracted += ExtractorJS.considerStrings(this.uriErrors, curi, cs, false);
    }

    /**
     * Handles an attribute value that is a navigation link. A value matching
     * the JAVASCRIPT pattern is not added as a link; the text after its
     * "javascript:" prefix is treated as script code instead.
     *
     * @param curi    URI being processed
     * @param value   attribute value
     * @param context element/attribute context the value was found in
     */
    protected void processLink(ProcessorURI curi, CharSequence value, CharSequence context) {
        if (TextUtils.matches((String) JAVASCRIPT, (CharSequence) value)) {
            // Skip the 11-character "javascript:" scheme prefix.
            final int schemeLength = "javascript:".length();
            final CharSequence code = value.subSequence(schemeLength, value.length());
            this.processScriptCode(curi, code);
            return;
        }

        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("link: " + value.toString() + " from " + curi);
        }
        final String target = value instanceof String ? (String) value : value.toString();
        this.addLinkFromString(curi, target, context, Hop.NAVLINK);
        this.numberOfLinksExtracted++;
    }

    /**
     * Adds one link, resolved relative to the URI's base, with the given
     * context and hop type. A URIException is reported through
     * logUriError and is not rethrown.
     *
     * @param curi    URI being processed
     * @param uri     link text to add
     * @param context context string wrapped in an HTMLLinkContext
     * @param hop     hop type of the link
     */
    private void addLinkFromString(ProcessorURI curi, String uri, CharSequence context, Hop hop) {
        try {
            final HTMLLinkContext linkContext = new HTMLLinkContext(context.toString());
            final int outlinkLimit = this.uriErrors.getMaxOutlinks(curi);
            Link.addRelativeToBase(curi, outlinkLimit, uri, linkContext, hop);
        } catch (URIException badUri) {
            this.logUriError(badUri, curi, uri);
        }
    }

    /**
     * Adds an embedded resource with hop type Hop.EMBED.
     *
     * @param curi    URI being processed
     * @param value   attribute value holding the resource location
     * @param context element/attribute context the value was found in
     */
    protected final void processEmbed(ProcessorURI curi, CharSequence value, CharSequence context) {
        processEmbed(curi, value, context, Hop.EMBED);
    }

    /**
     * Adds an embedded resource with an explicit hop type and counts it as
     * an extracted link.
     *
     * @param curi    URI being processed
     * @param value   attribute value holding the resource location
     * @param context element/attribute context the value was found in
     * @param hop     hop type to record for the link
     */
    protected void processEmbed(ProcessorURI curi, CharSequence value, CharSequence context, Hop hop) {
        if (logger.isLoggable(Level.FINEST)) {
            final String message = "embed (" + hop.getHopChar() + "): " + value.toString() + " from " + curi;
            logger.finest(message);
        }
        final String target;
        if (value instanceof String) {
            target = (String) value;
        } else {
            target = value.toString();
        }
        this.addLinkFromString(curi, target, context, hop);
        this.numberOfLinksExtracted++;
    }

    /**
     * Decides whether this extractor should run on the given URI.
     *
     * When IGNORE_UNEXPECTED_HTML is set, a URI whose path extension
     * suggests non-HTML content is rejected first. Otherwise the URI is
     * accepted when its content type starts with "text/html" or
     * "application/xhtml", compared case-insensitively.
     *
     * The prefix comparison uses String.regionMatches with ignoreCase, which
     * does not depend on the default locale. Lower-casing with the default
     * locale would turn "APPLICATION/XHTML" into a string with dotless i
     * under a Turkish locale, and the prefix test would then fail.
     *
     * @param uri URI being considered
     * @return true if the content should be parsed as HTML
     */
    protected boolean shouldExtract(ProcessorURI uri) {
        if (((Boolean) uri.get(this, IGNORE_UNEXPECTED_HTML)).booleanValue()) {
            try {
                if (!this.isHtmlExpectedHere(uri)) {
                    return false;
                }
            } catch (URIException e) {
                // Best effort: if the path cannot be examined, fall through to the
                // content-type test. Log with the exception so the cause is kept.
                logger.log(Level.SEVERE, "Failed expectedHTML test: " + e.getMessage(), e);
            }
        }

        final String mime = uri.getContentType();
        if (mime == null) {
            // No content type to inspect; treat as not HTML rather than failing with an NPE.
            return false;
        }
        final String html = "text/html";
        final String xhtml = "application/xhtml";
        return mime.regionMatches(true, 0, html, 0, html.length())
                || mime.regionMatches(true, 0, xhtml, 0, xhtml.length());
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Obtains the replayable character content recorded for the URI, runs
     * extraction over it and closes it afterwards.
     *
     * @param curi URI being processed
     * @return true when extraction ran; false when no character sequence
     *         could be obtained (the IOException, if any, is recorded on the
     *         URI as a non-fatal failure and logged)
     */
    public boolean innerExtract(ProcessorURI curi) {
        this.numberOfCURIsHandled++;

        ReplayCharSequence replay;
        try {
            replay = curi.getRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            logger.log(Level.SEVERE, "Failed get of replay char sequence in " + Thread.currentThread().getName(), e);
            return false;
        }
        if (replay == null) {
            return false;
        }

        final CharSequence content = replay;
        try {
            this.extract(curi, content);
            return true;
        } finally {
            // Always release the sequence; a failure to close is only logged.
            try {
                replay.close();
            } catch (IOException closeFailure) {
                logger.warning(TextUtils.exceptionToString((String) "Failed close of ReplayCharSequence.", (Throwable) closeFailure));
            }
        }
    }

    /**
     * Scans the document for relevant tags and dispatches each match by the
     * capturing group that fired: comments are skipped, meta tags go to
     * processMeta, other tags with attributes to processGeneralTag, script
     * elements to processScript and style elements to processStyle.
     *
     * Scanning stops early when processMeta returns true, or when the
     * current thread is found interrupted. The interruption test is
     * Thread.interrupted(), which also clears the thread's interrupt flag.
     *
     * @param curi URI being processed
     * @param cs   document text
     */
    void extract(ProcessorURI curi, CharSequence cs) {
        final Matcher tagMatcher = this.relevantTagExtractor.matcher(cs);
        while (tagMatcher.find() && !Thread.interrupted()) {
            if (tagMatcher.start(8) > 0) {
                // HTML comment: nothing to extract.
            } else if (tagMatcher.start(7) > 0) {
                // <meta ...>: group 5 holds the element name plus attributes.
                final int from = tagMatcher.start(5);
                final int to = tagMatcher.end(5);
                assert (from >= 0) : "Start is: " + from + ", " + curi;
                assert (to >= 0) : "End is :" + to + ", " + curi;
                if (this.processMeta(curi, cs.subSequence(from, to))) {
                    // The meta tag asked for extraction to stop.
                    break;
                }
            } else if (tagMatcher.start(5) > 0) {
                // Any other tag with attributes: group 6 is the element name.
                final int tagFrom = tagMatcher.start(5);
                final int tagTo = tagMatcher.end(5);
                assert (tagFrom >= 0) : "Start is: " + tagFrom + ", " + curi;
                assert (tagTo >= 0) : "End is :" + tagTo + ", " + curi;
                final int nameFrom = tagMatcher.start(6);
                final int nameTo = tagMatcher.end(6);
                assert (nameFrom >= 0) : "Start is: " + nameFrom + ", " + curi;
                assert (nameTo >= 0) : "End is :" + nameTo + ", " + curi;
                final CharSequence elementName = cs.subSequence(nameFrom, nameTo);
                final CharSequence tagText = cs.subSequence(tagFrom, tagTo);
                this.processGeneralTag(curi, elementName, tagText);
            } else if (tagMatcher.start(1) > 0) {
                // <script>: group 1 is the whole element, group 2 its open tag.
                final int from = tagMatcher.start(1);
                final int to = tagMatcher.end(1);
                assert (from >= 0) : "Start is: " + from + ", " + curi;
                assert (to >= 0) : "End is :" + to + ", " + curi;
                assert (tagMatcher.end(2) >= 0) : "Tags.end(2) illegal " + tagMatcher.end(2) + ", " + curi;
                this.processScript(curi, cs.subSequence(from, to), tagMatcher.end(2) - from);
            } else if (tagMatcher.start(3) > 0) {
                // <style>: group 3 is the whole element, group 4 its open tag.
                final int from = tagMatcher.start(3);
                final int to = tagMatcher.end(3);
                assert (from >= 0) : "Start is: " + from + ", " + curi;
                assert (to >= 0) : "End is :" + to + ", " + curi;
                assert (tagMatcher.end(4) >= 0) : "Tags.end(4) illegal " + tagMatcher.end(4) + ", " + curi;
                this.processStyle(curi, cs.subSequence(from, to), tagMatcher.end(4) - from);
            }
        }
        TextUtils.recycleMatcher((Matcher) tagMatcher);
    }

    /**
     * Judges from the URI path alone whether HTML is a plausible response.
     * Only a short trailing extension (at most four characters after the
     * last dot) is examined; HTML is considered unexpected when that
     * extension matches NON_HTML_PATH_EXTENSION.
     *
     * @param curi URI whose path is examined
     * @return false only when the path ends in a listed non-HTML extension
     * @throws URIException if the path cannot be obtained from the URI
     */
    protected boolean isHtmlExpectedHere(ProcessorURI curi) throws URIException {
        final String path = curi.getUURI().getPath();
        if (path == null) {
            return true;
        }

        final int lastDot = path.lastIndexOf('.');
        final boolean noExtension = lastDot < 0;
        // More than four characters after the dot: not treated as an extension.
        final boolean extensionTooLong = lastDot < path.length() - 5;
        if (noExtension || extensionTooLong) {
            return true;
        }

        final String extension = path.substring(lastDot + 1);
        final boolean knownNonHtml = TextUtils.matches((String) NON_HTML_PATH_EXTENSION, (CharSequence) extension);
        return !knownNonHtml;
    }

    /**
     * Processes a script element: the open tag's attributes are handled like
     * those of any other tag, then the text after the open tag is treated as
     * script code.
     *
     * @param curi         URI being processed
     * @param sequence     the script element, starting at its element name
     * @param endOfOpenTag offset within sequence where the open tag ends
     */
    protected void processScript(ProcessorURI curi, CharSequence sequence, int endOfOpenTag) {
        // The first six characters are the element name ("script").
        final CharSequence elementName = sequence.subSequence(0, 6);
        final CharSequence openTag = sequence.subSequence(0, endOfOpenTag);
        this.processGeneralTag(curi, elementName, openTag);

        final CharSequence scriptBody = sequence.subSequence(endOfOpenTag, sequence.length());
        this.processScriptCode(curi, scriptBody);
    }

    /**
     * Processes a meta tag. Two cases are acted on:
     * a robots meta tag, whose content is stored on the URI under
     * A_META_ROBOTS and may ask for extraction to stop, and a refresh meta
     * tag, whose target (the text after the first '=') is added as a link
     * with hop type Hop.REFER. The robots case takes precedence.
     *
     * @param curi URI being processed
     * @param cs   text of the meta tag
     * @return true when a robots meta tag containing "nofollow" or "none" is
     *         present and the honoring policy is neither IGNORE nor CUSTOM,
     *         meaning the caller should stop extracting; false otherwise
     */
    protected boolean processMeta(ProcessorURI curi, CharSequence cs) {
        final Matcher attr = this.eachAttributeExtractor.matcher(cs);
        String name = null;
        String httpEquiv = null;
        String content = null;
        while (attr.find()) {
            // Value lives in group 14 (double-quoted), 15 (single-quoted) or 16 (unquoted).
            final int valueGroup;
            if (attr.start(14) > -1) {
                valueGroup = 14;
            } else if (attr.start(15) > -1) {
                valueGroup = 15;
            } else {
                valueGroup = 16;
            }
            final CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
            final String attrName = attr.group(1);
            if (attrName.equalsIgnoreCase("name")) {
                name = value.toString();
            } else if (attrName.equalsIgnoreCase("http-equiv")) {
                httpEquiv = value.toString();
            } else if (attrName.equalsIgnoreCase("content")) {
                content = value.toString();
            }
        }
        TextUtils.recycleMatcher((Matcher) attr);

        if ("robots".equalsIgnoreCase(name) && content != null) {
            curi.getData().put(A_META_ROBOTS, content);
            final RobotsHonoringPolicy policy = this.honoringPolicy;
            final String lowered = content.toLowerCase();
            final boolean directivesHonored = policy == null
                    || (!policy.isType(curi, RobotsHonoringPolicy.Type.IGNORE)
                        && !policy.isType(curi, RobotsHonoringPolicy.Type.CUSTOM));
            if (directivesHonored && (lowered.contains("nofollow") || lowered.contains("none"))) {
                logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString());
                return true;
            }
            return false;
        }

        if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
            final int equalsAt = content.indexOf("=");
            if (equalsAt >= 0) {
                final String refreshUri = content.substring(equalsAt + 1);
                try {
                    final int outlinkLimit = this.uriErrors.getMaxOutlinks(curi);
                    Link.addRelativeToBase(curi, outlinkLimit, refreshUri, HTMLLinkContext.META, Hop.REFER);
                } catch (URIException badUri) {
                    this.logUriError(badUri, curi, refreshUri);
                }
            }
        }
        return false;
    }

    /**
     * Processes a style element: the open tag's attributes are handled like
     * those of any other tag, then the text after the open tag is handed to
     * the CSS extractor and the count it returns is added to the
     * extracted-links total.
     *
     * @param curi         URI being processed
     * @param sequence     the style element, starting at its element name
     * @param endOfOpenTag offset within sequence where the open tag ends
     */
    protected void processStyle(ProcessorURI curi, CharSequence sequence, int endOfOpenTag) {
        // "style" is five characters long. Taking six (as the script handler
        // does for "script") would append the following '>' or whitespace
        // character to the element name passed on for context building.
        final CharSequence elementName = sequence.subSequence(0, 5);
        final CharSequence openTag = sequence.subSequence(0, endOfOpenTag);
        this.processGeneralTag(curi, elementName, openTag);

        final CharSequence styleBody = sequence.subSequence(endOfOpenTag, sequence.length());
        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(this.uriErrors, curi, styleBody);
    }

    /**
     * Builds a short plain-text summary of this processor: a fixed header
     * followed by the number of URIs handled and links extracted so far.
     *
     * @return the multi-line report text
     */
    public String report() {
        final StringBuilder text = new StringBuilder();
        text.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n")
            .append("  Function:          Link extraction on HTML documents\n")
            .append("  ProcessorURIs handled: ").append(this.numberOfCURIsHandled).append("\n")
            .append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return text.toString();
    }

    /**
     * Builds the context string for an attribute of an element, in the form
     * {@code element/@attribute}.
     *
     * @param element   element name
     * @param attribute attribute name, or null
     * @return the empty string when attribute is null, otherwise
     *         element, "/@" and attribute joined together
     */
    public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
        if (attribute == null) {
            return "";
        }
        final StringBuilder context = new StringBuilder();
        context.append(String.valueOf(element));
        context.append("/@");
        context.append(String.valueOf(attribute));
        return context.toString();
    }

    // Runs once when the class is loaded and passes this class to
    // KeyManager.addKeys. Presumably this registers the Key constants
    // declared in this class; confirm against KeyManager.
    static {
        KeyManager.addKeys(ExtractorHTML.class);
    }
}

