/*
 * Decompiled with CFR 0.152.
 */
package org.archive.extractor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.extractor.CharSequenceLinkExtractor;
import org.archive.extractor.RegexpCSSLinkExtractor;
import org.archive.extractor.RegexpJSLinkExtractor;
import org.archive.modules.extractor.HTMLLinkContext;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

/**
 * Link extractor that scans HTML character content with regular expressions.
 *
 * <p>Each call to {@link #findNextLink()} advances a matcher over the source
 * content one "relevant tag" at a time (script blocks, style blocks, meta tags,
 * general tags with attributes, and comments) and queues any discovered links
 * in {@link #next}.
 */
public class RegexpHTMLLinkExtractor
extends CharSequenceLinkExtractor {
    private static final Logger logger = Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());

    /** When true, a robots meta tag containing "nofollow" or "none" stops further extraction. */
    boolean honorRobots = true;
    // The two flags below are not consulted anywhere in this class.
    boolean extractInlineCss = true;
    boolean extractInlineJs = true;

    /** Links discovered so far that have not yet been handed out. */
    protected LinkedList<Link> next = new LinkedList<Link>();

    /** Matcher over the source content; created lazily by {@link #findNextLink()}. */
    protected Matcher tags;

    /*
     * Capturing groups of RELEVANT_TAG_EXTRACTOR:
     *   1: whole script element up to "</script"   2: script open tag (without '>')
     *   3: whole style element up to "</style"     4: style open tag (without '>')
     *   5: a general tag including its attributes  6: the element name of that tag
     *   7: present only when the element is "meta" 8: an HTML comment
     */
    static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";

    /*
     * Capturing groups of EACH_ATTRIBUTE_EXTRACTOR:
     *   1: attribute name (any)       2: href             3: action
     *   4: on* (event handlers)       5: src, lowsrc, background, cite, longdesc,
     *                                    usemap, profile, datasrc, for
     *   6: codebase                   7: classid, data    8: archive
     *   9: code                      10: value           11: any other attribute name
     *  12: double-quoted value       13: single-quoted value
     *  14: unquoted value
     */
    static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))|(codebase)|((?:classid)|(?:data))|(archive)|(code)|(value)|([-\\w]+))\\s*=\\s*(?:(?:\"(.*?)(?:\"|$))|(?:'(.*?)(?:'|$))|(\\S+))";

    /** Pattern a "value" attribute must fully match before it is treated as a link. */
    static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
    static final String ESCAPED_AMP = "&amp;";
    static final String AMP = "&";
    static final String WHITESPACE = "\\s";
    static final String CLASSEXT = ".class";
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";
    static final String JAVASCRIPT = "(?i)^javascript:.*";
    static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Advances over the source content until at least one link has been queued.
     *
     * <p>The loop also stops when the current thread is found to be interrupted;
     * note that {@code Thread.interrupted()} clears the interrupt status.
     *
     * @return {@code true} if {@link #next} is non-empty after processing a tag,
     *         {@code false} once the content is exhausted or the thread was interrupted
     */
    protected boolean findNextLink() {
        if (this.tags == null) {
            this.tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, this.sourceContent);
        }
        while (this.tags.find() && !Thread.interrupted()) {
            if (this.tags.start(8) > 0) {
                // HTML comment: nothing to extract.
            } else if (this.tags.start(7) > 0) {
                // <meta ...>: hand over the tag text including attributes.
                int start = this.tags.start(5);
                int end = this.tags.end(5);
                processMeta(this.sourceContent.subSequence(start, end));
            } else if (this.tags.start(5) > 0) {
                // Any other tag that carries attributes.
                int tagStart = this.tags.start(5);
                int tagEnd = this.tags.end(5);
                int nameStart = this.tags.start(6);
                int nameEnd = this.tags.end(6);
                processGeneralTag(
                        this.sourceContent.subSequence(nameStart, nameEnd),
                        this.sourceContent.subSequence(tagStart, tagEnd));
            } else if (this.tags.start(1) > 0) {
                // <script ...>...</script
                int start = this.tags.start(1);
                int end = this.tags.end(1);
                processScript(this.sourceContent.subSequence(start, end), this.tags.end(2) - start);
            } else if (this.tags.start(3) > 0) {
                // <style ...>...</style
                int start = this.tags.start(3);
                int end = this.tags.end(3);
                processStyle(this.sourceContent.subSequence(start, end), this.tags.end(4) - start);
            }
            if (!this.next.isEmpty()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts links from the attributes of a single tag.
     *
     * @param element the element name
     * @param cs the tag text (element name plus attributes) to scan
     * @return {@code true} if at least one link was queued in {@link #next}
     *         during this call
     */
    protected boolean processGeneralTag(CharSequence element, CharSequence cs) {
        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
        String codebase = null;
        ArrayList<String> resources = null;
        final int sizeBefore = this.next.size();
        final String elementName = element.toString();
        while (attr.find()) {
            // Exactly one of the three value groups (quoted, single-quoted, bare) matched.
            int valueGroup = attr.start(12) > -1 ? 12 : (attr.start(13) > -1 ? 13 : 14);
            CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
            if (attr.start(2) > -1) {
                // href
                HTMLLinkContext context = new HTMLLinkContext(element, (CharSequence)attr.group(2));
                if (elementName.equalsIgnoreCase(LINK)) {
                    // <link href=...> is treated as an embedded resource.
                    processEmbed(value, context);
                } else {
                    if (elementName.equalsIgnoreCase(BASE)) {
                        // <base href=...> replaces the base for subsequent links.
                        try {
                            this.base = UURIFactory.getInstance(value.toString());
                        }
                        catch (URIException e) {
                            this.extractErrorListener.noteExtractError(e, this.source, value);
                        }
                    }
                    processLink(value, context);
                }
            } else if (attr.start(3) > -1) {
                // action
                processLink(value, new HTMLLinkContext(element, (CharSequence)attr.group(3)));
            } else if (attr.start(4) > -1) {
                // on* event handler: value is script code
                processScriptCode(value);
            } else if (attr.start(5) > -1) {
                // src and similar attributes
                processEmbed(value, new HTMLLinkContext(element, (CharSequence)attr.group(5)));
            } else if (attr.start(6) > -1) {
                // codebase: remembered so later resources resolve against it
                codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
                processEmbed(codebase, new HTMLLinkContext(element, (CharSequence)attr.group(6)));
            } else if (attr.start(7) > -1) {
                // classid / data
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                resources.add(value.toString());
            } else if (attr.start(8) > -1) {
                // archive: whitespace-separated list of resources
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                String[] multi = TextUtils.split(WHITESPACE, value);
                for (int i = 0; i < multi.length; ++i) {
                    resources.add(multi[i]);
                }
            } else if (attr.start(9) > -1) {
                // code: for applets, append ".class" when it is missing
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                String code = value.toString();
                if (elementName.toLowerCase().equals(APPLET) && !code.toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(code + CLASSEXT);
                } else {
                    resources.add(code);
                }
            } else if (attr.start(10) > -1) {
                // value: only followed when it looks like a URI path
                if (TextUtils.matches(LIKELY_URI_PATH, value)) {
                    processLink(value, new HTMLLinkContext(element, (CharSequence)attr.group(10)));
                }
            }
            // Group 11 (any other attribute) is deliberately ignored.
        }
        TextUtils.recycleMatcher(attr);

        if (resources != null) {
            UURI codebaseURI = null;
            String res = null;
            try {
                if (codebase != null) {
                    codebaseURI = UURIFactory.getInstance((UURI)this.base, codebase);
                }
                for (String resource : resources) {
                    res = TextUtils.replaceAll(ESCAPED_AMP, resource, AMP);
                    if (codebaseURI != null) {
                        res = codebaseURI.resolve(res).toString();
                    }
                    processEmbed(res, new HTMLLinkContext(elementName));
                }
            }
            catch (URIException e) {
                this.extractErrorListener.noteExtractError(e, this.source, codebase);
            }
            catch (IllegalArgumentException e) {
                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\ncodebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e);
            }
        }
        // The queue only ever grows here, so "grew" means "found something".
        // (The previous expression subtracted in the wrong order and was always false.)
        return this.next.size() > sizeBefore;
    }

    /**
     * Runs the JavaScript link extractor over a piece of script code, queuing
     * its results in {@link #next}.
     *
     * @param cs the script code
     */
    protected void processScriptCode(CharSequence cs) {
        RegexpJSLinkExtractor.extract(cs, this.source, this.base, this.next, this.extractErrorListener);
    }

    /**
     * Handles a navigational link value. Values matching {@link #JAVASCRIPT}
     * are passed to the script extractor; everything else is queued as a
     * {@code Hop.NAVLINK} link.
     *
     * @param value the raw attribute value
     * @param context the context in which the value was found
     */
    protected void processLink(CharSequence value, LinkContext context) {
        String link = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
        if (TextUtils.matches(JAVASCRIPT, link)) {
            // Skip the 11-character "javascript:" prefix; the raw value starts with
            // it too because unescaping "&amp;" cannot alter that prefix.
            processScriptCode(value.subSequence("javascript:".length(), value.length()));
        } else {
            addLinkFromString(link, context, Hop.NAVLINK);
        }
    }

    /**
     * Resolves {@code uri} against the current base and queues the resulting
     * link; URI errors are reported to the extract error listener.
     */
    private void addLinkFromString(String uri, LinkContext context, Hop hop) {
        try {
            Link link = new Link((CharSequence)this.source, (CharSequence)UURIFactory.getInstance((UURI)this.base, uri), context, hop);
            this.next.addLast(link);
        }
        catch (URIException e) {
            this.extractErrorListener.noteExtractError(e, this.source, uri);
        }
    }

    /**
     * Handles an embedded-resource value by queuing it as a {@code Hop.EMBED} link.
     *
     * @param value the raw attribute value
     * @param context the context in which the value was found
     * @return always {@code 1}
     */
    protected long processEmbed(CharSequence value, LinkContext context) {
        String embed = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);
        addLinkFromString(embed, context, Hop.EMBED);
        return 1L;
    }

    /**
     * Handles a script element: first the attributes of its open tag, then its body.
     *
     * @param sequence the element text, starting at the element name
     * @param endOfOpenTag offset within {@code sequence} where the open tag ends
     */
    protected void processScript(CharSequence sequence, int endOfOpenTag) {
        // "script" is 6 characters long.
        processGeneralTag(sequence.subSequence(0, 6), sequence.subSequence(0, endOfOpenTag));
        processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length()));
    }

    /**
     * Handles a meta tag. A robots directive containing "nofollow" or "none"
     * cancels further extraction (when robots are honored); an http-equiv
     * refresh that names a target queues a {@code Hop.REFER} link.
     *
     * @param cs the tag text (element name plus attributes)
     */
    protected void processMeta(CharSequence cs) {
        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
        String name = null;
        String httpEquiv = null;
        String content = null;
        while (attr.find()) {
            int valueGroup = attr.start(12) > -1 ? 12 : (attr.start(13) > -1 ? 13 : 14);
            String value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup)).toString();
            String attrName = attr.group(1);
            if (attrName.equalsIgnoreCase("name")) {
                name = value;
            } else if (attrName.equalsIgnoreCase("http-equiv")) {
                httpEquiv = value;
            } else if (attrName.equalsIgnoreCase("content")) {
                content = value;
            }
        }
        TextUtils.recycleMatcher(attr);

        if ("robots".equalsIgnoreCase(name) && content != null) {
            if (getHonorRobots()) {
                String contentLower = content.toLowerCase();
                if (contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0) {
                    logger.fine("HTML extraction skipped due to robots meta-tag for: " + this.source);
                    cancelFurtherExtraction();
                }
            }
        } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
            // The target follows the first '='. A refresh with only a delay
            // (e.g. content="30") names no target, so no link is queued.
            int uriStart = content.indexOf("=") + 1;
            if (uriStart > 0) {
                String refreshUri = content.substring(uriStart);
                try {
                    Link refreshLink = new Link((CharSequence)this.source, (CharSequence)UURIFactory.getInstance((UURI)this.base, refreshUri), (LinkContext)new HTMLLinkContext((CharSequence)"meta", (CharSequence)httpEquiv), Hop.REFER);
                    this.next.addLast(refreshLink);
                }
                catch (URIException e) {
                    this.extractErrorListener.noteExtractError(e, this.source, refreshUri);
                }
            }
        }
    }

    private boolean getHonorRobots() {
        return this.honorRobots;
    }

    /** Points the tag matcher at empty input so the next {@code find()} fails. */
    private void cancelFurtherExtraction() {
        this.tags.reset("");
    }

    /**
     * Handles a style element: first the attributes of its open tag, then its
     * body via the CSS link extractor.
     *
     * @param sequence the element text, starting at the element name
     * @param endOfOpenTag offset within {@code sequence} where the open tag ends
     */
    protected void processStyle(CharSequence sequence, int endOfOpenTag) {
        // "style" is 5 characters long; taking 6 would drag the following
        // character ('>' or whitespace) into the element name.
        processGeneralTag(sequence.subSequence(0, 5), sequence.subSequence(0, endOfOpenTag));
        RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag, sequence.length()), this.source, this.base, this.next, this.extractErrorListener);
    }

    /**
     * Resets the superclass state and releases the tag matcher, if one was created.
     */
    @Override
    public void reset() {
        super.reset();
        // The matcher is created lazily, so it may still be null here.
        if (this.tags != null) {
            TextUtils.recycleMatcher(this.tags);
            this.tags = null;
        }
    }

    /**
     * @return a new extractor instance with default settings
     */
    protected static CharSequenceLinkExtractor newDefaultInstance() {
        return new RegexpHTMLLinkExtractor();
    }
}

