/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.ProcessorURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.extractor.LinkContext;
import org.archive.modules.extractor.UriErrorLoggerModule;
import org.archive.state.KeyManager;
import org.archive.util.TextUtils;

/**
 * Speculative link extractor for XML-like content (XML documents, RSS feeds).
 *
 * <p>The content is not parsed as XML. Instead, a regular expression looks for
 * absolute {@code http:} / {@code https:} URIs that are delimited like attribute
 * values or element text, and each hit is added to the processed URI as a
 * speculative outlink.
 */
public class ExtractorXML
extends ContentExtractor {
    private static final long serialVersionUID = 3L;
    private static final Logger logger = Logger.getLogger(ExtractorXML.class.getName());

    /**
     * Regex for the XML-escaped ampersand. The trailing semicolon is optional so
     * that the entity {@code &amp;} is consumed completely (no stray {@code ;}
     * left in the URI) while a bare {@code &amp} is still unescaped as well.
     */
    private static final String ESCAPED_AMP = "&amp;?";

    /**
     * Matches an {@code http:} or {@code https:} URI (group 1, case-insensitive)
     * that is preceded by a quote or {@code >} and followed by a quote or
     * {@code <}, with optional whitespace between the URI and either delimiter.
     */
    static final Pattern XML_URI_EXTRACTOR = Pattern.compile("(?i)[\"'>]\\s*(https?:[^\\s\"'<>]+)\\s*[\"'<]");

    /** Running total of URI candidates found by this instance. */
    private AtomicLong linksExtracted = new AtomicLong(0L);

    /**
     * Decides whether this extractor should run on the given URI.
     *
     * @param curi the URI being processed
     * @return {@code false} when no content type is available; otherwise
     *         {@code true} if the content type contains "xml" or the URI string
     *         ends with ".rss" or ".xml" (all comparisons case-insensitive)
     */
    protected boolean shouldExtract(ProcessorURI curi) {
        String mimeType = curi.getContentType();
        if (mimeType == null) {
            return false;
        }
        if (mimeType.toLowerCase().contains("xml")) {
            return true;
        }
        String uri = curi.toString().toLowerCase();
        return uri.endsWith(".rss") || uri.endsWith(".xml");
    }

    /**
     * Runs the regex-based extraction over the recorded content of the URI and
     * adds the number of candidates found to the running total.
     *
     * @param curi the URI whose recorded content is scanned
     * @return {@code true} if the content was scanned; {@code false} if the
     *         replay character sequence could not be obtained
     */
    protected boolean innerExtract(ProcessorURI curi) {
        ReplayCharSequence cs;
        try {
            cs = curi.getRecorder().getReplayCharSequence();
        }
        catch (IOException e) {
            // Log once, with the full exception, and stop: there is nothing to scan.
            logger.severe(TextUtils.exceptionToString("Failed getting ReplayCharSequence: " + curi.toString(), e));
            return false;
        }
        if (cs == null) {
            logger.severe("Failed getting ReplayCharSequence: " + curi.toString());
            return false;
        }
        try {
            this.linksExtracted.addAndGet(ExtractorXML.processXml(this.uriErrors, curi, cs));
            return true;
        }
        finally {
            // Always release the sequence; a failed close is logged but not fatal.
            try {
                cs.close();
            }
            catch (IOException ioe) {
                logger.warning(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    /**
     * Scans a character sequence for URI candidates and adds each one to
     * {@code curi} as a speculative link.
     *
     * <p>Escaped ampersands in a candidate are unescaped before the link is
     * added. A candidate that is rejected with a {@link URIException} is reported
     * through {@code uriErrors} and still counted.
     *
     * @param uriErrors supplies the outlink limit and receives URI errors
     * @param curi      the URI that receives the discovered links
     * @param cs        the text to scan
     * @return the number of URI candidates found in {@code cs}, including any
     *         that could not be added
     */
    public static long processXml(UriErrorLoggerModule uriErrors, ProcessorURI curi, CharSequence cs) {
        long foundLinks = 0L;
        Matcher uris = XML_URI_EXTRACTOR.matcher(cs);
        while (uris.find()) {
            // Only the ampersand entity is unescaped; other entities pass through untouched.
            String xmlUri = TextUtils.replaceAll(ESCAPED_AMP, uris.group(1), "&");
            ++foundLinks;
            try {
                int max = uriErrors.getMaxOutlinks(curi);
                Link.add(curi, max, xmlUri, LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
            }
            catch (URIException e) {
                uriErrors.logUriError(e, curi.getUURI(), xmlUri);
            }
        }
        return foundLinks;
    }

    /**
     * Builds a short plain-text summary of this processor's activity.
     *
     * @return a report with the number of URIs handled and links extracted
     */
    public String report() {
        StringBuilder ret = new StringBuilder();
        // NOTE(review): the name printed here uses the "crawler" package rather than
        // this class's actual package; kept verbatim in case report consumers rely on it.
        ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
        ret.append("  Function:          Link extraction on XML/RSS\n");
        ret.append("  CrawlURIs handled: " + this.getURICount() + "\n");
        ret.append("  Links extracted:   " + this.linksExtracted + "\n\n");
        return ret.toString();
    }

    static {
        KeyManager.addKeys(ExtractorXML.class);
    }
}

