/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.modules.ProcessorURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.KeyManager;
import org.archive.util.ms.Doc;

public class ExtractorDOC
extends ContentExtractor {
    private static final long serialVersionUID = 3L;
    private static Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");
    private static Logger logger = Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");
    private long numberOfCURIsHandled = 0L;
    private long numberOfLinksExtracted = 0L;

    protected boolean shouldExtract(ProcessorURI uri) {
        String mimeType = uri.getContentType();
        if (mimeType == null) {
            return false;
        }
        return mimeType.toLowerCase().startsWith("application/msword");
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    protected boolean innerExtract(ProcessorURI curi) {
        int links = 0;
        ReplayInputStream documentStream = null;
        SeekReader docReader = null;
        try {
            documentStream = curi.getRecorder().getRecordedInput().getContentReplayInputStream();
            if (documentStream == null) {
                boolean bl = false;
                return bl;
            }
            docReader = Doc.getText((SeekInputStream)documentStream);
        }
        catch (Exception e) {
            curi.getNonFatalFailures().add(e);
            boolean ignored = false;
            return ignored;
        }
        finally {
            try {
                documentStream.close();
            }
            catch (IOException ignored) {}
        }
        SeekReaderCharSequence cs = new SeekReaderCharSequence(docReader, 0);
        Matcher m = PATTERN.matcher((CharSequence)cs);
        while (m.find()) {
            ++links;
            this.addLink(curi, m.group(1));
        }
        logger.fine(curi + " has " + links + " links.");
        return true;
    }

    private void addLink(ProcessorURI curi, String hyperlink) {
        try {
            UURI dest = UURIFactory.getInstance((UURI)curi.getUURI(), (String)hyperlink);
            LinkContext lc = LinkContext.NAVLINK_MISC;
            Link link = new Link((CharSequence)curi.getUURI(), (CharSequence)dest, lc, Hop.NAVLINK);
            curi.getOutLinks().add(link);
        }
        catch (URIException e1) {
            this.logUriError(e1, curi, hyperlink);
        }
        ++this.numberOfLinksExtracted;
    }

    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorDOC\n");
        ret.append("  Function:          Link extraction on MS Word documents (.doc)\n");
        ret.append("  ProcessorURIs handled: " + this.numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }

    static {
        KeyManager.addKeys(ExtractorDOC.class);
    }
}

