/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.io.SinkHandlerLogThread;
import org.archive.modules.ProcessorURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.extractor.LinkContext;
import org.archive.modules.extractor.PDFParser;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.Initializable;
import org.archive.state.Key;
import org.archive.state.KeyManager;

/**
 * Content extractor that pulls outbound links from PDF documents.
 *
 * <p>The recorded content body of a URI is copied into a temporary file, that
 * file is handed to {@link PDFParser}, and every URI string the parser returns
 * is converted to a {@link UURI} and added to the URI's out-links.
 */
public class ExtractorPDF
extends ContentExtractor
implements Initializable {
    private static final long serialVersionUID = 3L;
    private static final Logger LOGGER = Logger.getLogger(ExtractorPDF.class.getName());

    /**
     * Upper bound on the recorded input size (as reported by
     * {@code getRecordedInput().getSize()}, presumably bytes) that this
     * extractor is willing to parse. Defaults to 5 * 1024 * 1024 (0x500000).
     */
    public static final Key<Long> MAX_SIZE_TO_PARSE = Key.make(5L * 1024L * 1024L);

    /** Running total of URI strings returned by the PDF parser across all calls. */
    private final AtomicLong numberOfLinksExtracted = new AtomicLong(0L);

    /**
     * Decides whether the given URI should be run through the PDF parser.
     *
     * @param uri the URI under consideration
     * @return {@code false} if the recorded input is larger than the
     *         {@link #MAX_SIZE_TO_PARSE} value configured for this URI;
     *         otherwise {@code true} only when the content type is non-null
     *         and starts with {@code "application/pdf"}
     */
    protected boolean shouldExtract(ProcessorURI uri) {
        long max = (Long)uri.get(this, MAX_SIZE_TO_PARSE);
        if (uri.getRecorder().getRecordedInput().getSize() > max) {
            return false;
        }
        String ct = uri.getContentType();
        return ct != null && ct.startsWith("application/pdf");
    }

    /**
     * Extracts links from the recorded PDF content of {@code curi}.
     *
     * <p>NOTE(review): the decompiler flagged the try/catch/finally
     * reconstruction of this method as a possible behaviour change; confirm
     * against the original sources if exact exception flow matters.
     *
     * @param curi the URI whose recorded content should be parsed
     * @return {@code false} if copying or parsing the content threw an
     *         {@link IOException} or {@link RuntimeException} (the exception is
     *         added to the URI's non-fatal failures); {@code true} otherwise,
     *         including when the parser returned {@code null}
     * @throws RuntimeException wrapping the {@link IOException} if the
     *         temporary file cannot be created
     */
    protected boolean innerExtract(ProcessorURI curi) {
        File tempFile = createPdfTempFile(currentThreadSerial());
        ArrayList<String> uris;
        try {
            curi.getRecorder().getRecordedInput().copyContentBodyTo(tempFile);
            PDFParser parser = new PDFParser(tempFile.getAbsolutePath());
            uris = parser.extractURIs();
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            return false;
        } catch (RuntimeException e) {
            curi.getNonFatalFailures().add(e);
            return false;
        } finally {
            // Runs on every path, so the temp file never outlives this call
            // unless the delete itself fails (which is now reported).
            deletePdfTempFile(tempFile);
        }
        if (uris == null) {
            return true;
        }
        addOutLinks(curi, uris);
        // The counter reflects every URI string the parser returned, including
        // ones that could not be turned into a UURI above.
        this.numberOfLinksExtracted.addAndGet(uris.size());
        LOGGER.fine(curi + " has " + uris.size() + " links.");
        return true;
    }

    /** Returns a number identifying the current thread, used in the temp-file name. */
    private static int currentThreadSerial() {
        Thread thread = Thread.currentThread();
        if (thread instanceof SinkHandlerLogThread) {
            return ((SinkHandlerLogThread)thread).getSerialNumber();
        }
        return System.identityHashCode(thread);
    }

    /** Creates the temporary file the PDF body is copied into; failures surface as RuntimeException. */
    private static File createPdfTempFile(int sn) {
        try {
            return File.createTempFile("tt" + sn, "tmp.pdf");
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /** Deletes the temporary file and logs a warning if it is still present afterwards. */
    private static void deletePdfTempFile(File tempFile) {
        if (!tempFile.delete() && tempFile.exists()) {
            LOGGER.warning("Unable to delete temporary PDF file " + tempFile.getAbsolutePath());
        }
    }

    /** Adds one NAVLINK out-link per parsable URI string; unparsable strings are reported via logUriError. */
    private void addOutLinks(ProcessorURI curi, ArrayList<String> uris) {
        for (String uri : uris) {
            try {
                UURI src = curi.getUURI();
                UURI dest = UURIFactory.getInstance(uri);
                LinkContext lc = LinkContext.NAVLINK_MISC;
                Hop hop = Hop.NAVLINK;
                Link out = new Link((CharSequence)src, (CharSequence)dest, lc, hop);
                curi.getOutLinks().add(out);
            } catch (URIException e1) {
                this.logUriError(e1, curi, uri);
            }
        }
    }

    /**
     * Builds a short plain-text report for this processor.
     *
     * <p>NOTE(review): the "Processor:" line names
     * {@code org.archive.crawler.extractor.ExtractorPDF}, which differs from
     * this class's package; it is kept unchanged because the string is
     * runtime output.
     *
     * @return the report text, containing the handled-URI count and the
     *         extracted-link count
     */
    public String report() {
        // Method-local buffer: no synchronisation needed, so StringBuilder suffices.
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
        ret.append("  Function:          Link extraction on PDF documents\n");
        ret.append("  CrawlURIs handled: ").append(this.getURICount()).append("\n");
        ret.append("  Links extracted:   ").append(this.numberOfLinksExtracted).append("\n\n");
        return ret.toString();
    }

    static {
        KeyManager.addKeys(ExtractorPDF.class);
    }
}

