/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReplayInputStream;
import org.archive.modules.ProcessorURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.Link;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.Key;
import org.archive.state.KeyManager;

public class ExtractorUniversal
extends ContentExtractor {
    private static final long serialVersionUID = 3L;
    public static final Key<Long> MAX_DEPTH_BYTES = Key.make((long)10240L);
    public static final Key<Long> MAX_URL_LENGTH = Key.make((long)2083L);
    static final Pattern IP_ADDRESS = Pattern.compile("((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)");
    public static final Pattern TLDs = Pattern.compile("(ac(/.*)?)|(ad(/.*)?)|(ae(/.*)?)|(af(/.*)?)|(ag(/.*)?)|(ai(/.*)?)|(al(/.*)?)|(am(/.*)?)|(an(/.*)?)|(ao(/.*)?)|(aero(/.*)?)|(aq(/.*)?)|(ar(/.*)?)|(as(/.*)?)|(at(/.*)?)|(au(/.*)?)|(aw(/.*)?)|(az(/.*)?)|(ba(/.*)?)|(bb(/.*)?)|(bd(/.*)?)|(be(/.*)?)|(bf(/.*)?)|(bg(/.*)?)|(bh(/.*)?)|(bi(/.*)?)|(biz(/.*)?)|(bj(/.*)?)|(bm(/.*)?)|(bn(/.*)?)|(bo(/.*)?)|(br(/.*)?)|(bs(/.*)?)|(bt(/.*)?)|(bv(/.*)?)|(bw(/.*)?)|(by(/.*)?)|(bz(/.*)?)|(ca(/.*)?)|(cc(/.*)?)|(cd(/.*)?)|(cf(/.*)?)|(cg(/.*)?)|(ch(/.*)?)|(ci(/.*)?)|(ck(/.*)?)|(cl(/.*)?)|(cm(/.*)?)|(cn(/.*)?)|(co(/.*)?)|(com(/.*)?)|(coop(/.*)?)|(cr(/.*)?)|(cs(/.*)?)|(cu(/.*)?)|(cv(/.*)?)|(cx(/.*)?)|(cy(/.*)?)|(cz(/.*)?)|(de(/.*)?)|(dj(/.*)?)|(dk(/.*)?)|(dm(/.*)?)|(do(/.*)?)|(dz(/.*)?)|(ec(/.*)?)|(edu(/.*)?)|(ee(/.*)?)|(eg(/.*)?)|(eh(/.*)?)|(er(/.*)?)|(es(/.*)?)|(et(/.*)?)|(fi(/.*)?)|(fj(/.*)?)|(fk(/.*)?)|(fm(/.*)?)|(fo(/.*)?)|(fr(/.*)?)|(ga(/.*)?)|(gd(/.*)?)|(ge(/.*)?)|(gf(/.*)?)|(gg(/.*)?)|(gh(/.*)?)|(gi(/.*)?)|(gl(/.*)?)|(gm(/.*)?)|(gn(/.*)?)|(gov(/.*)?)|(gp(/.*)?)|(gq(/.*)?)|(gr(/.*)?)|(gs(/.*)?)|(gt(/.*)?)|(gu(/.*)?)|(gw(/.*)?)|(gy(/.*)?)|(hk(/.*)?)|(hm(/.*)?)|(hn(/.*)?)|(hr(/.*)?)|(ht(/.*)?)|(hu(/.*)?)|(id(/.*)?)|(ie(/.*)?)|(il(/.*)?)|(im(/.*)?)|(in(/.*)?)|(info(/.*)?)|(int(/.*)?)|(io(/.*)?)|(iq(/.*)?)|(ir(/.*)?)|(is(/.*)?)|(it(/.*)?)|(je(/.*)?)|(jm(/.*)?)|(jo(/.*)?)|(jp(/.*)?)|(ke(/.*)?)|(kg(/.*)?)|(kh(/.*)?)|(ki(/.*)?)|(km(/.*)?)|(kn(/.*)?)|(kp(/.*)?)|(kr(/.*)?)|(kw(/.*)?)|(ky(/.*)?)|(kz(/.*)?)|(la(/.*)?)|(lb(/.*)?)|(lc(/.*)?)|(li(/.*)?)|(lk(/.*)?)|(lr(/.*)?)|(ls(/.*)?)|(lt(/.*)?)|(lu(/.*)?)|(lv(/.*)?)|(ly(/.*)?)|(ma(/.*)?)|(mc(/.*)?)|(md(/.*)?)|(mg(/.*)?)|(mh(/.*)?)|(mil(/.*)?)|(mk(/.*)?)|(ml(/.*)?)|(mm(/.*)?)|(mn(/.*)?)|(mo(/.*)?)|(mp(/.*)?)|(mq(/.*)?)|(mr(/.*)?)|(ms(/.*)?)|(mt(/.*)?)|(mu(/.*)?)|(museum(/.*)?)|(mv(/.*)?)|(mw(/.*)?)|(mx(/.*)?)|(my(/.*)?)|(mz(/.*)?)|(na(/.*)?)|(name(/.*)?)|(nc(/.*)?)|(ne(/.*)?)|(net(/.*)?)|(nf(/.*)?)|(ng(/.*)?)|(ni(/.*)?)|(nl(/.*)?)|(no(/.*)?)|(np(/.*)?)|(nr(/.*)?)|(nt(/.*)?)|(nu(/.*)?)|(nz(/.*)?)|(om(/.*)?)|(org(/.*)?)|(pa(/.*)?)|(pe(/.*)?)|(pf(/.*)?)|(pg(/.*)?)|(ph(/.*)?)|(pk(/.*)?)|(pl(/.*)?)|(pm(/.*)?)|(pn(/.*)?)|(pr(/.*)?)|(pro(/.*)?)|(ps(/.*)?)|(pt(/.*)?)|(pw(/.*)?)|(py(/.*)?)|(qa(/.*)?)|(re(/.*)?)|(ro(/.*)?)|(ru(/.*)?)|(rw(/.*)?)|(sa(/.*)?)|(sb(/.*)?)|(sc(/.*)?)|(sd(/.*)?)|(se(/.*)?)|(sg(/.*)?)|(sh(/.*)?)|(si(/.*)?)|(sj(/.*)?)|(sk(/.*)?)|(sl(/.*)?)|(sm(/.*)?)|(sn(/.*)?)|(so(/.*)?)|(sr(/.*)?)|(sv(/.*)?)|(st(/.*)?)|(sy(/.*)?)|(sz(/.*)?)|(tc(/.*)?)|(td(/.*)?)|(tf(/.*)?)|(tg(/.*)?)|(th(/.*)?)|(tj(/.*)?)|(tk(/.*)?)|(tm(/.*)?)|(tn(/.*)?)|(to(/.*)?)|(tp(/.*)?)|(tr(/.*)?)|(tt(/.*)?)|(tv(/.*)?)|(tw(/.*)?)|(tz(/.*)?)|(ua(/.*)?)|(ug(/.*)?)|(uk(/.*)?)|(um(/.*)?)|(us(/.*)?)|(uy(/.*)?)|(uz(/.*)?)|(va(/.*)?)|(vc(/.*)?)|(ve(/.*)?)|(vg(/.*)?)|(vi(/.*)?)|(vn(/.*)?)|(vu(/.*)?)|(wf(/.*)?)|(ws(/.*)?)|(ye(/.*)?)|(yt(/.*)?)|(yu(/.*)?)|(za(/.*)?)|(zm(/.*)?)|(zw(/.*)?)");
    protected AtomicLong linksExtracted = new AtomicLong(0L);

    protected boolean shouldExtract(ProcessorURI uri) {
        return true;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    protected boolean innerExtract(ProcessorURI curi) {
        ReplayInputStream instream = null;
        try {
            instream = curi.getRecorder().getRecordedInput().getContentReplayInputStream();
            int ch = instream.read();
            StringBuffer lookat = new StringBuffer();
            long counter = 0L;
            long maxdepth = (Long)curi.get(this, MAX_DEPTH_BYTES);
            if (maxdepth <= 0L) {
                maxdepth = Long.MAX_VALUE;
            }
            long maxURLLength = (Long)curi.get(this, MAX_URL_LENGTH);
            boolean foundDot = false;
            while (ch != -1 && ++counter <= maxdepth) {
                if ((long)lookat.length() > maxURLLength) {
                    lookat = new StringBuffer();
                    foundDot = false;
                } else if (this.isURLableChar(ch)) {
                    if (ch == 46) {
                        foundDot = true;
                    }
                    lookat.append((char)ch);
                } else if (lookat.length() > 3 && foundDot) {
                    String newURL = lookat.toString();
                    if (this.looksLikeAnURL(newURL)) {
                        if (newURL.toLowerCase().indexOf("http") > 0) {
                            newURL = newURL.substring(newURL.toLowerCase().indexOf("http"));
                        }
                        while (newURL.substring(newURL.length() - 1).equals(".")) {
                            newURL = newURL.substring(0, newURL.length() - 1);
                        }
                        this.linksExtracted.incrementAndGet();
                        UURI src = curi.getUURI();
                        UURI dest = UURIFactory.getInstance((String)newURL);
                        LinkContext lc = LinkContext.SPECULATIVE_MISC;
                        Hop hop = Hop.SPECULATIVE;
                        Link link = new Link((CharSequence)src, (CharSequence)dest, lc, hop);
                        curi.getOutLinks().add(link);
                    }
                    lookat = new StringBuffer();
                    foundDot = false;
                } else if (lookat.length() > 0) {
                    lookat = new StringBuffer();
                    foundDot = false;
                }
                ch = instream.read();
            }
        }
        catch (IOException e) {
            try {
                curi.getNonFatalFailures().add(e);
            }
            catch (Throwable throwable) {
                IOUtils.closeQuietly(instream);
                throw throwable;
            }
            IOUtils.closeQuietly((InputStream)instream);
        }
        IOUtils.closeQuietly((InputStream)instream);
        return true;
    }

    private boolean looksLikeAnURL(String lookat) {
        Matcher ip;
        boolean testVal;
        if ((lookat.indexOf("http://") == 0 || lookat.indexOf("https://") == 0) && (testVal = (ip = IP_ADDRESS.matcher(lookat)).matches())) {
            return true;
        }
        int dot = lookat.indexOf(".");
        if (dot != 0) {
            while (dot != -1 && dot < lookat.length()) {
                if (this.isTLD(lookat.substring(0, (lookat = lookat.substring(dot + 1)).length() <= 6 ? lookat.length() : 6))) {
                    return true;
                }
                dot = lookat.indexOf(".");
            }
        }
        return false;
    }

    private boolean isTLD(String potentialTLD) {
        if (potentialTLD.length() < 2) {
            return false;
        }
        potentialTLD.toLowerCase();
        Matcher uri = TLDs.matcher(potentialTLD);
        boolean ret = uri.matches();
        return ret;
    }

    private boolean isURLableChar(int ch) {
        return ch >= 35 && ch <= 38 || ch >= 43 && ch <= 59 || ch == 61 || ch >= 63 && ch <= 90 || ch == 95 || ch >= 97 && ch <= 122 || ch == 126;
    }

    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorUniversal\n");
        ret.append("  Function:          Link extraction on unknown file types.\n");
        ret.append("  CrawlURIs handled: " + this.getURICount() + "\n");
        ret.append("  Links extracted:   " + this.linksExtracted + "\n\n");
        return ret.toString();
    }

    static {
        KeyManager.addKeys(ExtractorUniversal.class);
    }
}

