/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.fetcher;

import crawlercommons.robots.BaseRobotRules;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Fetcher
extends Configured
implements Tool,
MapRunnable<Text, CrawlDatum, Text, NutchWritable> {
    // Refresh-time threshold. The content-redirect branch of FetcherThread.run compares
    // the parsed refresh time against this value to decide the "temp" flag it passes to
    // handleRedirect. NOTE(review): presumably seconds - confirm.
    public static final int PERM_REFRESH_TIME = 5;
    // Redirect-type labels passed to handleRedirect; they only appear in log messages here.
    public static final String CONTENT_REDIR = "content";
    public static final String PROTOCOL_REDIR = "protocol";
    public static final Logger LOG = LoggerFactory.getLogger(Fetcher.class);
    // Sink and progress reporter handed to run(RecordReader, OutputCollector, Reporter).
    private OutputCollector<Text, NutchWritable> output;
    private Reporter reporter;
    // Value of "nutch.segment.name", read in configure().
    private String segmentName;
    // Number of FetcherThreads currently inside their run() method.
    private AtomicInteger activeThreads = new AtomicInteger(0);
    // Number of FetcherThreads currently sleeping because no fetch item was available.
    private AtomicInteger spinWaiting = new AtomicInteger(0);
    // Wall-clock time (ms) at which this instance was created; base for average rates.
    private long start = System.currentTimeMillis();
    // Time (ms) at which any fetcher thread last picked up an item; used for hang detection.
    private AtomicLong lastRequestStart = new AtomicLong(this.start);
    // Running totals maintained by updateStatus() and FetcherThread.logError().
    private AtomicLong bytes = new AtomicLong(0L);
    private AtomicInteger pages = new AtomicInteger(0);
    private AtomicInteger errors = new AtomicInteger(0);
    // Flags read from "fetcher.store.content" and "fetcher.parse" in configure().
    private boolean storingContent;
    private boolean parsing;
    // Shared work queues and the thread that fills them; both are created in run(...).
    FetchItemQueues fetchQueues;
    QueueFeeder feeder;

    /**
     * Creates a fetcher without a configuration; one must be supplied later
     * (e.g. via {@code setConf}, as {@link #configure(JobConf)} does).
     */
    public Fetcher() {
        super(null);
    }

    /**
     * Creates a fetcher bound to the given configuration.
     *
     * @param conf configuration handed to the {@code Configured} superclass
     */
    public Fetcher(Configuration conf) {
        super(conf);
    }

    /**
     * Records one more fetched page and adds its size to the running byte total.
     *
     * @param bytesInPage size of the fetched content in bytes
     * @throws IOException declared for callers; nothing in this body throws it
     */
    private void updateStatus(int bytesInPage) throws IOException {
        pages.incrementAndGet();
        bytes.addAndGet((long)bytesInPage);
    }

    /**
     * Builds a one-line progress summary and publishes it through the reporter.
     *
     * <p>The summary contains the active thread count, queue count, queued URL count,
     * page and error totals, and the average and last-interval page and byte rates.
     *
     * @param pagesLastSec pages fetched during the last reporting interval
     * @param bytesLastSec bytes fetched during the last reporting interval
     * @throws IOException declared for callers; nothing in this body throws it directly
     */
    private void reportStatus(int pagesLastSec, int bytesLastSec) throws IOException {
        // Clamp to one second so the averages are never computed by dividing by zero
        // (which would otherwise print Integer.MAX_VALUE-derived or zero garbage).
        long elapsed = Math.max(1L, (System.currentTimeMillis() - this.start) / 1000L);
        // Math.round(float) returns an int; dividing by 10.0f (not the int 10) keeps
        // the single decimal digit the "* 10 ... / 10" rounding idiom is meant to produce.
        float avgPagesSec = Math.round((float)this.pages.get() * 10.0f / (float)elapsed) / 10.0f;
        // bytes * 8 / 1000 = kilobits; rounded to a whole number.
        float avgBytesSec = Math.round((float)this.bytes.get() * 8.0f / 1000.0f / (float)elapsed);
        String status = this.activeThreads + " threads, " + this.fetchQueues.getQueueCount() + " queues, " + this.fetchQueues.getTotalSize() + " URLs queued, " + this.pages + " pages, " + this.errors + " errors, " + avgPagesSec + " (" + pagesLastSec + ") pages/s, " + avgBytesSec + " (" + bytesLastSec + ") kbits/s, ";
        this.reporter.setStatus(status);
    }

    /**
     * Adopts the job configuration and caches the settings this fetcher reads from it:
     * the segment name and the store-content / parse flags.
     *
     * @param job job configuration supplied by the framework
     */
    public void configure(JobConf job) {
        final Configuration jobConf = (Configuration)job;
        setConf(jobConf);
        segmentName = job.get("nutch.segment.name");
        storingContent = isStoringContent(jobConf);
        parsing = isParsing(jobConf);
    }

    /** No-op: this class holds nothing that needs releasing here. */
    public void close() {
    }

    /**
     * Reads the {@code fetcher.parse} flag.
     *
     * @param conf configuration to query
     * @return the configured value, or {@code true} when the property is not set
     */
    public static boolean isParsing(Configuration conf) {
        final boolean parseEnabled = conf.getBoolean("fetcher.parse", true);
        return parseEnabled;
    }

    /**
     * Reads the {@code fetcher.store.content} flag.
     *
     * @param conf configuration to query
     * @return the configured value, or {@code true} when the property is not set
     */
    public static boolean isStoringContent(Configuration conf) {
        final boolean storeEnabled = conf.getBoolean("fetcher.store.content", true);
        return storeEnabled;
    }

    /**
     * Map-runner entry point: starts the queue feeder and the fetcher threads, then
     * monitors them about once per second until no fetcher thread is active or the
     * hang timeout fires.
     *
     * <p>Each monitor pass reports status, updates the {@code FetcherStatus} counters,
     * applies the optional throughput threshold and the feeder time limit, and finally
     * checks whether any thread has started a request within the timeout window.
     *
     * @param input    reader supplying the (url, datum) records to fetch
     * @param output   collector that fetcher threads write their results to
     * @param reporter progress/counter sink
     * @throws IOException propagated from status reporting
     */
    public void run(RecordReader<Text, CrawlDatum> input, OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
        this.output = output;
        this.reporter = reporter;
        this.fetchQueues = new FetchItemQueues(this.getConf());
        int threadCount = this.getConf().getInt("fetcher.threads.fetch", 10);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: threads: " + threadCount);
        }
        int timeoutDivisor = this.getConf().getInt("fetcher.threads.timeout.divisor", 2);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: time-out divisor: " + timeoutDivisor);
        }
        // The feeder is given threadCount * multiplier as its size argument.
        int queueDepthMuliplier = this.getConf().getInt("fetcher.queue.depth.multiplier", 50);
        this.feeder = new QueueFeeder(input, this.fetchQueues, threadCount * queueDepthMuliplier);
        long timelimit = this.getConf().getLong("fetcher.timelimit", -1L);
        if (timelimit != -1L) {
            this.feeder.setTimeLimit(timelimit);
        }
        this.feeder.start();
        // Both flags are forced off before the fetcher threads (and their protocol
        // factories) are created. NOTE(review): presumably because this class does its
        // own queue blocking and robots checks - confirm against the protocol plugins.
        this.getConf().setBoolean("protocol.plugin.check.blocking", false);
        this.getConf().setBoolean("protocol.plugin.check.robots", false);
        for (int i = 0; i < threadCount; ++i) {
            new FetcherThread(this.getConf()).start();
        }
        // Hang timeout in ms: the task timeout divided by the configured divisor.
        // NOTE(review): a divisor of 0 would throw ArithmeticException here.
        long timeout = this.getConf().getInt("mapred.task.timeout", 600000) / timeoutDivisor;
        boolean throughputThresholdExceeded = false;
        int throughputThresholdNumRetries = 0;
        int throughputThresholdPages = this.getConf().getInt("fetcher.throughput.threshold.pages", -1);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: throughput threshold: " + throughputThresholdPages);
        }
        int throughputThresholdMaxRetries = this.getConf().getInt("fetcher.throughput.threshold.retries", 5);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: throughput threshold retries: " + throughputThresholdMaxRetries);
        }
        // Compared against System.currentTimeMillis() below, i.e. treated as an absolute
        // timestamp; fetch() rewrites this property from minutes to such a timestamp.
        long throughputThresholdTimeLimit = this.getConf().getLong("fetcher.throughput.threshold.check.after", -1L);
        do {
            int hitByTimeLimit;
            // Snapshot the totals, sleep one second, then diff to get per-interval rates.
            int pagesLastSec = this.pages.get();
            int bytesLastSec = (int)this.bytes.get();
            try {
                Thread.sleep(1000L);
            }
            catch (InterruptedException e) {
                // Interruption is ignored; the pass simply runs earlier than usual.
            }
            pagesLastSec = this.pages.get() - pagesLastSec;
            bytesLastSec = (int)this.bytes.get() - bytesLastSec;
            reporter.incrCounter("FetcherStatus", "bytes_downloaded", (long)bytesLastSec);
            this.reportStatus(pagesLastSec, bytesLastSec);
            LOG.info("-activeThreads=" + this.activeThreads + ", spinWaiting=" + this.spinWaiting.get() + ", fetchQueues.totalSize=" + this.fetchQueues.getTotalSize());
            // Once the feeder has stopped and fewer than 5 URLs remain, dump the queues.
            if (!this.feeder.isAlive() && this.fetchQueues.getTotalSize() < 5) {
                this.fetchQueues.dump();
            }
            // Throughput check: active only after the configured timestamp has passed and
            // while a threshold is set (-1 disables it).
            if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1 && pagesLastSec < throughputThresholdPages) {
                LOG.warn(Integer.toString(++throughputThresholdNumRetries) + ": dropping below configured threshold of " + Integer.toString(throughputThresholdPages) + " pages per second");
                if (throughputThresholdNumRetries == throughputThresholdMaxRetries) {
                    LOG.warn("Dropped below threshold too many times, killing!");
                    // Disable further checks, then empty the queues and count what was dropped.
                    throughputThresholdPages = -1;
                    int hitByThrougputThreshold = this.fetchQueues.emptyQueues();
                    if (hitByThrougputThreshold != 0) {
                        reporter.incrCounter("FetcherStatus", "hitByThrougputThreshold", (long)hitByThrougputThreshold);
                    }
                }
            }
            // After the feeder has finished, let the queues apply the time limit and
            // count whatever checkTimelimit() reports.
            if (!this.feeder.isAlive() && (hitByTimeLimit = this.fetchQueues.checkTimelimit()) != 0) {
                reporter.incrCounter("FetcherStatus", "hitByTimeLimit", (long)hitByTimeLimit);
            }
            // Hang detection: keep looping while some thread started a request within
            // the timeout window; otherwise give up even though threads are still active.
            if (System.currentTimeMillis() - this.lastRequestStart.get() <= timeout) continue;
            if (LOG.isWarnEnabled()) {
                LOG.warn("Aborting with " + this.activeThreads + " hung threads.");
            }
            return;
        } while (this.activeThreads.get() > 0);
        LOG.info("-activeThreads=" + this.activeThreads);
    }

    /**
     * Configures and runs the fetch job for one segment, blocking until it finishes.
     *
     * <p>Before submitting the job this method validates the agent configuration,
     * converts the minute-based {@code fetcher.timelimit.mins} and
     * {@code fetcher.throughput.threshold.check.after} settings into absolute
     * millisecond timestamps stored back into the configuration, and logs the outlink
     * budget when outlink following is enabled.
     *
     * @param segment segment directory; input is read from its {@code crawl_generate}
     *                sub-directory and output is written into the segment itself
     * @param threads number of fetcher threads, stored as {@code fetcher.threads.fetch}
     * @throws IOException if the job fails
     */
    public void fetch(Path segment, int threads) throws IOException {
        checkConfiguration();

        final Configuration conf = getConf();
        final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        final long startedAt = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: starting at " + timestampFormat.format(startedAt));
            LOG.info("Fetcher: segment: " + segment);
        }

        // Optional overall time limit: minutes -> absolute deadline in ms.
        final long limitMinutes = conf.getLong("fetcher.timelimit.mins", -1L);
        if (limitMinutes != -1L) {
            final long deadline = System.currentTimeMillis() + limitMinutes * 60L * 1000L;
            LOG.info("Fetcher Timelimit set for : " + deadline);
            conf.setLong("fetcher.timelimit", deadline);
        }

        // The throughput check start is rewritten in place, from minutes to an absolute
        // timestamp under the same property name.
        final long checkAfterMinutes = conf.getLong("fetcher.throughput.threshold.check.after", 10L);
        final long checkAfter = System.currentTimeMillis() + checkAfterMinutes * 60L * 1000L;
        conf.setLong("fetcher.throughput.threshold.check.after", checkAfter);

        final int maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
        if (maxOutlinkDepth > 0) {
            LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));
            final int linksPerLevel = conf.getInt("fetcher.follow.outlinks.num.links", 4);
            final int depthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
            int outlinkBudget = 0;
            for (int level = 0; level < maxOutlinkDepth; level++) {
                // NOTE(review): the division is done on ints before Math.floor sees the
                // value, so floor has no effect - confirm whether that is intended.
                final double perLevel = Math.floor(depthDivisor / (level + 1) * linksPerLevel);
                outlinkBudget += (int)perLevel;
            }
            LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(outlinkBudget));
        }

        final NutchJob job = new NutchJob(conf);
        job.setJobName("fetch " + segment);
        job.setInt("fetcher.threads.fetch", threads);
        job.set("nutch.segment.name", segment.getName());
        // Speculative execution is switched off for this job.
        job.setSpeculativeExecution(false);

        final Path generateDir = new Path(segment, "crawl_generate");
        FileInputFormat.addInputPath((JobConf)job, generateDir);
        job.setInputFormat(InputFormat.class);
        job.setMapRunnerClass(Fetcher.class);

        FileOutputFormat.setOutputPath((JobConf)job, segment);
        job.setOutputFormat(FetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NutchWritable.class);

        JobClient.runJob((JobConf)job);

        final long finishedAt = System.currentTimeMillis();
        LOG.info("Fetcher: finished at " + timestampFormat.format(finishedAt) + ", elapsed: " + TimingUtil.elapsedTime(startedAt, finishedAt));
    }

    /**
     * Command-line entry point: runs a {@code Fetcher} through {@code ToolRunner} and
     * exits the JVM with the returned status code.
     *
     * @param args command-line arguments, passed through unchanged
     * @throws Exception whatever {@code ToolRunner.run} propagates
     */
    public static void main(String[] args) throws Exception {
        final Configuration conf = NutchConfiguration.create();
        final Tool fetcher = new Fetcher();
        final int exitCode = ToolRunner.run(conf, fetcher, args);
        System.exit(exitCode);
    }

    /**
     * Tool entry point: parses the command line and fetches the given segment.
     *
     * <p>Usage: {@code Fetcher <segment> [-threads n]}. The thread count defaults to
     * the {@code fetcher.threads.fetch} property (10 when unset) and is written back
     * to that property before fetching.
     *
     * @param args command-line arguments; {@code args[0]} is the segment path
     * @return 0 on success; -1 on a usage error or when the fetch fails
     * @throws Exception declared by the {@code Tool} contract; fetch failures are
     *                   logged and reported through the return value instead
     */
    public int run(String[] args) throws Exception {
        String usage = "Usage: Fetcher <segment> [-threads n]";
        if (args.length < 1) {
            System.err.println(usage);
            return -1;
        }
        Path segment = new Path(args[0]);
        int threads = this.getConf().getInt("fetcher.threads.fetch", 10);
        for (int i = 1; i < args.length; ++i) {
            if (!args[i].equals("-threads")) continue;
            // "-threads" must be followed by a value; report a usage error rather than
            // running off the end of the argument array.
            if (i + 1 >= args.length) {
                System.err.println("Fetcher: missing value for -threads");
                System.err.println(usage);
                return -1;
            }
            String value = args[++i];
            try {
                threads = Integer.parseInt(value);
            }
            catch (NumberFormatException e) {
                System.err.println("Fetcher: invalid value for -threads: " + value);
                System.err.println(usage);
                return -1;
            }
        }
        this.getConf().setInt("fetcher.threads.fetch", threads);
        try {
            this.fetch(segment, threads);
            return 0;
        }
        catch (Exception e) {
            LOG.error("Fetcher: " + StringUtils.stringifyException((Throwable)e));
            return -1;
        }
    }

    /**
     * Validates the HTTP agent settings before a fetch is started.
     *
     * <p>{@code http.agent.name} is mandatory. {@code http.robots.agents} is only
     * advisory: a warning is logged when its first comma-separated entry is not the
     * agent name (compared case-insensitively), or when the property is missing or
     * contains no entries.
     *
     * @throws IllegalArgumentException if {@code http.agent.name} is unset or blank
     */
    private void checkConfiguration() {
        String agentName = this.getConf().get("http.agent.name");
        if (agentName == null || agentName.trim().length() == 0) {
            String message = "Fetcher: No agents listed in 'http.agent.name' property.";
            if (LOG.isErrorEnabled()) {
                LOG.error(message);
            }
            throw new IllegalArgumentException(message);
        }
        // Only the first advertised agent matters for this check. A missing property or
        // one without any tokens is treated like a mismatch instead of crashing with a
        // NullPointerException / IndexOutOfBoundsException.
        String agentNames = this.getConf().get("http.robots.agents");
        String firstAgent = null;
        if (agentNames != null) {
            StringTokenizer tok = new StringTokenizer(agentNames, ",");
            if (tok.hasMoreTokens()) {
                firstAgent = tok.nextToken().trim();
            }
        }
        if (firstAgent == null || !firstAgent.equalsIgnoreCase(agentName)) {
            String message = "Fetcher: Your 'http.agent.name' value should be listed first in 'http.robots.agents' property.";
            if (LOG.isWarnEnabled()) {
                LOG.warn(message);
            }
        }
    }

    private class FetcherThread
    extends Thread {
        // Configuration this thread was created with, and the helpers built from it
        // in the constructor.
        private Configuration conf;
        private URLFilters urlFilters;
        private ScoringFilters scfilters;
        private ParseUtil parseUtil;
        private URLNormalizers normalizers;
        private ProtocolFactory protocolFactory;
        // "fetcher.max.crawl.delay" multiplied by 1000; run() skips a URL whose robots
        // crawl delay exceeds it, unless this value is negative.
        private long maxCrawlDelay;
        // One of "byHost", "byIP" or "byDomain"; passed to FetchItem.create.
        private String queueMode;
        // "http.redirect.max": when > 0 redirects are followed immediately, otherwise
        // handleRedirect emits the target for a later fetch.
        private int maxRedirect;
        // Per-item redirect state, reset by run() for each fetch item and updated by
        // handleRedirect().
        private String reprUrl;
        private boolean redirecting;
        private int redirectCount;
        // "db.ignore.external.links": when true, redirects to another host are dropped.
        private boolean ignoreExternalLinks;
        // "db.max.outlinks.per.page" and the effective limit derived from it
        // (a negative setting becomes Integer.MAX_VALUE).
        private int maxOutlinksPerPage;
        private final int maxOutlinks;
        // "db.fetch.interval.default".
        private final int interval;
        // Outlink-following settings ("fetcher.follow.outlinks.*").
        private int maxOutlinkDepth;
        private int maxOutlinkDepthNumLinks;
        private boolean outlinksIgnoreExternal;
        private int outlinksDepthDivisor;
        // "parser.skip.truncated".
        private boolean skipTruncated;

        /**
         * Creates a daemon fetcher thread and reads every setting it needs from
         * {@code conf}.
         *
         * <p>An unrecognised {@code fetcher.queue.mode} is logged as an error and
         * replaced by {@code byHost}.
         *
         * @param conf configuration used to build the filters, normalizers, parser and
         *             protocol factory, and to read the fetch limits
         */
        public FetcherThread(Configuration conf) {
            this.setDaemon(true);
            this.setName("FetcherThread");
            this.conf = conf;
            this.urlFilters = new URLFilters(conf);
            this.scfilters = new ScoringFilters(conf);
            this.parseUtil = new ParseUtil(conf);
            this.skipTruncated = conf.getBoolean("parser.skip.truncated", true);
            this.protocolFactory = new ProtocolFactory(conf);
            this.normalizers = new URLNormalizers(conf, "fetcher");
            // Multiply as long: the int product overflows for large configured values
            // before it would be widened to the long field.
            this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000L;
            this.queueMode = conf.get("fetcher.queue.mode", "byHost");
            if (!(this.queueMode.equals("byIP") || this.queueMode.equals("byDomain") || this.queueMode.equals("byHost"))) {
                LOG.error("Unknown partition mode : " + this.queueMode + " - forcing to byHost");
                this.queueMode = "byHost";
            }
            LOG.info("Using queue mode : " + this.queueMode);
            this.maxRedirect = conf.getInt("http.redirect.max", 3);
            this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
            this.maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
            // A negative per-page limit means "no limit".
            this.maxOutlinks = this.maxOutlinksPerPage < 0 ? Integer.MAX_VALUE : this.maxOutlinksPerPage;
            this.interval = conf.getInt("db.fetch.interval.default", 2592000);
            this.maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
            this.outlinksIgnoreExternal = conf.getBoolean("fetcher.follow.outlinks.ignore.external", false);
            this.maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links", 4);
            this.outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
        }

        /**
         * Fetch loop of one worker thread.
         *
         * <p>The thread repeatedly takes a {@code FetchItem} from the shared queues. When
         * none is available it either terminates (feeder finished and queues empty) or
         * sleeps for 500 ms and tries again. For each item it checks robots rules and
         * crawl delay, fetches the content, and dispatches on the protocol status code;
         * redirects are followed in place while {@code redirecting} is set and the
         * redirect count has not exceeded {@code maxRedirect}.
         *
         * <p>NOTE(review): this method was recovered by a decompiler (CFR), which reported
         * "Removed try catching itself - possible behaviour change" for it. The numeric
         * status codes below are inlined constants; the names given in comments are
         * presumed from how each branch behaves - confirm against {@code ProtocolStatus}
         * and {@code CrawlDatum}.
         */
        @Override
        public void run() {
            Fetcher.this.activeThreads.incrementAndGet();
            FetchItem fit = null;
            try {
                while (true) {
                    if ((fit = Fetcher.this.fetchQueues.getFetchItem()) == null) {
                        // Nothing to do right now: stop if the feeder is done and the
                        // queues are drained, otherwise wait and poll again.
                        if (!Fetcher.this.feeder.isAlive()) {
                            if (Fetcher.this.fetchQueues.getTotalSize() <= 0) return;
                        }
                        LOG.debug(this.getName() + " spin-waiting ...");
                        Fetcher.this.spinWaiting.incrementAndGet();
                        try {
                            Thread.sleep(500L);
                        }
                        catch (Exception e) {
                            // Ignored on purpose: an early wake-up just polls sooner.
                        }
                        Fetcher.this.spinWaiting.decrementAndGet();
                        continue;
                    }
                    // Record activity for the hang detection in Fetcher.run(...).
                    Fetcher.this.lastRequestStart.set(System.currentTimeMillis());
                    Text reprUrlWritable = (Text)fit.datum.getMetaData().get((Object)Nutch.WRITABLE_REPR_URL_KEY);
                    this.reprUrl = reprUrlWritable == null ? fit.url.toString() : reprUrlWritable.toString();
                    try {
                        this.redirecting = false;
                        this.redirectCount = 0;
                        do {
                            if (LOG.isInfoEnabled()) {
                                LOG.info("fetching " + fit.url + " (queue crawl delay=" + Fetcher.this.fetchQueues.getFetchItemQueue((String)fit.queueID).crawlDelay + "ms)");
                            }
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("redirectCount=" + this.redirectCount);
                            }
                            this.redirecting = false;
                            Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
                            BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
                            if (!rules.isAllowed(fit.u.toString())) {
                                // Disallowed by robots rules: release the queue slot and
                                // record the URL with status 37.
                                Fetcher.this.fetchQueues.finishFetchItem(fit, true);
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Denied by robots.txt: " + fit.url);
                                }
                                this.output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, 37);
                                Fetcher.this.reporter.incrCounter("FetcherStatus", "robots_denied", 1L);
                                continue;
                            }
                            if (rules.getCrawlDelay() > 0L) {
                                if (rules.getCrawlDelay() > this.maxCrawlDelay && this.maxCrawlDelay >= 0L) {
                                    // The site asks for a longer delay than we accept: skip.
                                    Fetcher.this.fetchQueues.finishFetchItem(fit, true);
                                    LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                                    this.output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, 37);
                                    Fetcher.this.reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1L);
                                    continue;
                                }
                                // Otherwise adopt the robots crawl delay for this queue.
                                FetchItemQueue fiq = Fetcher.this.fetchQueues.getFetchItemQueue(fit.queueID);
                                fiq.crawlDelay = rules.getCrawlDelay();
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
                                }
                            }
                            ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
                            ProtocolStatus status = output.getStatus();
                            Content content = output.getContent();
                            ParseStatus pstatus = null;
                            Fetcher.this.fetchQueues.finishFetchItem(fit);
                            String urlString = fit.url.toString();
                            Fetcher.this.reporter.incrCounter("FetcherStatus", status.getName(), 1L);
                            switch (status.getCode()) {
                                case 22: {
                                    // Presumably "would block": put the item back for later.
                                    Fetcher.this.fetchQueues.addFetchItem(fit);
                                    break;
                                }
                                case 1: {
                                    // Presumably "success": store the page, then follow a
                                    // content-level redirect if the parse reported one.
                                    pstatus = this.output(fit.url, fit.datum, content, status, 33, fit.outlinkDepth);
                                    Fetcher.this.updateStatus(content.getContent().length);
                                    // Minor code 100 is presumably "success with redirect".
                                    if (pstatus == null || !pstatus.isSuccess() || pstatus.getMinorCode() != 100) break;
                                    String newUrl = pstatus.getMessage();
                                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]).intValue();
                                    Text redirUrl = this.handleRedirect(fit.url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                    if (redirUrl == null) break;
                                    CrawlDatum newDatum = new CrawlDatum(1, fit.datum.getFetchInterval(), fit.datum.getScore());
                                    newDatum.getMetaData().putAll((Map)fit.datum.getMetaData());
                                    this.scfilters.initialScore(redirUrl, newDatum);
                                    if (this.reprUrl != null) {
                                        newDatum.getMetaData().put((Writable)Nutch.WRITABLE_REPR_URL_KEY, (Writable)new Text(this.reprUrl));
                                    }
                                    if ((fit = FetchItem.create(redirUrl, newDatum, this.queueMode)) != null) {
                                        // Mark the redirect target as in progress on its queue;
                                        // the enclosing do/while fetches it next.
                                        FetchItemQueue fiq = Fetcher.this.fetchQueues.getFetchItemQueue(fit.queueID);
                                        fiq.addInProgressFetchItem(fit);
                                        break;
                                    }
                                    this.redirecting = false;
                                    Fetcher.this.reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1L);
                                    break;
                                }
                                case 12: 
                                case 13: {
                                    // Protocol-level redirect; code 12 is handled as permanent
                                    // (status 36), anything else here as temporary (status 35).
                                    boolean temp;
                                    int code;
                                    if (status.getCode() == 12) {
                                        code = 36;
                                        temp = false;
                                    } else {
                                        code = 35;
                                        temp = true;
                                    }
                                    this.output(fit.url, fit.datum, content, status, code);
                                    String newUrl = status.getMessage();
                                    Text redirUrl = this.handleRedirect(fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                                    if (redirUrl != null) {
                                        CrawlDatum newDatum = new CrawlDatum(1, fit.datum.getFetchInterval(), fit.datum.getScore());
                                        newDatum.getMetaData().putAll((Map)fit.datum.getMetaData());
                                        this.scfilters.initialScore(redirUrl, newDatum);
                                        if (this.reprUrl != null) {
                                            newDatum.getMetaData().put((Writable)Nutch.WRITABLE_REPR_URL_KEY, (Writable)new Text(this.reprUrl));
                                        }
                                        if ((fit = FetchItem.create(redirUrl, newDatum, this.queueMode)) != null) {
                                            FetchItemQueue fiq = Fetcher.this.fetchQueues.getFetchItemQueue(fit.queueID);
                                            fiq.addInProgressFetchItem(fit);
                                            break;
                                        }
                                        this.redirecting = false;
                                        Fetcher.this.reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1L);
                                        break;
                                    }
                                    this.redirecting = false;
                                    break;
                                }
                                case 16: {
                                    // Presumably "exception": log it and let the queue drop
                                    // its remaining URLs if it has failed too often.
                                    this.logError(fit.url, status.getMessage());
                                    int killedURLs = Fetcher.this.fetchQueues.checkExceptionThreshold(fit.getQueueID());
                                    if (killedURLs != 0) {
                                        Fetcher.this.reporter.incrCounter("FetcherStatus", "AboveExceptionThresholdInQueue", (long)killedURLs);
                                    }
                                }
                                // Intentional fall-through: an exception is also recorded
                                // with status 34, like the two codes below.
                                case 15: 
                                case 23: {
                                    this.output(fit.url, fit.datum, null, status, 34);
                                    break;
                                }
                                case 11: 
                                case 14: 
                                case 17: 
                                case 18: {
                                    this.output(fit.url, fit.datum, null, status, 37);
                                    break;
                                }
                                case 21: {
                                    this.output(fit.url, fit.datum, null, status, 38);
                                    break;
                                }
                                default: {
                                    if (LOG.isWarnEnabled()) {
                                        LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                                    }
                                    this.output(fit.url, fit.datum, null, status, 34);
                                }
                            }
                            if (!this.redirecting || this.redirectCount <= this.maxRedirect) continue;
                            // Too many redirects: give up on this chain.
                            Fetcher.this.fetchQueues.finishFetchItem(fit);
                            if (LOG.isInfoEnabled()) {
                                LOG.info(" - redirect count exceeded " + fit.url);
                            }
                            this.output(fit.url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, 37);
                        } while (this.redirecting && this.redirectCount <= this.maxRedirect);
                    }
                    catch (Throwable t) {
                        // Unexpected failure for this item: release its queue slot and
                        // record it with status 34, then carry on with the next item.
                        Fetcher.this.fetchQueues.finishFetchItem(fit);
                        this.logError(fit.url, StringUtils.stringifyException((Throwable)t));
                        this.output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, 34);
                    }
                }
            }
            catch (Throwable e) {
                if (!LOG.isErrorEnabled()) return;
                LOG.error("fetcher caught:" + e.toString());
                return;
            }
            finally {
                // Always release a still-held item and deregister this thread.
                if (fit != null) {
                    Fetcher.this.fetchQueues.finishFetchItem(fit);
                }
                Fetcher.this.activeThreads.decrementAndGet();
                LOG.info("-finishing thread " + this.getName() + ", activeThreads=" + Fetcher.this.activeThreads);
            }
        }

        /**
         * Decides what to do with a redirect target.
         *
         * <p>The target is normalized and filtered first. It is then either returned for
         * immediate fetching (when {@code maxRedirect > 0}), emitted as a new datum with
         * status 67 for a later fetch, or dropped. It is dropped when it was filtered
         * out, equals the source URL, or points to another host while external links
         * are ignored.
         *
         * <p>Side effects: updates {@code reprUrl}; when the redirect is to be followed
         * now, sets {@code redirecting} and increments {@code redirectCount}.
         *
         * @param url       URL being fetched (only reassigned locally)
         * @param datum     datum of the redirecting page; its fetch interval, score and
         *                  metadata are copied to a datum emitted for a later fetch
         * @param urlString string form of the URL being fetched
         * @param newUrl    raw redirect target
         * @param temp      whether the redirect is temporary; passed to
         *                  {@code URLUtil.chooseRepr}
         * @param redirType label used in log messages
         * @return the redirect target to fetch immediately, or {@code null} when nothing
         *         should be fetched now
         * @throws MalformedURLException propagated from URL normalization
         * @throws URLFilterException    propagated from URL filtering
         */
        private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType) throws MalformedURLException, URLFilterException {
            newUrl = this.normalizers.normalize(newUrl, "fetcher");
            newUrl = this.urlFilters.filter(newUrl);
            // Only compare hosts when a target survived filtering; a null target is
            // reported as "filtered" at the end of this method.
            if (this.ignoreExternalLinks && newUrl != null) {
                try {
                    String origHost = new URL(urlString).getHost();
                    String newHost = new URL(newUrl).getHost();
                    // Host names are case-insensitive; equalsIgnoreCase avoids the
                    // default-locale dependence of toLowerCase().
                    if (!origHost.equalsIgnoreCase(newHost)) {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(" - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl + " because external links are ignored");
                        }
                        return null;
                    }
                }
                catch (MalformedURLException ignored) {
                    // Best effort: if either URL cannot be parsed, the external-host
                    // check is skipped and the redirect is handled as usual.
                }
            }
            if (newUrl != null && !newUrl.equals(urlString)) {
                this.reprUrl = URLUtil.chooseRepr(this.reprUrl, newUrl, temp);
                url = new Text(newUrl);
                if (this.maxRedirect > 0) {
                    // Follow the redirect within the current fetch loop.
                    this.redirecting = true;
                    ++this.redirectCount;
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
                    }
                    return url;
                }
                // Redirects are not followed immediately: emit the target as a new
                // datum (status 67) so it can be fetched in a later round.
                CrawlDatum newDatum = new CrawlDatum(67, datum.getFetchInterval(), datum.getScore());
                newDatum.getMetaData().putAll((Map)datum.getMetaData());
                try {
                    this.scfilters.initialScore(url, newDatum);
                }
                catch (ScoringFilterException e) {
                    // Scoring is best effort here; report through the logger rather
                    // than printing the stack trace to stderr.
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Initial scoring failed for redirect target " + url + ": " + StringUtils.stringifyException((Throwable)e));
                    }
                }
                if (this.reprUrl != null) {
                    newDatum.getMetaData().put((Writable)Nutch.WRITABLE_REPR_URL_KEY, (Writable)new Text(this.reprUrl));
                }
                this.output(url, newDatum, null, null, 67);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
                }
                return null;
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug(" - " + redirType + " redirect skipped: " + (newUrl != null ? "to same url" : "filtered"));
            }
            return null;
        }

        /**
         * Reports a failed fetch at info level (when enabled) and bumps the enclosing fetcher's
         * error counter. The counter is incremented whether or not the message is logged.
         *
         * @param url     URL whose fetch failed
         * @param message description of the failure
         */
        private void logError(Text url, String message) {
            final boolean infoOn = LOG.isInfoEnabled();
            if (infoOn) {
                String report = "fetch of " + url + " failed with: " + message;
                LOG.info(report);
            }
            Fetcher.this.errors.incrementAndGet();
        }

        /**
         * Convenience overload of the six-argument {@code output} that uses an outlink depth of 0.
         *
         * @return whatever the six-argument overload returns
         */
        private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status) {
            final int initialDepth = 0;
            return this.output(key, datum, content, pstatus, status, initialDepth);
        }

        /**
         * Records the outcome of one fetch and, when in-fetcher parsing is enabled, parses the content.
         *
         * <p>The datum is stamped with {@code status} and the current time, the protocol status (if
         * any) is stored in its metadata, and the datum is collected to {@code Fetcher.this.output}.
         * The content is collected too when {@code Fetcher.this.storingContent} is set. When a parse
         * result exists, each of its entries gets a signature and segment metadata, has its outlinks
         * filtered and capped, may seed further fetch items, and is collected as a {@code ParseImpl}.
         * An {@code IOException} from collecting is logged and not rethrown.
         *
         * @param key          URL the datum and content are collected under
         * @param datum        datum to update and emit; status, fetch time, metadata and possibly
         *                     signature are modified
         * @param content      fetched content, or {@code null} if there is none
         * @param pstatus      protocol status to attach to the datum metadata, or {@code null}
         * @param status       status code to set on the datum
         * @param outlinkDepth depth of this item when following outlinks; compared with
         *                     {@code maxOutlinkDepth}
         * @return the status of the parse stored under {@code content.getUrl()}, or {@code null} when
         *         nothing was parsed or no such entry exists
         */
        private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) {
            Parse p;
            ParseResult parseResult;
            // block27 is a decompiler label: "break block27" jumps to the return logic after the try.
            block27: {
                datum.setStatus(status);
                datum.setFetchTime(System.currentTimeMillis());
                if (pstatus != null) {
                    datum.getMetaData().put((Writable)Nutch.WRITABLE_PROTO_STATUS_KEY, (Writable)pstatus);
                }
                parseResult = null;
                if (content != null) {
                    // block26 only exists so the warn message can be skipped when warn logging is off.
                    block26: {
                        Metadata metadata = content.getMetadata();
                        if (content.getContentType() != null) {
                            datum.getMetaData().put((Writable)new Text("Content-Type"), (Writable)new Text(content.getContentType()));
                        }
                        metadata.set("nutch.segment.name", Fetcher.this.segmentName);
                        try {
                            this.scfilters.passScoreBeforeParsing(key, datum, content);
                        }
                        catch (Exception e) {
                            if (!LOG.isWarnEnabled()) break block26;
                            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                        }
                    }
                    // 33 is a decompiler-inlined status constant — presumably CrawlDatum.STATUS_FETCH_SUCCESS; TODO confirm.
                    if (Fetcher.this.parsing && status == 33) {
                        // Parse unless truncated content is to be skipped and this content is truncated.
                        if (!this.skipTruncated || this.skipTruncated && !ParseSegment.isTruncated(content)) {
                            try {
                                parseResult = this.parseUtil.parse(content);
                            }
                            catch (Exception e) {
                                LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException((Throwable)e));
                            }
                        }
                        // No parse result (parsing skipped or failed): sign the content against an empty parse.
                        if (parseResult == null) {
                            byte[] signature = SignatureFactory.getSignature(Fetcher.this.getConf()).calculate(content, new ParseStatus().getEmptyParse(this.conf));
                            datum.setSignature(signature);
                        }
                    }
                    // Store the status code in the content metadata under "_fst_".
                    content.getMetadata().add("_fst_", Integer.toString(status));
                }
                try {
                    // NOTE(review): the datum is collected here, before the loop below may call
                    // datum.setSignature(...) — confirm the collector does not serialize eagerly.
                    Fetcher.this.output.collect((Object)key, (Object)new NutchWritable((Writable)datum));
                    if (content != null && Fetcher.this.storingContent) {
                        Fetcher.this.output.collect((Object)key, (Object)new NutchWritable(content));
                    }
                    // Nothing was parsed: skip all parse post-processing.
                    if (parseResult == null) break block27;
                    for (Map.Entry<Text, Parse> entry : parseResult) {
                        String fromHost;
                        ParseData parseData;
                        Parse parse;
                        Text url;
                        // block28 only exists so the warn message can be skipped when warn logging is off.
                        block28: {
                            url = entry.getKey();
                            parse = entry.getValue();
                            ParseStatus parseStatus = parse.getData().getStatus();
                            parseData = parse.getData();
                            if (!parseStatus.isSuccess()) {
                                LOG.warn("Error parsing: " + key + ": " + parseStatus);
                                // Only "parse" is swapped for an empty parse; parseData still refers to
                                // the data of the failed parse.
                                parse = parseStatus.getEmptyParse(Fetcher.this.getConf());
                            }
                            byte[] signature = SignatureFactory.getSignature(Fetcher.this.getConf()).calculate(content, parse);
                            parseData.getContentMeta().set("nutch.segment.name", Fetcher.this.segmentName);
                            parseData.getContentMeta().set("nutch.content.digest", StringUtil.toHexString(signature));
                            parseData.getContentMeta().set("_ftk_", Long.toString(datum.getFetchTime()));
                            // Only the entry keyed by the fetched URL itself updates the datum's signature.
                            if (url.equals((Object)key)) {
                                datum.setSignature(signature);
                            }
                            try {
                                this.scfilters.passScoreAfterParsing(url, content, parse);
                            }
                            catch (Exception e) {
                                if (!LOG.isWarnEnabled()) break block28;
                                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
                            }
                        }
                        Outlink[] links = parseData.getOutlinks();
                        // Never keep more than maxOutlinks outlinks per parse.
                        int outlinksToStore = Math.min(this.maxOutlinks, links.length);
                        // The source host is only computed when external links are ignored; it stays
                        // null otherwise or when the URL cannot be parsed.
                        if (this.ignoreExternalLinks) {
                            try {
                                fromHost = new URL(url.toString()).getHost().toLowerCase();
                            }
                            catch (MalformedURLException e) {
                                fromHost = null;
                            }
                        } else {
                            fromHost = null;
                        }
                        int validCount = 0;
                        ArrayList<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
                        HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
                        // Walk the links until outlinksToStore of them survive filterNormalize;
                        // a null result means the link is dropped.
                        for (int i = 0; i < links.length && validCount < outlinksToStore; ++i) {
                            String toUrl = links[i].getToUrl();
                            toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, fromHost, this.ignoreExternalLinks, this.urlFilters, this.normalizers);
                            if (toUrl == null) continue;
                            ++validCount;
                            links[i].setUrl(toUrl);
                            outlinkList.add(links[i]);
                            outlinks.add(toUrl);
                        }
                        // Optionally queue the surviving outlinks for fetching, one level deeper.
                        if (this.maxOutlinkDepth > 0 && outlinkDepth < this.maxOutlinkDepth) {
                            Fetcher.this.reporter.incrCounter("FetcherOutlinks", "outlinks_detected", (long)outlinks.size());
                            int outlinkCounter = 0;
                            // NOTE(review): maxOutlinksByDepth is computed but never read; the loop below
                            // is bounded by maxOutlinkDepthNumLinks instead — confirm which is intended.
                            int maxOutlinksByDepth = (int)Math.floor(this.outlinksDepthDivisor / (outlinkDepth + 1) * this.maxOutlinkDepthNumLinks);
                            Iterator iter = outlinks.iterator();
                            while (iter.hasNext() && outlinkCounter < this.maxOutlinkDepthNumLinks) {
                                String followUrl = (String)iter.next();
                                // Links to another host are skipped without counting towards the limit.
                                if (this.outlinksIgnoreExternal && !URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) continue;
                                Fetcher.this.reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1L);
                                // 67 is a decompiler-inlined status constant — presumably CrawlDatum.STATUS_LINKED; TODO confirm.
                                // NOTE(review): FetchItem.create can return null (unparsable URL or
                                // unresolvable host); fit is passed on without a check.
                                FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(67, this.interval), this.queueMode, outlinkDepth + 1);
                                Fetcher.this.fetchQueues.addFetchItem(fit);
                                ++outlinkCounter;
                            }
                        }
                        // Replace the parse's outlinks with the filtered, capped list before emitting it.
                        parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                        Fetcher.this.output.collect((Object)url, (Object)new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
                    }
                }
                catch (IOException e) {
                    // Collect failures are logged (when error logging is on) and otherwise ignored.
                    if (!LOG.isErrorEnabled()) break block27;
                    LOG.error("fetcher caught:" + e.toString());
                }
            }
            // Report and return the status of the parse belonging to the content's own URL, if any.
            if (parseResult != null && !parseResult.isEmpty() && (p = parseResult.get(content.getUrl())) != null) {
                Fetcher.this.reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1L);
                return p.getData().getStatus();
            }
            return null;
        }
    }

    private static class QueueFeeder
    extends Thread {
        private RecordReader<Text, CrawlDatum> reader;
        private FetchItemQueues queues;
        private int size;
        private long timelimit = -1L;

        public QueueFeeder(RecordReader<Text, CrawlDatum> reader, FetchItemQueues queues, int size) {
            this.reader = reader;
            this.queues = queues;
            this.size = size;
            this.setDaemon(true);
            this.setName("QueueFeeder");
        }

        public void setTimeLimit(long tl) {
            this.timelimit = tl;
        }

        @Override
        public void run() {
            boolean hasMore = true;
            int cnt = 0;
            int timelimitcount = 0;
            while (hasMore) {
                if (System.currentTimeMillis() >= this.timelimit && this.timelimit != -1L) {
                    try {
                        Text url = new Text();
                        CrawlDatum datum = new CrawlDatum();
                        hasMore = this.reader.next((Object)url, (Object)datum);
                        ++timelimitcount;
                        continue;
                    }
                    catch (IOException e) {
                        LOG.error("QueueFeeder error reading input, record " + cnt, (Throwable)e);
                        return;
                    }
                }
                int feed = this.size - this.queues.getTotalSize();
                if (feed <= 0) {
                    try {
                        Thread.sleep(1000L);
                    }
                    catch (Exception e) {}
                    continue;
                }
                LOG.debug("-feeding " + feed + " input urls ...");
                while (feed > 0 && hasMore) {
                    try {
                        Text url = new Text();
                        CrawlDatum datum = new CrawlDatum();
                        hasMore = this.reader.next((Object)url, (Object)datum);
                        if (!hasMore) continue;
                        this.queues.addFetchItem(url, datum);
                        ++cnt;
                        --feed;
                    }
                    catch (IOException e) {
                        LOG.error("QueueFeeder error reading input, record " + cnt, (Throwable)e);
                        return;
                    }
                }
            }
            LOG.info("QueueFeeder finished: total " + cnt + " records + hit by time limit :" + timelimitcount);
        }
    }

    /**
     * Collection of {@link FetchItemQueue}s keyed by queue id, plus a running count of items that
     * are queued but not yet handed out.
     *
     * <p>Queues are created on demand and removed by {@link #getFetchItem()} once they have neither
     * queued nor in-progress items. Most methods are {@code synchronized} on this instance.
     * NOTE(review): {@code finishFetchItem} and {@code getQueueCount} read the map without that
     * lock — confirm whether callers rely on this.
     */
    private static class FetchItemQueues {
        public static final String DEFAULT_ID = "default";
        public static final String QUEUE_MODE_HOST = "byHost";
        public static final String QUEUE_MODE_DOMAIN = "byDomain";
        public static final String QUEUE_MODE_IP = "byIP";
        Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
        AtomicInteger totalSize = new AtomicInteger(0);
        int maxThreads;
        long crawlDelay;
        long minCrawlDelay;
        long timelimit = -1L;
        int maxExceptionsPerQueue = -1;
        Configuration conf;
        String queueMode;

        /**
         * Reads queue settings from the configuration.
         *
         * <p>Keys read: {@code fetcher.threads.per.queue}, {@code fetcher.queue.mode},
         * {@code fetcher.server.delay} and {@code fetcher.server.min.delay} (seconds, converted to
         * milliseconds), {@code fetcher.timelimit}, {@code fetcher.max.exceptions.per.queue}.
         * An unrecognised queue mode is logged and replaced by {@code byHost}.
         *
         * @param conf configuration to read from; also passed to every queue created later
         */
        public FetchItemQueues(Configuration conf) {
            this.conf = conf;
            this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
            this.queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
            if (!(this.queueMode.equals(QUEUE_MODE_IP) || this.queueMode.equals(QUEUE_MODE_DOMAIN) || this.queueMode.equals(QUEUE_MODE_HOST))) {
                LOG.error("Unknown partition mode : " + this.queueMode + " - forcing to byHost");
                this.queueMode = QUEUE_MODE_HOST;
            }
            LOG.info("Using queue mode : " + this.queueMode);
            this.crawlDelay = (long)(conf.getFloat("fetcher.server.delay", 1.0f) * 1000.0f);
            this.minCrawlDelay = (long)(conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000.0f);
            this.timelimit = conf.getLong("fetcher.timelimit", -1L);
            this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1);
        }

        /** @return the current count of queued, not-yet-handed-out items */
        public int getTotalSize() {
            return this.totalSize.get();
        }

        /** @return the number of queues currently registered */
        public int getQueueCount() {
            return this.queues.size();
        }

        /**
         * Builds a fetch item for the URL using the configured queue mode and queues it.
         * Nothing is queued when the item cannot be created.
         */
        public void addFetchItem(Text url, CrawlDatum datum) {
            FetchItem it = FetchItem.create(url, datum, this.queueMode);
            if (it != null) {
                this.addFetchItem(it);
            }
        }

        /**
         * Appends the item to the queue matching its queue id, creating the queue if needed.
         *
         * @param it item to queue; {@code null} is ignored
         */
        public synchronized void addFetchItem(FetchItem it) {
            // FetchItem.create returns null for unusable URLs and not every caller checks for it;
            // ignoring null here avoids a NullPointerException and keeps totalSize accurate.
            if (it == null) {
                return;
            }
            FetchItemQueue fiq = this.getFetchItemQueue(it.queueID);
            fiq.addFetchItem(it);
            this.totalSize.incrementAndGet();
        }

        /** Same as {@code finishFetchItem(it, false)}. */
        public void finishFetchItem(FetchItem it) {
            this.finishFetchItem(it, false);
        }

        /**
         * Marks the item as finished in its queue. If the queue no longer exists a warning is
         * logged and nothing else happens.
         *
         * @param it   item that was being fetched
         * @param asap forwarded to the queue's {@code finishFetchItem}
         */
        public void finishFetchItem(FetchItem it, boolean asap) {
            FetchItemQueue fiq = this.queues.get(it.queueID);
            if (fiq == null) {
                LOG.warn("Attempting to finish item from unknown queue: " + it);
                return;
            }
            fiq.finishFetchItem(it, asap);
        }

        /**
         * Returns the queue registered under {@code id}, creating and registering it first if absent.
         */
        public synchronized FetchItemQueue getFetchItemQueue(String id) {
            FetchItemQueue fiq = this.queues.get(id);
            if (fiq == null) {
                fiq = new FetchItemQueue(this.conf, this.maxThreads, this.crawlDelay, this.minCrawlDelay);
                this.queues.put(id, fiq);
            }
            return fiq;
        }

        /**
         * Scans the queues for one that is ready to hand out an item.
         *
         * <p>Queues with no queued and no in-progress items are removed along the way.
         *
         * @return the next item to fetch, or {@code null} if no queue can provide one right now
         */
        public synchronized FetchItem getFetchItem() {
            Iterator<Map.Entry<String, FetchItemQueue>> it = this.queues.entrySet().iterator();
            while (it.hasNext()) {
                FetchItemQueue fiq = it.next().getValue();
                if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
                    it.remove();
                    continue;
                }
                FetchItem fit = fiq.getFetchItem();
                if (fit == null) {
                    continue;
                }
                this.totalSize.decrementAndGet();
                return fit;
            }
            return null;
        }

        /**
         * Empties all queues once the time limit has passed.
         *
         * @return number of items dropped; 0 when the limit is disabled (-1) or not yet reached
         */
        public synchronized int checkTimelimit() {
            int count = 0;
            if (System.currentTimeMillis() >= this.timelimit && this.timelimit != -1L) {
                count = this.emptyQueues();
                // With no queues left the counter must be zero; force it if it has drifted.
                if (this.totalSize.get() != 0 && this.queues.size() == 0) {
                    this.totalSize.set(0);
                }
            }
            return count;
        }

        /**
         * Drops every queued item from every non-empty queue and lowers the total accordingly.
         *
         * @return number of items dropped
         */
        public synchronized int emptyQueues() {
            int count = 0;
            for (Map.Entry<String, FetchItemQueue> entry : this.queues.entrySet()) {
                FetchItemQueue fiq = entry.getValue();
                if (fiq.getQueueSize() == 0) {
                    continue;
                }
                LOG.info("* queue: " + entry.getKey() + " >> dropping! ");
                int deleted = fiq.emptyQueue();
                this.totalSize.addAndGet(-deleted);
                count += deleted;
            }
            return count;
        }

        /**
         * Counts one exception against the queue and empties it once the configured maximum is
         * reached. Unknown or already empty queues are left alone and not counted.
         *
         * @param queueid id of the queue the failing item belonged to
         * @return number of items removed from the queue, 0 if none
         */
        public synchronized int checkExceptionThreshold(String queueid) {
            FetchItemQueue fiq = this.queues.get(queueid);
            if (fiq == null || fiq.getQueueSize() == 0) {
                return 0;
            }
            int excCount = fiq.incrementExceptionCounter();
            if (this.maxExceptionsPerQueue != -1 && excCount >= this.maxExceptionsPerQueue) {
                int deleted = fiq.emptyQueue();
                LOG.info("* queue: " + queueid + " >> removed " + deleted + " URLs from queue because " + excCount + " exceptions occurred");
                this.totalSize.addAndGet(-deleted);
                return deleted;
            }
            return 0;
        }

        /** Logs the state of every non-empty queue at info level. */
        public synchronized void dump() {
            for (Map.Entry<String, FetchItemQueue> entry : this.queues.entrySet()) {
                FetchItemQueue fiq = entry.getValue();
                if (fiq.getQueueSize() == 0) {
                    continue;
                }
                LOG.info("* queue: " + entry.getKey());
                fiq.dump();
            }
        }
    }

    /**
     * One fetch queue: a list of waiting items, the set of items currently being fetched, and
     * the earliest time at which the next item may be handed out.
     */
    private static class FetchItemQueue {
        List<FetchItem> queue = Collections.synchronizedList(new LinkedList<FetchItem>());
        Set<FetchItem> inProgress = Collections.synchronizedSet(new HashSet<FetchItem>());
        AtomicLong nextFetchTime = new AtomicLong();
        AtomicInteger exceptionCounter = new AtomicInteger();
        long crawlDelay;
        long minCrawlDelay;
        int maxThreads;
        Configuration conf;

        /**
         * Creates an empty queue. The end time is initialised to "now minus crawlDelay" and run
         * through the same delay rule as {@code setEndTime(long, boolean)} with {@code asap == false}.
         *
         * @param conf          configuration kept for this queue
         * @param maxThreads    maximum number of items that may be in progress at once
         * @param crawlDelay    delay in milliseconds applied when {@code maxThreads} is 1 or less
         * @param minCrawlDelay delay in milliseconds applied when {@code maxThreads} is above 1
         */
        public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) {
            this.conf = conf;
            this.maxThreads = maxThreads;
            this.crawlDelay = crawlDelay;
            this.minCrawlDelay = minCrawlDelay;
            long initialEnd = System.currentTimeMillis() - crawlDelay;
            this.setEndTime(initialEnd);
        }

        /**
         * Removes all waiting items.
         *
         * @return how many items were waiting before the queue was cleared
         */
        public synchronized int emptyQueue() {
            final int before = this.queue.size();
            this.queue.clear();
            return before;
        }

        /** @return number of waiting items */
        public int getQueueSize() {
            return this.queue.size();
        }

        /** @return number of items currently marked as in progress */
        public int getInProgressSize() {
            return this.inProgress.size();
        }

        /** @return the exception count after adding one */
        public int incrementExceptionCounter() {
            return this.exceptionCounter.incrementAndGet();
        }

        /**
         * Removes the item from the in-progress set and updates the next fetch time from the
         * current time. A {@code null} item is ignored.
         *
         * @param asap when true the next fetch time is "now", without any delay added
         */
        public void finishFetchItem(FetchItem it, boolean asap) {
            if (it == null) {
                return;
            }
            this.inProgress.remove(it);
            this.setEndTime(System.currentTimeMillis(), asap);
        }

        /** Appends the item to the waiting list; {@code null} is ignored. */
        public void addFetchItem(FetchItem it) {
            if (it != null) {
                this.queue.add(it);
            }
        }

        /** Adds the item to the in-progress set; {@code null} is ignored. */
        public void addInProgressFetchItem(FetchItem it) {
            if (it != null) {
                this.inProgress.add(it);
            }
        }

        /**
         * Hands out the first waiting item and marks it as in progress.
         *
         * @return the item, or {@code null} when the in-progress limit is reached, the next fetch
         *         time has not arrived yet, nothing is waiting, or removing the item failed
         */
        public FetchItem getFetchItem() {
            if (this.inProgress.size() >= this.maxThreads) {
                return null;
            }
            final long now = System.currentTimeMillis();
            if (now < this.nextFetchTime.get()) {
                return null;
            }
            if (this.queue.isEmpty()) {
                return null;
            }
            FetchItem head = null;
            try {
                head = this.queue.remove(0);
                this.inProgress.add(head);
            }
            catch (Exception e) {
                LOG.error("Cannot remove FetchItem from queue or cannot add it to inProgress queue", (Throwable)e);
            }
            return head;
        }

        /** Logs the queue's settings, timing state and every waiting URL at info level. */
        public synchronized void dump() {
            LOG.info("  maxThreads    = " + this.maxThreads);
            LOG.info("  inProgress    = " + this.inProgress.size());
            LOG.info("  crawlDelay    = " + this.crawlDelay);
            LOG.info("  minCrawlDelay = " + this.minCrawlDelay);
            LOG.info("  nextFetchTime = " + this.nextFetchTime.get());
            LOG.info("  now           = " + System.currentTimeMillis());
            int pos = 0;
            while (pos < this.queue.size()) {
                FetchItem waiting = this.queue.get(pos);
                LOG.info("  " + pos + ". " + waiting.url);
                ++pos;
            }
        }

        /** Same as {@code setEndTime(endTime, false)}. */
        private void setEndTime(long endTime) {
            this.setEndTime(endTime, false);
        }

        /**
         * Computes the next fetch time from the given end time.
         *
         * @param endTime time the last fetch ended, in epoch milliseconds
         * @param asap    when true no delay is added; otherwise {@code minCrawlDelay} is added if
         *                {@code maxThreads > 1}, else {@code crawlDelay}
         */
        private void setEndTime(long endTime, boolean asap) {
            if (asap) {
                this.nextFetchTime.set(endTime);
                return;
            }
            long delay;
            if (this.maxThreads > 1) {
                delay = this.minCrawlDelay;
            } else {
                delay = this.crawlDelay;
            }
            this.nextFetchTime.set(endTime + delay);
        }
    }

    /**
     * A URL scheduled for fetching, together with its datum, its parsed {@link URL} form, the id
     * of the queue it belongs to and its outlink depth.
     */
    private static class FetchItem {
        int outlinkDepth = 0;
        String queueID;
        Text url;
        URL u;
        CrawlDatum datum;

        /** Creates an item with outlink depth 0. */
        public FetchItem(Text url, URL u, CrawlDatum datum, String queueID) {
            this(url, u, datum, queueID, 0);
        }

        /**
         * @param url          URL as stored in the input
         * @param u            parsed form of {@code url}
         * @param datum        crawl datum for the URL
         * @param queueID      id of the queue this item belongs to
         * @param outlinkDepth outlink depth recorded for this item
         */
        public FetchItem(Text url, URL u, CrawlDatum datum, String queueID, int outlinkDepth) {
            this.url = url;
            this.u = u;
            this.datum = datum;
            this.queueID = queueID;
            this.outlinkDepth = outlinkDepth;
        }

        /** Same as {@code create(url, datum, queueMode, 0)}. */
        public static FetchItem create(Text url, CrawlDatum datum, String queueMode) {
            return FetchItem.create(url, datum, queueMode, 0);
        }

        /**
         * Builds a fetch item whose queue id is {@code <protocol>://<key>} in lower case, where the
         * key depends on the queue mode: the resolved IP address for {@code byIP}, the domain name
         * for {@code byDomain}, and the host for anything else.
         *
         * @param url          URL to fetch
         * @param datum        crawl datum for the URL
         * @param queueMode    queue mode, compared case-insensitively
         * @param outlinkDepth outlink depth to record on the item
         * @return the new item, or {@code null} if the URL cannot be parsed or (in {@code byIP}
         *         mode) its host cannot be resolved
         */
        public static FetchItem create(Text url, CrawlDatum datum, String queueMode, int outlinkDepth) {
            URL parsed;
            try {
                parsed = new URL(url.toString());
            }
            catch (Exception e) {
                LOG.warn("Cannot parse url: " + url, (Throwable)e);
                return null;
            }
            String scheme = parsed.getProtocol().toLowerCase();
            String queueKey = FetchItem.queueKeyFor(url, parsed, queueMode);
            if (queueKey == null) {
                return null;
            }
            return new FetchItem(url, parsed, datum, scheme + "://" + queueKey.toLowerCase(), outlinkDepth);
        }

        /**
         * Picks the queue key for a URL according to the queue mode.
         *
         * @return the key, or {@code null} only when the host cannot be resolved in {@code byIP} mode
         */
        private static String queueKeyFor(Text url, URL parsed, String queueMode) {
            if ("byIP".equalsIgnoreCase(queueMode)) {
                try {
                    return InetAddress.getByName(parsed.getHost()).getHostAddress();
                }
                catch (UnknownHostException e) {
                    LOG.warn("Unable to resolve: " + parsed.getHost() + ", skipping.");
                    return null;
                }
            }
            if ("byDomain".equalsIgnoreCase(queueMode)) {
                String domain = URLUtil.getDomainName(parsed);
                if (domain != null) {
                    return domain;
                }
                LOG.warn("Unknown domain for url: " + url + ", using URL string as key");
                return parsed.toExternalForm();
            }
            String host = parsed.getHost();
            if (host != null) {
                return host;
            }
            LOG.warn("Unknown host for url: " + url + ", using URL string as key");
            return parsed.toExternalForm();
        }

        /** @return the crawl datum of this item */
        public CrawlDatum getDatum() {
            return this.datum;
        }

        /** @return the id of the queue this item belongs to */
        public String getQueueID() {
            return this.queueID;
        }

        /** @return the URL as {@code Text} */
        public Text getUrl() {
            return this.url;
        }

        /** @return the URL as a parsed {@link URL} */
        public URL getURL2() {
            return this.u;
        }
    }

    /**
     * Input format that never splits a file: each input file becomes exactly one split.
     */
    public static class InputFormat
    extends SequenceFileInputFormat<Text, CrawlDatum> {
        /**
         * Produces one split per file returned by {@code listStatus(job)}, each covering the file
         * from offset 0 to its full length with no host hints.
         *
         * @param job     job whose input files are listed
         * @param nSplits requested number of splits; not used
         * @return one {@code FileSplit} per input file, in listing order
         * @throws IOException if listing the input files fails
         */
        public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
            FileStatus[] inputs = this.listStatus(job);
            FileSplit[] wholeFiles = new FileSplit[inputs.length];
            int slot = 0;
            for (FileStatus input : inputs) {
                wholeFiles[slot] = new FileSplit(input.getPath(), 0L, input.getLen(), (String[])null);
                ++slot;
            }
            return wholeFiles;
        }
    }
}

