/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URL;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class CrawlDbReader
implements Closeable {
    public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class);
    private MapFile.Reader[] readers = null;

    /**
     * Lazily opens the MapFile readers over the {@code current} sub-directory of a CrawlDb.
     * Readers that were opened by an earlier call are kept and reused; in that case this
     * method returns without touching the file system.
     *
     * @param crawlDb path of the CrawlDb directory
     * @param config configuration used to obtain the file system and the readers
     * @throws IOException if the file system or the readers cannot be obtained
     */
    private void openReaders(String crawlDb, Configuration config) throws IOException {
        if (readers == null) {
            FileSystem fileSystem = FileSystem.get(config);
            Path currentDir = new Path(crawlDb, "current");
            readers = MapFileOutputFormat.getReaders(fileSystem, currentDir, config);
        }
    }

    /**
     * Closes every open MapFile reader and forgets them, so that a later
     * {@code openReaders} call opens fresh readers instead of reusing closed ones.
     * Closing is best-effort: a failure on one reader is logged and the remaining
     * readers are still closed.
     */
    private void closeReaders() {
        if (readers == null) {
            return;
        }
        for (MapFile.Reader reader : readers) {
            if (reader == null) {
                continue;
            }
            try {
                reader.close();
            } catch (Exception e) {
                // Best-effort cleanup: report the problem but keep closing the others.
                LOG.warn("Failed to close CrawlDb reader", e);
            }
        }
        // Drop the references; openReaders() treats non-null as "already open".
        readers = null;
    }

    /**
     * Closes this reader by delegating to {@code closeReaders()}.
     */
    @Override
    public void close() {
        closeReaders();
    }

    /**
     * Runs a MapReduce job over the {@code current} directory of a CrawlDb, merges the
     * resulting counters and writes them to the log at INFO level.
     *
     * <p>The job output goes to a temporary {@code stat_tmp<millis>} folder inside the
     * CrawlDb. That folder is removed when this method finishes, whether the job
     * succeeded or not.
     *
     * @param crawlDb path of the CrawlDb directory
     * @param config configuration used to create the job and access the file system
     * @param sort passed to the mapper as {@code db.reader.stats.sort}; when true the
     *        mapper additionally counts statuses per host
     * @throws IOException if the job fails or its output cannot be read
     */
    public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb statistics start: " + crawlDb);
        }
        Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
        JobConf job = new NutchJob(config);
        job.setJobName("stats " + crawlDb);
        job.setBoolean("db.reader.stats.sort", sort);
        FileInputFormat.addInputPath(job, new Path(crawlDb, "current"));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbStatMapper.class);
        job.setCombinerClass(CrawlDbStatCombiner.class);
        job.setReducerClass(CrawlDbStatReducer.class);
        FileOutputFormat.setOutputPath(job, tmpFolder);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        FileSystem fileSystem = FileSystem.get(config);
        try {
            JobClient.runJob(job);
            TreeMap<String, LongWritable> stats = readStatFiles(config, tmpFolder);
            if (LOG.isInfoEnabled()) {
                logStats(crawlDb, stats);
            }
        } finally {
            // Always remove the temporary output, even when the job or the read failed.
            try {
                fileSystem.delete(tmpFolder, true);
            } catch (IOException e) {
                LOG.warn("Could not delete temporary folder " + tmpFolder, e);
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb statistics: done");
        }
    }

    /**
     * Reads every output file of the statistics job and merges the records per key:
     * "scx" keeps the maximum, "scn" keeps the minimum, all other keys are summed.
     */
    private static TreeMap<String, LongWritable> readStatFiles(Configuration config, Path folder) throws IOException {
        TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
        SequenceFile.Reader[] statReaders = SequenceFileOutputFormat.getReaders(config, folder);
        try {
            Text key = new Text();
            LongWritable value = new LongWritable();
            for (SequenceFile.Reader reader : statReaders) {
                while (reader.next(key, value)) {
                    String k = key.toString();
                    long v = value.get();
                    LongWritable merged = stats.get(k);
                    if (merged == null) {
                        // First record for this key is the starting point for min, max and sum alike.
                        stats.put(k, new LongWritable(v));
                    } else if ("scx".equals(k)) {
                        merged.set(Math.max(merged.get(), v));
                    } else if ("scn".equals(k)) {
                        merged.set(Math.min(merged.get(), v));
                    } else {
                        merged.set(merged.get() + v);
                    }
                }
            }
        } finally {
            for (SequenceFile.Reader reader : statReaders) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOG.warn("Failed to close statistics reader", e);
                }
            }
        }
        return stats;
    }

    /**
     * Writes the merged statistics to the log. Removes the "T" (total) entry from
     * {@code stats}; a missing total (no records at all) is reported as 0.
     */
    private static void logStats(String crawlDb, TreeMap<String, LongWritable> stats) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.remove("T");
        long total = totalCnt == null ? 0L : totalCnt.get();
        LOG.info("TOTAL urls:\t" + total);
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                // Scores were multiplied by 1000 by the mapper; scale them back.
                LOG.info("min score:\t" + (float)val.get() / 1000.0f);
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float)val.get() / 1000.0f);
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float)((double)val.get() / (double)total / 1000.0));
            } else if (k.startsWith("status")) {
                // Key layout: "status <code>" or, in sort mode, "status <code> <host>".
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2) {
                    LOG.info("   " + st[2] + " :\t" + val);
                } else {
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte)code) + "):\t" + val);
                }
            } else {
                LOG.info(k + ":\t" + val);
            }
        }
    }

    /**
     * Looks up the CrawlDb entry stored under the given URL.
     *
     * @param crawlDb path of the CrawlDb directory; only used when the readers have not
     *        been opened yet
     * @param url URL used as lookup key
     * @param config configuration used to open the readers
     * @return whatever {@code MapFileOutputFormat.getEntry} returns for the key, cast to
     *         {@code CrawlDatum}; {@code readUrl} treats {@code null} as "not found"
     * @throws IOException if the readers cannot be opened or the lookup fails
     */
    public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException {
        Text lookupKey = new Text(url);
        CrawlDatum holder = new CrawlDatum();
        openReaders(crawlDb, config);
        return (CrawlDatum) MapFileOutputFormat.getEntry(
                readers, new HashPartitioner<Text, CrawlDatum>(), lookupKey, holder);
    }

    /**
     * Prints the CrawlDb entry of a single URL to {@code System.out}: first a
     * {@code URL: <url>} line, then either the entry itself or {@code not found}.
     *
     * @param crawlDb path of the CrawlDb directory
     * @param url URL to look up
     * @param config configuration used to open the readers
     * @throws IOException if the lookup fails
     */
    public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
        CrawlDatum datum = get(crawlDb, url, config);
        System.out.println("URL: " + url);
        if (datum == null) {
            System.out.println("not found");
            return;
        }
        System.out.println(datum);
    }

    /**
     * Runs a MapReduce job that dumps the {@code current} directory of a CrawlDb into
     * {@code output}, optionally filtered by status, URL regex and retry count.
     *
     * @param crawlDb path of the CrawlDb directory
     * @param output directory the dump is written to
     * @param config configuration used to create the job
     * @param format {@code "csv"} selects the CSV output format, {@code "crawldb"} a
     *        MapFile output; any other value, including {@code null}, selects plain text
     * @param regex if non-null, handed to the mapper as the {@code regex} job property
     * @param status if non-null, handed to the mapper as the {@code status} job property
     * @param retry if non-null, handed to the mapper as the {@code retry} job property
     * @throws IOException if the job fails
     */
    public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb dump: starting");
            LOG.info("CrawlDb db: " + crawlDb);
        }
        Path outFolder = new Path(output);
        JobConf job = new NutchJob(config);
        job.setJobName("dump " + crawlDb);
        FileInputFormat.addInputPath(job, new Path(crawlDb, "current"));
        job.setInputFormat(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(job, outFolder);
        // Constant-first comparisons so that a null format falls back to text output
        // instead of throwing a NullPointerException.
        if ("csv".equals(format)) {
            job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
        } else if ("crawldb".equals(format)) {
            job.setOutputFormat(MapFileOutputFormat.class);
        } else {
            job.setOutputFormat(TextOutputFormat.class);
        }
        if (status != null) {
            job.set("status", status);
        }
        if (regex != null) {
            job.set("regex", regex);
        }
        if (retry != null) {
            job.setInt("retry", retry);
        }
        job.setMapperClass(CrawlDbDumpMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        JobClient.runJob(job);
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb dump: done");
        }
    }

    /**
     * Dumps the {@code topN} highest scored URLs of a CrawlDb as text into {@code output}.
     *
     * <p>Two jobs are chained: the first emits (negated score, URL) pairs into a
     * temporary directory, dropping records scored below {@code min}; the second reads
     * them with a single reducer and writes at most {@code topN} of them. The temporary
     * directory is removed when this method finishes, whether the jobs succeeded or not.
     *
     * @param crawlDb path of the CrawlDb directory
     * @param topN maximum number of records to write
     * @param min minimum score; handed to the mapper as a long scaled by 1,000,000
     * @param output directory the result is written to
     * @param config configuration used to create the jobs and access the file system
     * @throws IOException if one of the jobs fails
     */
    public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
            LOG.info("CrawlDb db: " + crawlDb);
        }
        Path outFolder = new Path(output);
        Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        FileSystem fs = FileSystem.get(config);
        try {
            // Job 1: key every URL by its negated score.
            JobConf job = new NutchJob(config);
            job.setJobName("topN prepare " + crawlDb);
            FileInputFormat.addInputPath(job, new Path(crawlDb, "current"));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbTopNMapper.class);
            job.setReducerClass(IdentityReducer.class);
            FileOutputFormat.setOutputPath(job, tempDir);
            job.setOutputFormat(SequenceFileOutputFormat.class);
            job.setOutputKeyClass(FloatWritable.class);
            job.setOutputValueClass(Text.class);
            job.setLong("db.reader.topn.min", Math.round(1000000.0 * (double)min));
            JobClient.runJob(job);

            if (LOG.isInfoEnabled()) {
                LOG.info("CrawlDb topN: collecting topN scores.");
            }

            // Job 2: a single reducer cuts the list off after topN records.
            job = new NutchJob(config);
            job.setJobName("topN collect " + crawlDb);
            job.setLong("db.reader.topn", topN);
            FileInputFormat.addInputPath(job, tempDir);
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(IdentityMapper.class);
            job.setReducerClass(CrawlDbTopNReducer.class);
            FileOutputFormat.setOutputPath(job, outFolder);
            job.setOutputFormat(TextOutputFormat.class);
            job.setOutputKeyClass(FloatWritable.class);
            job.setOutputValueClass(Text.class);
            job.setNumReduceTasks(1);
            JobClient.runJob(job);
        } finally {
            // Do not leave the intermediate data behind when a job fails.
            try {
                fs.delete(tempDir, true);
            } catch (IOException e) {
                LOG.warn("Could not delete temporary directory " + tempDir, e);
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb topN: done");
        }
    }

    /**
     * Command line entry point. The first argument is the CrawlDb directory; it is
     * followed by one or more commands ({@code -stats}, {@code -dump}, {@code -url},
     * {@code -topN}) which are executed in the order given.
     *
     * <p>Usage is printed when no command is given. When a command or option lacks a
     * required value, an error is printed to {@code System.err} and processing stops.
     * The reader is closed before this method returns.
     *
     * @param args command line arguments
     * @throws IOException if one of the executed commands fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            System.err.println("Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
            System.err.println("\t<crawldb>\tdirectory name where crawldb is located");
            System.err.println("\t-stats [-sort] \tprint overall statistics to System.out");
            System.err.println("\t\t[-sort]\tlist status sorted by host");
            System.err.println("\t-dump <out_dir> [-format normal|csv|crawldb]\tdump the whole db to a text file in <out_dir>");
            System.err.println("\t\t[-format csv]\tdump in Csv format");
            System.err.println("\t\t[-format normal]\tdump in standard format (default option)");
            System.err.println("\t\t[-format crawldb]\tdump as CrawlDB");
            System.err.println("\t\t[-regex <expr>]\tfilter records with expression");
            System.err.println("\t\t[-retry <num>]\tminimum retry count");
            System.err.println("\t\t[-status <status>]\tfilter records by CrawlDatum status");
            System.err.println("\t-url <url>\tprint information on <url> to System.out");
            System.err.println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
            System.err.println("\t\t[<min>]\tskip records with scores below this value.");
            System.err.println("\t\t\tThis can significantly improve performance.");
            return;
        }
        String crawlDb = args[0];
        Configuration conf = NutchConfiguration.create();
        CrawlDbReader dbr = new CrawlDbReader();
        try {
            for (int i = 1; i < args.length; ++i) {
                String command = args[i];
                if ("-stats".equals(command)) {
                    boolean toSort = false;
                    if (i < args.length - 1 && "-sort".equals(args[i + 1])) {
                        toSort = true;
                        ++i;
                    }
                    dbr.processStatJob(crawlDb, conf, toSort);
                } else if ("-dump".equals(command)) {
                    if (i + 1 >= args.length) {
                        System.err.println("\nError: missing argument for " + command);
                        return;
                    }
                    String outDir = args[++i];
                    String format = "normal";
                    String regex = null;
                    Integer retry = null;
                    String status = null;
                    for (int j = i + 1; j < args.length; ++j) {
                        String option = args[j];
                        boolean isDumpOption = "-format".equals(option) || "-regex".equals(option)
                                || "-retry".equals(option) || "-status".equals(option);
                        if (!isDumpOption) {
                            continue;
                        }
                        if (j + 1 >= args.length) {
                            System.err.println("\nError: missing argument for " + option);
                            return;
                        }
                        // Consume the value here so it is never re-tested as an option name.
                        String optionValue = args[++j];
                        if ("-format".equals(option)) {
                            format = optionValue;
                        } else if ("-regex".equals(option)) {
                            regex = optionValue;
                        } else if ("-retry".equals(option)) {
                            retry = Integer.parseInt(optionValue);
                        } else {
                            status = optionValue;
                        }
                        // Let the outer loop skip the option and its value.
                        i += 2;
                    }
                    dbr.processDumpJob(crawlDb, outDir, conf, format, regex, status, retry);
                } else if ("-url".equals(command)) {
                    if (i + 1 >= args.length) {
                        System.err.println("\nError: missing argument for " + command);
                        return;
                    }
                    dbr.readUrl(crawlDb, args[++i], conf);
                } else if ("-topN".equals(command)) {
                    // Needs both <nnnn> and <out_dir>.
                    if (i + 2 >= args.length) {
                        System.err.println("\nError: missing argument for " + command);
                        return;
                    }
                    long topN = Long.parseLong(args[++i]);
                    String outDir = args[++i];
                    float min = 0.0f;
                    if (i < args.length - 1) {
                        min = Float.parseFloat(args[++i]);
                    }
                    dbr.processTopNJob(crawlDb, topN, min, outDir, conf);
                } else {
                    System.err.println("\nError: wrong argument " + command);
                }
            }
        } finally {
            // Release readers opened by -url lookups.
            dbr.close();
        }
    }

    /**
     * Mapper of the dump job. Passes a record through unchanged unless one of the
     * optional filters read from the job configuration rejects it:
     * <ul>
     *   <li>{@code retry}: records with fewer retries since fetch are dropped;</li>
     *   <li>{@code status}: records whose status name differs (ignoring case) are dropped;</li>
     *   <li>{@code regex}: records whose URL does not match the whole expression are dropped.</li>
     * </ul>
     */
    public static class CrawlDbDumpMapper
    implements Mapper<Text, CrawlDatum, Text, CrawlDatum> {
        Pattern pattern = null;
        Matcher matcher = null;
        String status = null;
        Integer retry = null;

        /** Reads the optional {@code regex}, {@code status} and {@code retry} filters. */
        public void configure(JobConf job) {
            String regex = job.get("regex", null);
            if (regex != null) {
                this.pattern = Pattern.compile(regex);
            }
            this.status = job.get("status", null);
            // -1 means "no retry filter".
            this.retry = job.getInt("retry", -1);
        }

        public void close() {
        }

        /**
         * Emits {@code (key, value)} unchanged when the record passes all configured filters.
         */
        public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
            // The null check keeps an unconfigured mapper from failing on unboxing.
            if (this.retry != null && this.retry != -1 && value.getRetriesSinceFetch() < this.retry) {
                return;
            }
            if (this.status != null && !this.status.equalsIgnoreCase(CrawlDatum.getStatusName(value.getStatus()))) {
                return;
            }
            if (this.pattern != null) {
                this.matcher = this.pattern.matcher(key.toString());
                if (!this.matcher.matches()) {
                    return;
                }
            }
            output.collect(key, value);
        }
    }

    /**
     * Reducer of the second topN job. Input keys are negated scores (the topN mapper in
     * this file emits {@code -score}); this reducer turns the sign back and stops
     * emitting once its share of {@code db.reader.topn} records has been written.
     */
    public static class CrawlDbTopNReducer
    implements Reducer<FloatWritable, Text, FloatWritable, Text> {
        private long topN;
        private long count = 0L;
        /** Separate output key so the key object owned by the framework is never modified. */
        private final FloatWritable outKey = new FloatWritable();

        /**
         * Emits the URLs of one score group under the restored (positive) score until the
         * record limit is reached.
         */
        public void reduce(FloatWritable key, Iterator<Text> values, OutputCollector<FloatWritable, Text> output, Reporter reporter) throws IOException {
            // Negate once per group. Negating inside the loop would flip the sign back
            // and forth for every further value sharing the same score.
            this.outKey.set(-key.get());
            while (values.hasNext() && this.count < this.topN) {
                output.collect(this.outKey, values.next());
                ++this.count;
            }
        }

        /** Reads the record limit and divides it evenly between the reduce tasks. */
        public void configure(JobConf job) {
            // Guard against a reduce-task count of zero to avoid a division by zero.
            long reducers = Math.max(1, job.getNumReduceTasks());
            this.topN = job.getLong("db.reader.topn", 100L) / reducers;
        }

        public void close() {
        }
    }

    /**
     * Mapper of the first topN job. Emits {@code (-score, url)} for every record whose
     * score is not below the configured minimum; keying by the negated score is what
     * lets the topN reducer in this file restore it by negating again.
     */
    public static class CrawlDbTopNMapper
    implements Mapper<Text, CrawlDatum, FloatWritable, Text> {
        /**
         * Reusable output key. Kept per instance rather than static so that mapper
         * instances running in the same JVM do not share mutable state.
         */
        private final FloatWritable fw = new FloatWritable();
        private float min = 0.0f;

        /** Reads the minimum score, stored in the job as a long scaled by 1,000,000. */
        public void configure(JobConf job) {
            long lmin = job.getLong("db.reader.topn.min", 0L);
            if (lmin != 0L) {
                this.min = (float)lmin / 1000000.0f;
            }
        }

        public void close() {
        }

        /** Emits the negated score as key and the URL as value, skipping low scores. */
        public void map(Text key, CrawlDatum value, OutputCollector<FloatWritable, Text> output, Reporter reporter) throws IOException {
            if (value.getScore() < this.min) {
                return;
            }
            this.fw.set(-value.getScore());
            output.collect(this.fw, key);
        }
    }

    /**
     * Reducer of the statistics job. Aggregates per key:
     * <ul>
     *   <li>"T", "sct" and keys starting with "status" or "retry": sum;</li>
     *   <li>"scx": maximum; "scn": minimum;</li>
     *   <li>"s" (raw scaled scores that no combiner has processed): emitted as
     *       "scn" / "scx" / "sct", i.e. minimum, maximum and sum.</li>
     * </ul>
     * Any other key is ignored.
     */
    public static class CrawlDbStatReducer
    implements Reducer<Text, LongWritable, Text, LongWritable> {
        public void configure(JobConf job) {
        }

        public void close() {
        }

        public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            String k = key.toString();
            if (k.equals("T") || k.equals("sct") || k.startsWith("status") || k.startsWith("retry")) {
                long sum = 0L;
                while (values.hasNext()) {
                    sum += values.next().get();
                }
                output.collect(key, new LongWritable(sum));
            } else if (k.equals("scx")) {
                long max = Long.MIN_VALUE;
                while (values.hasNext()) {
                    max = Math.max(max, values.next().get());
                }
                output.collect(key, new LongWritable(max));
            } else if (k.equals("scn")) {
                long min = Long.MAX_VALUE;
                while (values.hasNext()) {
                    min = Math.min(min, values.next().get());
                }
                output.collect(key, new LongWritable(min));
            } else if (k.equals("s")) {
                // A combiner is an optimisation the framework may skip, so raw scores can
                // arrive here. Derive the same three records the combiner would produce.
                long total = 0L;
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                while (values.hasNext()) {
                    long score = values.next().get();
                    min = Math.min(min, score);
                    max = Math.max(max, score);
                    total += score;
                }
                output.collect(new Text("scn"), new LongWritable(min));
                output.collect(new Text("scx"), new LongWritable(max));
                output.collect(new Text("sct"), new LongWritable(total));
            }
        }
    }

    /**
     * Combiner of the statistics job. Raw scores (key "s") are condensed into the three
     * records "scn" (minimum), "sct" (sum) and "scx" (maximum); every other counter is
     * summed.
     *
     * <p>The framework may run a combiner again on output that was already combined, so
     * "scn" and "scx" arriving as input keep their minimum / maximum meaning instead of
     * being added up.
     */
    public static class CrawlDbStatCombiner
    implements Reducer<Text, LongWritable, Text, LongWritable> {
        LongWritable val = new LongWritable();

        public void configure(JobConf job) {
        }

        public void close() {
        }

        public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            String k = key.toString();
            if (k.equals("s")) {
                long total = 0L;
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                while (values.hasNext()) {
                    long score = values.next().get();
                    min = Math.min(min, score);
                    max = Math.max(max, score);
                    total += score;
                }
                // Emitted in ascending key order (scn < sct < scx).
                output.collect(new Text("scn"), new LongWritable(min));
                output.collect(new Text("sct"), new LongWritable(total));
                output.collect(new Text("scx"), new LongWritable(max));
            } else if (k.equals("scn")) {
                // Already-combined minimum: keep the smallest, never add.
                long min = Long.MAX_VALUE;
                while (values.hasNext()) {
                    min = Math.min(min, values.next().get());
                }
                this.val.set(min);
                output.collect(key, this.val);
            } else if (k.equals("scx")) {
                // Already-combined maximum: keep the largest, never add.
                long max = Long.MIN_VALUE;
                while (values.hasNext()) {
                    max = Math.max(max, values.next().get());
                }
                this.val.set(max);
                output.collect(key, this.val);
            } else {
                long sum = 0L;
                while (values.hasNext()) {
                    sum += values.next().get();
                }
                this.val.set(sum);
                output.collect(key, this.val);
            }
        }
    }

    /**
     * Mapper of the statistics job. For every record it emits a count of one under "T",
     * "status &lt;code&gt;" and "retry &lt;n&gt;", plus the score multiplied by 1000 and
     * truncated to a long under "s". When {@code db.reader.stats.sort} is set it also
     * emits a count under "status &lt;code&gt; &lt;host&gt;".
     */
    public static class CrawlDbStatMapper
    implements Mapper<Text, CrawlDatum, Text, LongWritable> {
        LongWritable COUNT_1 = new LongWritable(1L);
        private boolean sort = false;

        public void configure(JobConf job) {
            this.sort = job.getBoolean("db.reader.stats.sort", false);
        }

        public void close() {
        }

        public void map(Text key, CrawlDatum value, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
            output.collect(new Text("T"), this.COUNT_1);
            output.collect(new Text("status " + value.getStatus()), this.COUNT_1);
            output.collect(new Text("retry " + value.getRetriesSinceFetch()), this.COUNT_1);
            output.collect(new Text("s"), new LongWritable((long)((double)value.getScore() * 1000.0)));
            if (this.sort) {
                String host;
                try {
                    host = new URL(key.toString()).getHost();
                } catch (java.net.MalformedURLException e) {
                    // One unparsable key must not fail the whole statistics job;
                    // the record is still counted in the global figures above.
                    LOG.warn("Skipping per-host status count for malformed URL: " + key, e);
                    return;
                }
                output.collect(new Text("status " + value.getStatus() + " " + host), this.COUNT_1);
            }
        }
    }

    /**
     * Output format that writes one semicolon-separated line per CrawlDb record,
     * preceded by a header line naming the columns. Text is written as UTF-8.
     */
    public static class CrawlDatumCsvOutputFormat
    extends FileOutputFormat<Text, CrawlDatum> {
        /**
         * Creates the file {@code name} inside the job's output path and returns a writer
         * for it.
         */
        public RecordWriter<Text, CrawlDatum> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException {
            Path dir = FileOutputFormat.getOutputPath(job);
            FSDataOutputStream fileOut = fs.create(new Path(dir, name), progress);
            return new LineRecordWriter(fileOut);
        }

        /** Writes the header on construction and then one line per record. */
        protected static class LineRecordWriter
        implements RecordWriter<Text, CrawlDatum> {
            private DataOutputStream out;

            public LineRecordWriter(DataOutputStream out) {
                this.out = out;
                try {
                    out.writeBytes("Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature;Metadata\n");
                }
                catch (IOException e) {
                    // The constructor signature cannot throw; report the problem instead
                    // of hiding it. Later writes to the same stream will surface it.
                    LOG.warn("Could not write CSV header", e);
                }
            }

            /** Writes a string as UTF-8 so that non-ASCII characters are preserved. */
            private void writeText(String text) throws IOException {
                this.out.write(text.getBytes("UTF-8"));
            }

            /** Writes a string enclosed in double quotes. */
            private void writeQuoted(String text) throws IOException {
                this.out.writeByte('"');
                writeText(text);
                this.out.writeByte('"');
            }

            /** Writes one record as a CSV line in the column order of the header. */
            public synchronized void write(Text key, CrawlDatum value) throws IOException {
                writeQuoted(key.toString());
                this.out.writeByte(';');
                writeText(Integer.toString(value.getStatus()));
                this.out.writeByte(';');
                writeQuoted(CrawlDatum.getStatusName(value.getStatus()));
                this.out.writeByte(';');
                writeText(new Date(value.getFetchTime()).toString());
                this.out.writeByte(';');
                writeText(new Date(value.getModifiedTime()).toString());
                this.out.writeByte(';');
                writeText(Integer.toString(value.getRetriesSinceFetch()));
                this.out.writeByte(';');
                writeText(Float.toString(value.getFetchInterval()));
                this.out.writeByte(';');
                // Float divisor: keeps fractional days even if the interval is an integer type.
                writeText(Float.toString(value.getFetchInterval() / 86400.0f));
                this.out.writeByte(';');
                writeText(Float.toString(value.getScore()));
                this.out.writeByte(';');
                writeQuoted(value.getSignature() != null ? StringUtil.toHexString(value.getSignature()) : "null");
                this.out.writeByte(';');
                this.out.writeByte('"');
                if (value.getMetaData() != null) {
                    // Each entry is written as key:value followed by "|||".
                    for (Map.Entry e : value.getMetaData().entrySet()) {
                        writeText(((Writable)e.getKey()).toString());
                        this.out.writeByte(':');
                        writeText(((Writable)e.getValue()).toString());
                        writeText("|||");
                    }
                }
                this.out.writeByte('"');
                this.out.writeByte('\n');
            }

            public synchronized void close(Reporter reporter) throws IOException {
                this.out.close();
            }
        }
    }
}

