/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.segment;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SegmentReader
extends Configured
implements Reducer<Text, NutchWritable, Text, Text> {
    /** Logger shared by this class, its nested classes and the worker threads started in {@code get}. */
    public static final Logger LOG = LoggerFactory.getLogger(SegmentReader.class);
    /** Running record number; {@code reduce} emits it and then increments it once per key. */
    long recNo = 0L;
    // Part-selection flags. Each one enables processing of one segment sub-directory:
    // co = "content", fe = "crawl_fetch", ge = "crawl_generate",
    // pa = "crawl_parse", pd = "parse_data", pt = "parse_text".
    private boolean co;
    private boolean fe;
    private boolean ge;
    private boolean pa;
    private boolean pd;
    private boolean pt;
    /** File system obtained from the configuration; stays {@code null} if that lookup failed. */
    private FileSystem fs;
    /** Result-map key paired with the heading printed by {@code get}; array order is the print order. */
    private static final String[][] keys = new String[][]{{"co", "Content::\n"}, {"ge", "Crawl Generate::\n"}, {"fe", "Crawl Fetch::\n"}, {"pa", "Crawl Parse::\n"}, {"pd", "ParseData::\n"}, {"pt", "ParseText::\n"}};
    // Formats the fetch start/end columns in list().
    // NOTE(review): SimpleDateFormat is not thread-safe; in the visible code it is only used from list().
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
    // Numeric operation modes matching the -dump, -list and -get command-line switches handled in main().
    private static final int MODE_DUMP = 0;
    private static final int MODE_LIST = 1;
    private static final int MODE_GET = 2;

    /**
     * Creates a reader without a configuration ({@code null} is handed to the
     * superclass). The part-selection flags and the file system are left at
     * their defaults until {@code configure(JobConf)} is called.
     * NOTE(review): presumably this is the constructor used when the class is
     * instantiated as the reducer of the dump job - confirm.
     */
    public SegmentReader() {
        super(null);
    }

    public SegmentReader(Configuration conf, boolean co, boolean fe, boolean ge, boolean pa, boolean pd, boolean pt) {
        super(conf);
        this.co = co;
        this.fe = fe;
        this.ge = ge;
        this.pa = pa;
        this.pd = pd;
        this.pt = pt;
        try {
            this.fs = FileSystem.get((Configuration)this.getConf());
        }
        catch (IOException e) {
            LOG.error("IOException:", (Throwable)e);
        }
    }

    /**
     * Adopts the job configuration and reads the six {@code segment.reader.*}
     * flags from it (each defaults to {@code true}), then resolves the file
     * system for that configuration.
     *
     * @param job job configuration to adopt
     */
    public void configure(JobConf job) {
        this.setConf(job);
        final Configuration conf = this.getConf();
        this.co = conf.getBoolean("segment.reader.co", true);
        this.fe = conf.getBoolean("segment.reader.fe", true);
        this.ge = conf.getBoolean("segment.reader.ge", true);
        this.pa = conf.getBoolean("segment.reader.pa", true);
        this.pd = conf.getBoolean("segment.reader.pd", true);
        this.pt = conf.getBoolean("segment.reader.pt", true);
        try {
            this.fs = FileSystem.get(conf);
        }
        catch (IOException ioe) {
            // Only logged; the previous file system reference (if any) is kept.
            LOG.error("IOException:", ioe);
        }
    }

    /**
     * Builds a job configuration from the current configuration and records
     * the six part-selection flags in it under {@code segment.reader.*}, which
     * is where {@code configure(JobConf)} reads them back.
     *
     * @return the prepared job configuration
     */
    private JobConf createJobConf() {
        final NutchJob prepared = new NutchJob(this.getConf());
        prepared.setBoolean("segment.reader.pt", this.pt);
        prepared.setBoolean("segment.reader.pd", this.pd);
        prepared.setBoolean("segment.reader.pa", this.pa);
        prepared.setBoolean("segment.reader.ge", this.ge);
        prepared.setBoolean("segment.reader.fe", this.fe);
        prepared.setBoolean("segment.reader.co", this.co);
        return prepared;
    }

    /** Does nothing; this implementation releases no resources here. */
    public void close() {
    }

    /**
     * Renders all values collected for one URL as a single text record.
     * The record starts with a {@code Recno::} line (using and then advancing
     * the running record counter) and a {@code URL::} line, followed by one
     * headed section per recognized value. Values of any other type are
     * skipped with a warning.
     *
     * @param key      the URL
     * @param values   wrapped values recorded for that URL
     * @param output   receives the URL together with the rendered text
     * @param reporter not used
     * @throws IOException if the collector fails
     */
    public void reduce(Text key, Iterator<NutchWritable> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        final StringBuilder text = new StringBuilder();
        text.append("\nRecno:: ").append(this.recNo++).append("\n");
        text.append("URL:: ").append(key.toString()).append("\n");
        while (values.hasNext()) {
            final Writable item = values.next().get();
            final String heading;
            if (item instanceof CrawlDatum) {
                heading = "\nCrawlDatum::\n";
            } else if (item instanceof Content) {
                heading = "\nContent::\n";
            } else if (item instanceof ParseData) {
                heading = "\nParseData::\n";
            } else if (item instanceof ParseText) {
                heading = "\nParseText::\n";
            } else {
                heading = null;
            }
            if (heading != null) {
                text.append(heading).append(item.toString());
            } else if (LOG.isWarnEnabled()) {
                LOG.warn("Unrecognized type: " + item.getClass());
            }
        }
        output.collect(key, new Text(text.toString()));
    }

    /**
     * Dumps the selected parts of a segment into a single text file.
     * <p>
     * A job reads every enabled part of {@code segment}, renders the records
     * with this class as reducer, and writes part files into a random
     * temporary directory below {@code hadoop.tmp.dir}. The part files are
     * then concatenated into {@code output/<segment.dump.dir>} (default name
     * {@code dump}) with the record numbers rewritten to be consecutive.
     * <p>
     * The temporary directory is removed even when the job or the merge
     * fails, and an error while writing the merged file is reported as an
     * {@link IOException} instead of being silently dropped by the
     * {@link PrintWriter}.
     *
     * @param segment segment directory to read
     * @param output  directory that receives the dump file
     * @throws IOException if the job fails, the dump file cannot be written
     *                     completely, or a file system operation fails
     */
    public void dump(Path segment, Path output) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("SegmentReader: dump segment: " + segment);
        }
        JobConf job = this.createJobConf();
        job.setJobName("read " + segment);
        if (this.ge) {
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_generate"));
        }
        if (this.fe) {
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_fetch"));
        }
        if (this.pa) {
            FileInputFormat.addInputPath(job, new Path(segment, "crawl_parse"));
        }
        if (this.co) {
            FileInputFormat.addInputPath(job, new Path(segment, "content"));
        }
        if (this.pd) {
            FileInputFormat.addInputPath(job, new Path(segment, "parse_data"));
        }
        if (this.pt) {
            FileInputFormat.addInputPath(job, new Path(segment, "parse_text"));
        }
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(InputCompatMapper.class);
        job.setReducerClass(SegmentReader.class);
        Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-" + new Random().nextInt());
        this.fs.delete(tempDir, true);
        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NutchWritable.class);

        boolean succeeded = false;
        try {
            JobClient.runJob(job);
            Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));
            this.fs.delete(dumpFile, true);
            FileStatus[] fstats = this.fs.listStatus(tempDir, HadoopFSUtil.getPassAllFilter());
            Path[] files = HadoopFSUtil.getPaths(fstats);
            if (files != null && files.length > 0) {
                // NOTE(review): the platform default charset is used here, as it is by
                // append() and TextOutputFormat; switch all three together if UTF-8 is wanted.
                PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(this.fs.create(dumpFile))));
                try {
                    int currentRecordNumber = 0;
                    for (Path partFile : files) {
                        try {
                            currentRecordNumber = this.append(this.fs, job, partFile, writer, currentRecordNumber);
                        }
                        catch (IOException exception) {
                            // Best effort: a part that cannot be read is skipped, the rest is still merged.
                            LOG.warn("Couldn't copy the content of " + partFile + " into " + dumpFile, exception);
                        }
                    }
                }
                finally {
                    writer.close();
                }
                // PrintWriter never throws on write/close; its error flag is the only signal.
                if (writer.checkError()) {
                    throw new IOException("Error while writing dump file " + dumpFile);
                }
            }
            succeeded = true;
        }
        finally {
            if (succeeded) {
                this.fs.delete(tempDir, true);
            } else {
                // Do not let a cleanup problem hide the exception that is already propagating.
                try {
                    this.fs.delete(tempDir, true);
                }
                catch (IOException cleanupFailure) {
                    LOG.warn("Couldn't delete temporary directory " + tempDir, cleanupFailure);
                }
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("SegmentReader: done");
        }
    }

    /**
     * Copies a text file line by line to {@code writer}, replacing every line
     * that starts with {@code "Recno:: "} by a freshly numbered one.
     *
     * @param fs                  file system used to open {@code src}
     * @param conf                not used
     * @param src                 file to copy
     * @param writer              destination of the copied lines
     * @param currentRecordNumber number given to the first record line found
     * @return the number the next record line should receive
     * @throws IOException if {@code src} cannot be opened or read
     */
    private int append(FileSystem fs, Configuration conf, Path src, PrintWriter writer, int currentRecordNumber) throws IOException {
        final BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(src)));
        int nextRecord = currentRecordNumber;
        try {
            for (String text = in.readLine(); text != null; text = in.readLine()) {
                if (text.startsWith("Recno:: ")) {
                    text = "Recno:: " + nextRecord;
                    ++nextRecord;
                }
                writer.println(text);
            }
            return nextRecord;
        }
        finally {
            in.close();
        }
    }

    /**
     * Looks up all records stored for {@code key} in the enabled parts of a
     * segment and writes them to {@code writer}.
     * <p>
     * Each enabled part is searched by its own thread. The calling thread
     * waits with {@link Thread#join()} until every lookup has ended, so the
     * call returns as soon as the slowest lookup is done. Worker threads
     * store their result under a lock on {@code results}, because the caller
     * may pass a map that is not thread-safe. A lookup that fails is logged
     * and simply contributes nothing to the output.
     * <p>
     * If the calling thread is interrupted while waiting, it keeps waiting
     * for the workers and restores its interrupt status before the results
     * are written.
     *
     * @param segment segment directory to search
     * @param key     URL to look up
     * @param writer  receives the headed records; flushed after each part
     * @param results filled with the records found, keyed by the two-letter part code
     * @throws Exception if writing to {@code writer} fails
     */
    public void get(final Path segment, final Text key, Writer writer, final Map<String, List<Writable>> results) throws Exception {
        LOG.info("SegmentReader: get '" + key + "'");
        List<Thread> threads = new ArrayList<Thread>();
        if (this.co) {
            threads.add(this.newRecordFetcher(new Path(segment, "content"), "co", true, key, results));
        }
        if (this.fe) {
            threads.add(this.newRecordFetcher(new Path(segment, "crawl_fetch"), "fe", true, key, results));
        }
        if (this.ge) {
            threads.add(this.newRecordFetcher(new Path(segment, "crawl_generate"), "ge", false, key, results));
        }
        if (this.pa) {
            threads.add(this.newRecordFetcher(new Path(segment, "crawl_parse"), "pa", false, key, results));
        }
        if (this.pd) {
            threads.add(this.newRecordFetcher(new Path(segment, "parse_data"), "pd", true, key, results));
        }
        if (this.pt) {
            threads.add(this.newRecordFetcher(new Path(segment, "parse_text"), "pt", true, key, results));
        }
        for (Thread worker : threads) {
            worker.start();
        }

        boolean interrupted = false;
        for (Thread worker : threads) {
            while (true) {
                try {
                    worker.join();
                    break;
                }
                catch (InterruptedException e) {
                    // Remember the interrupt but keep waiting: the workers still write into 'results'.
                    interrupted = true;
                }
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }

        // join() guarantees that everything the workers stored is visible here.
        for (String[] part : keys) {
            List<Writable> res = results.get(part[0]);
            if (res != null) {
                for (Writable record : res) {
                    writer.write(part[1]);
                    writer.write(record + "\n");
                }
            }
            writer.flush();
        }
    }

    /**
     * Creates (but does not start) a thread that looks up {@code key} in one
     * segment part and stores the records found in {@code results}.
     *
     * @param dir       directory of the segment part
     * @param resultKey key under which the records are stored in {@code results}
     * @param mapFile   {@code true} to search with {@code getMapRecords},
     *                  {@code false} to scan with {@code getSeqRecords}
     * @param key       URL to look up
     * @param results   shared result map; only accessed while holding its monitor
     * @return the new, unstarted thread
     */
    private Thread newRecordFetcher(final Path dir, final String resultKey, final boolean mapFile, final Text key, final Map<String, List<Writable>> results) {
        return new Thread(){

            @Override
            public void run() {
                try {
                    List<Writable> found = mapFile
                        ? SegmentReader.this.getMapRecords(dir, key)
                        : SegmentReader.this.getSeqRecords(dir, key);
                    synchronized (results) {
                        results.put(resultKey, found);
                    }
                }
                catch (Exception e) {
                    LOG.error("Exception:", (Throwable)e);
                }
            }
        };
    }

    /**
     * Collects every value stored under {@code key} in the map files found in
     * {@code dir}.
     * <p>
     * For each reader the key is looked up directly; when it is present, the
     * entries that follow are read as long as they carry the same key. All
     * readers are closed before returning, also when an exception is thrown.
     * A directory without any readers yields an empty list.
     *
     * @param dir directory containing the map files
     * @param key key to look up
     * @return the values found, possibly empty
     * @throws Exception if the files are keyed by something other than
     *                   {@code org.apache.hadoop.io.Text}, or reading or
     *                   instantiating a key/value fails
     */
    private List<Writable> getMapRecords(Path dir, Text key) throws Exception {
        MapFile.Reader[] readers = MapFileOutputFormat.getReaders(this.fs, dir, this.getConf());
        List<Writable> res = new ArrayList<Writable>();
        try {
            if (readers.length == 0) {
                return res;
            }
            Class<?> keyClass = readers[0].getKeyClass();
            Class<?> valueClass = readers[0].getValueClass();
            if (!keyClass.getName().equals("org.apache.hadoop.io.Text")) {
                throw new IOException("Incompatible key (" + keyClass.getName() + ")");
            }
            Writable value = (Writable)valueClass.newInstance();
            for (MapFile.Reader reader : readers) {
                if (reader.get(key, value) == null) {
                    continue;
                }
                // Every stored value needs its own instance; readers fill the object they are given.
                res.add(value);
                value = (Writable)valueClass.newInstance();
                Text aKey = (Text)keyClass.newInstance();
                while (reader.next(aKey, value) && aKey.equals(key)) {
                    res.add(value);
                    value = (Writable)valueClass.newInstance();
                }
            }
            return res;
        }
        finally {
            for (MapFile.Reader reader : readers) {
                try {
                    reader.close();
                }
                catch (IOException closeFailure) {
                    // Read-only access: a failed close cannot lose data, so it is only reported.
                    LOG.warn("Couldn't close a reader of " + dir, closeFailure);
                }
            }
        }
    }

    /**
     * Collects every value stored under {@code key} in the sequence files
     * found in {@code dir} by scanning each file from start to end.
     * <p>
     * All readers are closed before returning, also when an exception is
     * thrown. A directory without any readers yields an empty list.
     *
     * @param dir directory containing the sequence files
     * @param key key to look for
     * @return the values found, possibly empty
     * @throws Exception if the files are keyed by something other than
     *                   {@code org.apache.hadoop.io.Text}, or reading or
     *                   instantiating a key/value fails
     */
    private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
        SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(this.getConf(), dir);
        List<Writable> res = new ArrayList<Writable>();
        try {
            if (readers.length == 0) {
                return res;
            }
            Class<?> keyClass = readers[0].getKeyClass();
            Class<?> valueClass = readers[0].getValueClass();
            if (!keyClass.getName().equals("org.apache.hadoop.io.Text")) {
                throw new IOException("Incompatible key (" + keyClass.getName() + ")");
            }
            Writable aKey = (Writable)keyClass.newInstance();
            Writable value = (Writable)valueClass.newInstance();
            for (SequenceFile.Reader reader : readers) {
                while (reader.next(aKey, value)) {
                    if (!aKey.equals(key)) {
                        continue;
                    }
                    // Keep the matched value and continue reading into a fresh instance.
                    res.add(value);
                    value = (Writable)valueClass.newInstance();
                }
            }
            return res;
        }
        finally {
            for (SequenceFile.Reader reader : readers) {
                try {
                    reader.close();
                }
                catch (IOException closeFailure) {
                    // Read-only access: a failed close cannot lose data, so it is only reported.
                    LOG.warn("Couldn't close a reader of " + dir, closeFailure);
                }
            }
        }
    }

    /**
     * Writes a tab-separated synopsis of the given segments: a header line
     * followed by one line per segment with its name, generated count, fetch
     * start and end time, fetched count and parsed count. A statistic that
     * still holds {@code -1} is shown as {@code ?}. The writer is flushed
     * after each segment line.
     *
     * @param dirs   segment directories to summarize
     * @param writer destination of the table
     * @throws Exception if collecting the statistics or writing fails
     */
    public void list(List<Path> dirs, Writer writer) throws Exception {
        writer.write("NAME\t\tGENERATED\tFETCHER START\t\tFETCHER END\t\tFETCHED\tPARSED\n");
        for (Path dir : dirs) {
            final SegmentReaderStats stats = new SegmentReaderStats();
            this.getStats(dir, stats);
            writer.write(dir.getName() + "\t");
            writer.write(stats.generated == -1L ? "?" : stats.generated + "");
            writer.write("\t\t");
            // The unknown start time carries an extra tab to fill the wider date column.
            writer.write(stats.start == -1L ? "?\t" : this.sdf.format(new Date(stats.start)));
            writer.write("\t");
            writer.write(stats.end == -1L ? "?" : this.sdf.format(new Date(stats.end)));
            writer.write("\t");
            writer.write(stats.fetched == -1L ? "?" : stats.fetched + "");
            writer.write("\t");
            writer.write(stats.parsed == -1L ? "?" : stats.parsed + "");
            writer.write("\n");
            writer.flush();
        }
    }

    /**
     * Fills {@code stats} with counts gathered from a segment.
     * <ul>
     * <li>{@code generated}: number of entries in {@code crawl_generate}.</li>
     * <li>{@code fetched}, {@code start}, {@code end}: number of entries in
     *     {@code crawl_fetch} and the smallest and largest fetch time among
     *     them. Only set when that directory exists; {@code start} and
     *     {@code end} are left untouched when it contains no entries, so an
     *     empty part does not publish the extreme {@code long} values used as
     *     initial bounds.</li>
     * <li>{@code parsed}, {@code parseErrors}: number of entries in
     *     {@code parse_data} and how many of them have an unsuccessful
     *     status. Only set when that directory exists.</li>
     * </ul>
     * All readers are closed even when reading fails.
     *
     * @param segment segment directory to inspect
     * @param stats   receives the gathered numbers
     * @throws Exception if opening or reading a segment part fails
     */
    public void getStats(Path segment, SegmentReaderStats stats) throws Exception {
        Text key = new Text();
        long cnt = 0L;
        SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(this.getConf(), new Path(segment, "crawl_generate"));
        try {
            for (SequenceFile.Reader reader : readers) {
                while (reader.next(key)) {
                    ++cnt;
                }
            }
        }
        finally {
            SegmentReader.closeStatsReaders(readers);
        }
        stats.generated = cnt;

        Path fetchDir = new Path(segment, "crawl_fetch");
        if (this.fs.exists(fetchDir) && this.fs.getFileStatus(fetchDir).isDir()) {
            cnt = 0L;
            long start = Long.MAX_VALUE;
            long end = Long.MIN_VALUE;
            CrawlDatum value = new CrawlDatum();
            MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(this.fs, fetchDir, this.getConf());
            try {
                for (MapFile.Reader reader : mreaders) {
                    while (reader.next(key, value)) {
                        ++cnt;
                        long fetchTime = value.getFetchTime();
                        start = Math.min(start, fetchTime);
                        end = Math.max(end, fetchTime);
                    }
                }
            }
            finally {
                SegmentReader.closeStatsReaders(mreaders);
            }
            if (cnt > 0L) {
                // Without entries the bounds are still the initial extremes and carry no information.
                stats.start = start;
                stats.end = end;
            }
            stats.fetched = cnt;
        }

        Path parseDir = new Path(segment, "parse_data");
        if (this.fs.exists(parseDir) && this.fs.getFileStatus(parseDir).isDir()) {
            cnt = 0L;
            long errors = 0L;
            ParseData value = new ParseData();
            MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(this.fs, parseDir, this.getConf());
            try {
                for (MapFile.Reader reader : mreaders) {
                    while (reader.next(key, value)) {
                        ++cnt;
                        if (!value.getStatus().isSuccess()) {
                            ++errors;
                        }
                    }
                }
            }
            finally {
                SegmentReader.closeStatsReaders(mreaders);
            }
            stats.parsed = cnt;
            stats.parseErrors = errors;
        }
    }

    /** Closes every sequence-file reader; a failing close is logged and does not stop the others. */
    private static void closeStatsReaders(SequenceFile.Reader[] readers) {
        for (SequenceFile.Reader reader : readers) {
            try {
                reader.close();
            }
            catch (IOException closeFailure) {
                LOG.warn("Couldn't close a segment reader", closeFailure);
            }
        }
    }

    /** Closes every map-file reader; a failing close is logged and does not stop the others. */
    private static void closeStatsReaders(MapFile.Reader[] readers) {
        for (MapFile.Reader reader : readers) {
            try {
                reader.close();
            }
            catch (IOException closeFailure) {
                LOG.warn("Couldn't close a segment reader", closeFailure);
            }
        }
    }

    /**
     * Command-line entry point. The first argument selects the operation
     * ({@code -dump}, {@code -list} or {@code -get}); the {@code -no...}
     * switches may appear anywhere after it and disable individual segment
     * parts. Missing or invalid arguments print a message and the usage text
     * to {@code System.err}.
     * <p>
     * An unknown operation is rejected before any configuration or file
     * system is created, and {@code -list -dir} without a following
     * directory is reported as a missing argument instead of failing with an
     * array index or null path error.
     *
     * @param args command-line arguments
     * @throws Exception if the selected operation fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            SegmentReader.usage();
            return;
        }
        final int mode;
        if ("-dump".equals(args[0])) {
            mode = MODE_DUMP;
        } else if ("-list".equals(args[0])) {
            mode = MODE_LIST;
        } else if ("-get".equals(args[0])) {
            mode = MODE_GET;
        } else {
            System.err.println("Invalid operation: " + args[0]);
            SegmentReader.usage();
            return;
        }

        boolean co = true;
        boolean fe = true;
        boolean ge = true;
        boolean pa = true;
        boolean pd = true;
        boolean pt = true;
        // Consume the general options; a consumed slot is set to null so the
        // positional handling below can recognize and skip it.
        for (int i = 1; i < args.length; ++i) {
            if (args[i].equals("-nocontent")) {
                co = false;
            } else if (args[i].equals("-nofetch")) {
                fe = false;
            } else if (args[i].equals("-nogenerate")) {
                ge = false;
            } else if (args[i].equals("-noparse")) {
                pa = false;
            } else if (args[i].equals("-noparsedata")) {
                pd = false;
            } else if (args[i].equals("-noparsetext")) {
                pt = false;
            } else {
                continue;
            }
            args[i] = null;
        }

        Configuration conf = NutchConfiguration.create();
        FileSystem fs = FileSystem.get(conf);
        SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);
        switch (mode) {
            case MODE_DUMP: {
                String input = args[1];
                if (input == null) {
                    System.err.println("Missing required argument: <segment_dir>");
                    SegmentReader.usage();
                    return;
                }
                String output = args.length > 2 ? args[2] : null;
                if (output == null) {
                    System.err.println("Missing required argument: <output>");
                    SegmentReader.usage();
                    return;
                }
                segmentReader.dump(new Path(input), new Path(output));
                return;
            }
            case MODE_LIST: {
                List<Path> dirs = new ArrayList<Path>();
                for (int i = 1; i < args.length; ++i) {
                    if (args[i] == null) {
                        continue;
                    }
                    if (!args[i].equals("-dir")) {
                        dirs.add(new Path(args[i]));
                        continue;
                    }
                    // "-dir" needs a following directory that was not consumed as an option.
                    ++i;
                    if (i >= args.length || args[i] == null) {
                        System.err.println("Missing required argument: <segments>");
                        SegmentReader.usage();
                        return;
                    }
                    Path dir = new Path(args[i]);
                    FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
                    Path[] files = HadoopFSUtil.getPaths(fstats);
                    if (files != null && files.length > 0) {
                        dirs.addAll(Arrays.asList(files));
                    }
                }
                segmentReader.list(dirs, new OutputStreamWriter((OutputStream)System.out, "UTF-8"));
                return;
            }
            case MODE_GET:
            default: {
                String input = args[1];
                if (input == null) {
                    System.err.println("Missing required argument: <segment_dir>");
                    SegmentReader.usage();
                    return;
                }
                String key = args.length > 2 ? args[2] : null;
                if (key == null) {
                    System.err.println("Missing required argument: <keyValue>");
                    SegmentReader.usage();
                    return;
                }
                segmentReader.get(new Path(input), new Text(key), new OutputStreamWriter((OutputStream)System.out, "UTF-8"), new HashMap<String, List<Writable>>());
                return;
            }
        }
    }

    /**
     * Prints the command-line help to {@code System.err}. Each array element
     * is printed as one line; an empty element produces a blank line.
     */
    private static void usage() {
        final String[] help = {
            "Usage: SegmentReader (-dump ... | -list ... | -get ...) [general options]\n",
            "* General options:",
            "\t-nocontent\tignore content directory",
            "\t-nofetch\tignore crawl_fetch directory",
            "\t-nogenerate\tignore crawl_generate directory",
            "\t-noparse\tignore crawl_parse directory",
            "\t-noparsedata\tignore parse_data directory",
            "\t-noparsetext\tignore parse_text directory",
            "",
            "* SegmentReader -dump <segment_dir> <output> [general options]",
            "  Dumps content of a <segment_dir> as a text file to <output>.\n",
            "\t<segment_dir>\tname of the segment directory.",
            "\t<output>\tname of the (non-existent) output directory.",
            "",
            "* SegmentReader -list (<segment_dir1> ... | -dir <segments>) [general options]",
            "  List a synopsis of segments in specified directories, or all segments in",
            "  a directory <segments>, and print it on System.out\n",
            "\t<segment_dir1> ...\tlist of segment directories to process",
            "\t-dir <segments>\t\tdirectory that contains multiple segments",
            "",
            "* SegmentReader -get <segment_dir> <keyValue> [general options]",
            "  Get a specified record from a segment, and print it on System.out.\n",
            "\t<segment_dir>\tname of the segment directory.",
            "\t<keyValue>\tvalue of the key (url).",
            "\t\tNote: put double-quotes around strings with spaces.",
        };
        for (String text : help) {
            System.err.println(text);
        }
    }

    /**
     * Plain holder for the numbers gathered by {@code getStats}. Every field
     * starts at {@code -1}, which {@code list} prints as {@code ?}.
     */
    public static class SegmentReaderStats {
        /** Smallest fetch time seen in {@code crawl_fetch}, as returned by {@code CrawlDatum.getFetchTime()}. */
        public long start = -1L;
        /** Largest fetch time seen in {@code crawl_fetch}, as returned by {@code CrawlDatum.getFetchTime()}. */
        public long end = -1L;
        /** Number of entries read from {@code crawl_generate}. */
        public long generated = -1L;
        /** Number of entries read from {@code crawl_fetch}. */
        public long fetched = -1L;
        /** NOTE(review): never assigned in the visible code; keeps its initial value. */
        public long fetchErrors = -1L;
        /** Number of entries read from {@code parse_data}. */
        public long parsed = -1L;
        /** Number of {@code parse_data} entries whose status is not a success. */
        public long parseErrors = -1L;
    }

    /**
     * Output format that writes only the values, one per line via
     * {@link PrintStream#println(Object)}, into a file named after the task
     * output inside the job's output directory. Keys are ignored.
     * <p>
     * {@link PrintStream} never throws on a failed write; it only records the
     * failure. The record writer therefore checks that error state when it is
     * closed and reports it as an {@link IOException}, so a part file that
     * could not be written completely fails the task instead of passing as a
     * silently truncated dump.
     */
    public static class TextOutputFormat
    extends FileOutputFormat<WritableComparable<?>, Writable> {
        /**
         * Opens the output file (replacing an existing one) and returns a
         * writer that prints each value on its own line.
         *
         * @param fs       file system on which the file is created
         * @param job      job configuration providing the output directory
         * @param name     file name inside the output directory
         * @param progress not used
         * @return the record writer for that file
         * @throws IOException if the file cannot be created
         */
        public RecordWriter<WritableComparable<?>, Writable> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException {
            final Path segmentDumpFile = new Path(FileOutputFormat.getOutputPath((JobConf)job), name);
            if (fs.exists(segmentDumpFile)) {
                fs.delete(segmentDumpFile, true);
            }
            final PrintStream printStream = new PrintStream((OutputStream)fs.create(segmentDumpFile));
            return new RecordWriter<WritableComparable<?>, Writable>(){

                public synchronized void write(WritableComparable<?> key, Writable value) throws IOException {
                    printStream.println(value);
                }

                public synchronized void close(Reporter reporter) throws IOException {
                    printStream.close();
                    // After close() this only reads the recorded error state (writes and close included).
                    if (printStream.checkError()) {
                        throw new IOException("Error while writing segment dump part " + segmentDumpFile);
                    }
                }
            };
        }
    }

    /**
     * Mapper used by the dump job: wraps every input value in a
     * {@code NutchWritable} so values of different types can be handled by
     * one reducer.
     */
    public static class InputCompatMapper
    extends MapReduceBase
    implements Mapper<WritableComparable<?>, Writable, Text, NutchWritable> {
        // Reused for every Text key so that no new Text is allocated per record.
        private Text newKey = new Text();

        /**
         * Emits the key together with the wrapped value.
         *
         * @param key       input key
         * @param value     input value, wrapped before it is emitted
         * @param collector receives the key and the wrapped value
         * @param reporter  not used
         * @throws IOException if the collector fails
         */
        public void map(WritableComparable<?> key, Writable value, OutputCollector<Text, NutchWritable> collector, Reporter reporter) throws IOException {
            // A Text key is copied (via its string form) into the reusable instance,
            // which is then emitted instead of the caller's key object.
            if (key instanceof Text) {
                this.newKey.set(key.toString());
                key = this.newKey;
            }
            // NOTE(review): a key of any other type reaches the collector unchanged;
            // the decompiled (Object) casts hide how the original source typed it - confirm.
            collector.collect((Object)key, (Object)new NutchWritable(value));
        }
    }
}

