/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.segment;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.metadata.MetaWrapper;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.segment.SegmentMergeFilters;
import org.apache.nutch.segment.SegmentPart;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SegmentMerger
extends Configured
implements Mapper<Text, MetaWrapper, Text, MetaWrapper>,
Reducer<Text, MetaWrapper, Text, MetaWrapper> {
    private static final Logger LOG = LoggerFactory.getLogger(SegmentMerger.class);
    /** Metadata key under which each record carries its serialized {@link SegmentPart}. */
    private static final String SEGMENT_PART_KEY = "part";
    /** Metadata key under which the reducer records the output slice name. */
    private static final String SEGMENT_SLICE_KEY = "slice";

    /*
     * The fields assigned by setConf() intentionally have NO explicit
     * initializers. Field initializers execute after the superclass constructor
     * returns, so an explicit "= null" / "= -1L" here would overwrite whatever
     * setConf() assigned while the superclass constructor was running.
     * NOTE(review): this relies on Hadoop's Configured(Configuration)
     * constructor calling setConf(conf) - confirm for the Hadoop version in use.
     * The implicit defaults (null / 0) are equivalent for every check in this
     * class: the filters are tested against null and sliceSize against "> 0".
     */
    /** URL filters applied in map(); null when filtering is disabled. */
    private URLFilters filters;
    /** URL normalizers applied in map(); null when normalization is disabled. */
    private URLNormalizers normalizers;
    /** Merge filters consulted in reduce(); null when filtering is disabled. */
    private SegmentMergeFilters mergeFilters;
    /** URLs per output slice; values &lt;= 0 mean the output is not sliced. */
    private long sliceSize;
    /** Number of keys this instance has emitted from reduce() so far. */
    private long curCount;
    /** Reusable output key for map(); not touched by setConf(), so it is initialized here. */
    private Text newKey = new Text();

    /**
     * Creates a merger without a configuration; a {@code null} configuration
     * is handed to the superclass constructor.
     */
    public SegmentMerger() {
        this((Configuration)null);
    }

    /**
     * Creates a merger bound to the given configuration.
     *
     * @param conf configuration passed to the superclass constructor; may be {@code null}
     */
    public SegmentMerger(Configuration conf) {
        super(conf);
    }

    /**
     * Stores the configuration and derives the merger settings from it.
     * A {@code null} configuration is stored but otherwise ignored.
     *
     * <ul>
     * <li>{@code segment.merger.filter} (default false): create URL filters and merge filters</li>
     * <li>{@code segment.merger.normalizer} (default false): create URL normalizers for the "default" scope</li>
     * <li>{@code segment.merger.slice} (default -1): slice size in URLs</li>
     * </ul>
     *
     * @param conf the configuration to apply, or {@code null}
     */
    public void setConf(Configuration conf) {
        super.setConf(conf);
        if (conf != null) {
            boolean filteringEnabled = conf.getBoolean("segment.merger.filter", false);
            if (filteringEnabled) {
                this.filters = new URLFilters(conf);
                this.mergeFilters = new SegmentMergeFilters(conf);
            }
            boolean normalizingEnabled = conf.getBoolean("segment.merger.normalizer", false);
            if (normalizingEnabled) {
                this.normalizers = new URLNormalizers(conf, "default");
            }
            this.sliceSize = conf.getLong("segment.merger.slice", -1L);
            // Only announce the slice size when slicing is actually requested.
            if (this.sliceSize > 0L) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("Slice size: " + this.sliceSize + " URLs.");
                }
            }
        }
    }

    /**
     * No-op: this class acquires nothing in {@code configure}/{@code setConf}
     * that would need to be released here.
     *
     * @throws IOException never thrown by this implementation
     */
    public void close() throws IOException {
    }

    /**
     * Applies the job configuration and converts the job-wide slice size into
     * a per-reduce-task slice size by dividing it by the number of reduce tasks.
     *
     * <p>The division is skipped when the job reports no reduce tasks; the
     * unguarded division would otherwise throw {@link ArithmeticException}
     * while the task is being configured.
     *
     * @param conf the job configuration
     */
    public void configure(JobConf conf) {
        this.setConf((Configuration)conf);
        int reduceTasks = conf.getNumReduceTasks();
        if (this.sliceSize > 0L && reduceTasks > 0) {
            long requested = this.sliceSize;
            this.sliceSize = requested / (long)reduceTasks;
            // Integer division can round the per-task size down to 0, and
            // reduce() only slices when sliceSize > 0, so say so explicitly.
            if (this.sliceSize == 0L && LOG.isWarnEnabled()) {
                LOG.warn("Slice size " + requested + " is smaller than the number of reduce tasks (" + reduceTasks + "); output will not be sliced.");
            }
        }
    }

    /**
     * Optionally normalizes and filters the record's URL and emits the value
     * under the resulting URL. The record is dropped when normalization or
     * filtering throws, or when either step yields {@code null}.
     *
     * @param key      the record's URL
     * @param value    the wrapped segment record, passed through unchanged
     * @param output   collector receiving the (possibly rewritten) key and the value
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    public void map(Text key, MetaWrapper value, OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
        String candidate = key.toString();

        if (this.normalizers != null) {
            try {
                candidate = this.normalizers.normalize(candidate, "default");
            }
            catch (Exception e) {
                LOG.warn("Skipping " + candidate + ":" + e.getMessage());
                return;
            }
        }
        if (candidate == null) {
            return;
        }

        if (this.filters != null) {
            try {
                candidate = this.filters.filter(candidate);
            }
            catch (Exception e) {
                LOG.warn("Skipping key " + candidate + ": " + e.getMessage());
                return;
            }
        }
        if (candidate == null) {
            return;
        }

        // The key object is reused across calls; the collector receives it right away.
        this.newKey.set(candidate);
        output.collect(this.newKey, value);
    }

    /**
     * Collapses all records for one URL into at most one record per segment part.
     *
     * <p>For every part the record coming from the segment whose name compares
     * greatest (by {@link String#compareTo}) wins; on equal names the first
     * record seen is kept. Non-signature {@code crawl_parse} records are
     * grouped per segment and only the group of the greatest segment name is
     * emitted. If merge filters are configured and reject the selection,
     * nothing is emitted and the key is not counted.
     *
     * @param key      the URL
     * @param values   all wrapped records for the URL, each tagged with its segment part
     * @param output   collector receiving the selected records
     * @param reporter unused
     * @throws IOException if a record has no segment part metadata, if a
     *                     {@code CrawlDatum} comes from an unrecognized part,
     *                     or if the collector fails
     */
    public void reduce(Text key, Iterator<MetaWrapper> values, OutputCollector<Text, MetaWrapper> output, Reporter reporter) throws IOException {
        CrawlDatum bestGenerate = null;
        String bestGenerateSeg = null;
        CrawlDatum bestFetch = null;
        String bestFetchSeg = null;
        CrawlDatum bestSignature = null;
        String bestSignatureSeg = null;
        Content bestContent = null;
        String bestContentSeg = null;
        ParseData bestParseData = null;
        String bestParseDataSeg = null;
        ParseText bestParseText = null;
        String bestParseTextSeg = null;
        TreeMap<String, ArrayList<CrawlDatum>> linkedBySegment = new TreeMap<String, ArrayList<CrawlDatum>>();

        while (values.hasNext()) {
            MetaWrapper incoming = values.next();
            Writable payload = incoming.get();
            String partSpec = incoming.getMeta(SEGMENT_PART_KEY);
            if (partSpec == null) {
                throw new IOException("Null segment part, key=" + key);
            }
            SegmentPart part = SegmentPart.parse(partSpec);

            if (payload instanceof CrawlDatum) {
                CrawlDatum datum = (CrawlDatum)payload;
                if (part.partName.equals("crawl_generate")) {
                    if (bestGenerate == null || bestGenerateSeg.compareTo(part.segmentName) < 0) {
                        bestGenerate = datum;
                        bestGenerateSeg = part.segmentName;
                    }
                } else if (part.partName.equals("crawl_fetch")) {
                    // Only real fetch statuses count; codes 34 and 38 are ignored
                    // (presumably the fetch-retry / not-modified statuses - verify against CrawlDatum).
                    boolean usable = CrawlDatum.hasFetchStatus(datum) && datum.getStatus() != 34 && datum.getStatus() != 38;
                    if (usable && (bestFetch == null || bestFetchSeg.compareTo(part.segmentName) < 0)) {
                        bestFetch = datum;
                        bestFetchSeg = part.segmentName;
                    }
                } else if (part.partName.equals("crawl_parse")) {
                    // Status 65 is tracked separately as the signature record
                    // (presumably CrawlDatum's signature status - verify).
                    if (datum.getStatus() == 65) {
                        if (bestSignature == null || bestSignatureSeg.compareTo(part.segmentName) < 0) {
                            bestSignature = datum;
                            bestSignatureSeg = part.segmentName;
                        }
                    } else {
                        ArrayList<CrawlDatum> bucket = linkedBySegment.get(part.segmentName);
                        if (bucket == null) {
                            bucket = new ArrayList<CrawlDatum>();
                            linkedBySegment.put(part.segmentName, bucket);
                        }
                        bucket.add(datum);
                    }
                } else {
                    throw new IOException("Cannot determine segment part: " + part.partName);
                }
            } else if (payload instanceof Content) {
                if (bestContent == null || bestContentSeg.compareTo(part.segmentName) < 0) {
                    bestContent = (Content)payload;
                    bestContentSeg = part.segmentName;
                }
            } else if (payload instanceof ParseData) {
                if (bestParseData == null || bestParseDataSeg.compareTo(part.segmentName) < 0) {
                    bestParseData = (ParseData)payload;
                    bestParseDataSeg = part.segmentName;
                }
            } else if (payload instanceof ParseText) {
                if (bestParseText == null || bestParseTextSeg.compareTo(part.segmentName) < 0) {
                    bestParseText = (ParseText)payload;
                    bestParseTextSeg = part.segmentName;
                }
            }
            // Any other payload type is silently ignored.
        }

        // Linked records of the segment with the greatest name, if there are any.
        ArrayList<CrawlDatum> newestLinked = linkedBySegment.isEmpty() ? null : linkedBySegment.lastEntry().getValue();
        if (this.mergeFilters != null) {
            boolean keep = this.mergeFilters.filter(key, bestGenerate, bestFetch, bestSignature, bestContent, bestParseData, bestParseText, newestLinked);
            if (!keep) {
                return;
            }
        }

        // The counter is advanced before the slice index is computed.
        ++this.curCount;
        MetaWrapper outWrapper = new MetaWrapper();
        if (this.sliceSize > 0L) {
            outWrapper.setMeta(SEGMENT_SLICE_KEY, String.valueOf(this.curCount / this.sliceSize));
        }

        // One wrapper and one SegmentPart are reused for every emitted record.
        SegmentPart outPart = new SegmentPart();
        this.emitPart(key, outWrapper, outPart, bestGenerate, "crawl_generate", bestGenerateSeg, output);
        this.emitPart(key, outWrapper, outPart, bestFetch, "crawl_fetch", bestFetchSeg, output);
        this.emitPart(key, outWrapper, outPart, bestSignature, "crawl_parse", bestSignatureSeg, output);
        this.emitPart(key, outWrapper, outPart, bestContent, "content", bestContentSeg, output);
        this.emitPart(key, outWrapper, outPart, bestParseData, "parse_data", bestParseDataSeg, output);
        this.emitPart(key, outWrapper, outPart, bestParseText, "parse_text", bestParseTextSeg, output);

        if (linkedBySegment.size() > 0) {
            outPart.partName = "crawl_parse";
            outPart.segmentName = linkedBySegment.lastKey();
            outWrapper.setMeta(SEGMENT_PART_KEY, outPart.toString());
            for (CrawlDatum link : newestLinked) {
                outWrapper.set(link);
                output.collect(key, outWrapper);
            }
        }
    }

    /**
     * Emits one selected record tagged with its part and segment name;
     * does nothing when {@code record} is {@code null}.
     */
    private void emitPart(Text key, MetaWrapper wrapper, SegmentPart part, Writable record, String partName, String segmentName, OutputCollector<Text, MetaWrapper> output) throws IOException {
        if (record == null) {
            return;
        }
        wrapper.set(record);
        part.partName = partName;
        part.segmentName = segmentName;
        wrapper.setMeta(SEGMENT_PART_KEY, part.toString());
        output.collect(key, wrapper);
    }

    /**
     * Configures and runs the job that merges the given segments into a newly
     * named segment (or several slices of it) below {@code out}.
     *
     * <p>Input directories that do not exist are skipped with a warning. A
     * segment part (content, crawl_generate, ...) is only used as input when
     * it exists in <em>every</em> remaining input segment.
     *
     * <p>The {@code segs} array is not modified. {@code null} entries in it are
     * ignored.
     *
     * @param out       parent directory for the output segment(s)
     * @param segs      candidate input segment directories
     * @param filter    value stored under {@code segment.merger.filter}
     * @param normalize value stored under {@code segment.merger.normalizer}
     * @param slice     value stored under {@code segment.merger.slice}
     * @throws Exception if file system access or the job fails
     */
    public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
        String segmentName = Generator.generateSegmentName();
        if (LOG.isInfoEnabled()) {
            LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
        }
        NutchJob job = new NutchJob(this.getConf());
        job.setJobName("mergesegs " + out + "/" + segmentName);
        job.setBoolean("segment.merger.filter", filter);
        job.setBoolean("segment.merger.normalizer", normalize);
        job.setLong("segment.merger.slice", slice);
        job.set("segment.merger.segmentName", segmentName);
        FileSystem fs = FileSystem.get(this.getConf());

        // Collect usable segments locally instead of nulling out entries of the
        // caller's array.
        ArrayList<Path> inputs = new ArrayList<Path>(segs.length);
        boolean g = true;
        boolean f = true;
        boolean p = true;
        boolean c = true;
        boolean pd = true;
        boolean pt = true;
        for (Path seg : segs) {
            if (seg == null) {
                continue;
            }
            if (!fs.exists(seg)) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Input dir " + seg + " doesn't exist, skipping.");
                }
                continue;
            }
            if (LOG.isInfoEnabled()) {
                LOG.info("SegmentMerger:   adding " + seg);
            }
            inputs.add(seg);
            // A part stays enabled only while every segment seen so far has it.
            c = c && fs.exists(new Path(seg, "content"));
            g = g && fs.exists(new Path(seg, "crawl_generate"));
            f = f && fs.exists(new Path(seg, "crawl_fetch"));
            p = p && fs.exists(new Path(seg, "crawl_parse"));
            pd = pd && fs.exists(new Path(seg, "parse_data"));
            pt = pt && fs.exists(new Path(seg, "parse_text"));
        }

        StringBuilder sb = new StringBuilder();
        if (c) {
            sb.append(" content");
        }
        if (g) {
            sb.append(" crawl_generate");
        }
        if (f) {
            sb.append(" crawl_fetch");
        }
        if (p) {
            sb.append(" crawl_parse");
        }
        if (pd) {
            sb.append(" parse_data");
        }
        if (pt) {
            sb.append(" parse_text");
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("SegmentMerger: using segment data from:" + sb.toString());
        }

        for (Path seg : inputs) {
            if (g) {
                FileInputFormat.addInputPath(job, new Path(seg, "crawl_generate"));
            }
            if (c) {
                FileInputFormat.addInputPath(job, new Path(seg, "content"));
            }
            if (f) {
                FileInputFormat.addInputPath(job, new Path(seg, "crawl_fetch"));
            }
            if (p) {
                FileInputFormat.addInputPath(job, new Path(seg, "crawl_parse"));
            }
            if (pd) {
                FileInputFormat.addInputPath(job, new Path(seg, "parse_data"));
            }
            if (pt) {
                FileInputFormat.addInputPath(job, new Path(seg, "parse_text"));
            }
        }

        job.setInputFormat(ObjectInputFormat.class);
        job.setMapperClass(SegmentMerger.class);
        job.setReducerClass(SegmentMerger.class);
        FileOutputFormat.setOutputPath(job, out);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(MetaWrapper.class);
        job.setOutputFormat(SegmentOutputFormat.class);
        // This instance adopts the job configuration before the job is submitted.
        this.setConf(job);
        JobClient.runJob(job);
    }

    /**
     * Command-line entry point.
     *
     * <pre>
     * SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-normalize] [-slice NNNN]
     * </pre>
     *
     * Prints usage and returns when fewer than two arguments are given, when
     * {@code -dir} or {@code -slice} lacks its value, or when the slice size
     * is not a number. Prints an error and returns when no input segments
     * were named.
     *
     * @param args command-line arguments
     * @throws Exception if file system access or the merge job fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            SegmentMerger.printUsage();
            return;
        }
        Configuration conf = NutchConfiguration.create();
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(args[0]);
        ArrayList<Path> segs = new ArrayList<Path>();
        long sliceSize = 0L;
        boolean filter = false;
        boolean normalize = false;
        for (int i = 1; i < args.length; ++i) {
            String arg = args[i];
            if (arg.equals("-dir")) {
                if (i + 1 >= args.length) {
                    System.err.println("ERROR: -dir requires a directory argument.");
                    SegmentMerger.printUsage();
                    return;
                }
                FileStatus[] fstats = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
                Path[] files = HadoopFSUtil.getPaths(fstats);
                // Defensive: the listing helpers may yield null for a missing
                // directory (TODO confirm); treat that as "no segments found".
                if (files != null) {
                    for (Path file : files) {
                        segs.add(file);
                    }
                }
            } else if (arg.equals("-filter")) {
                filter = true;
            } else if (arg.equals("-normalize")) {
                normalize = true;
            } else if (arg.equals("-slice")) {
                if (i + 1 >= args.length) {
                    System.err.println("ERROR: -slice requires a numeric argument.");
                    SegmentMerger.printUsage();
                    return;
                }
                String sliceArg = args[++i];
                try {
                    sliceSize = Long.parseLong(sliceArg);
                }
                catch (NumberFormatException e) {
                    System.err.println("ERROR: invalid slice size: " + sliceArg);
                    SegmentMerger.printUsage();
                    return;
                }
            } else {
                segs.add(new Path(arg));
            }
        }
        if (segs.size() == 0) {
            System.err.println("ERROR: No input segments.");
            return;
        }
        SegmentMerger merger = new SegmentMerger(conf);
        merger.merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, sliceSize);
    }

    /** Prints the command-line synopsis and option descriptions to standard error. */
    private static void printUsage() {
        System.err.println("SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-normalize] [-slice NNNN]");
        System.err.println("\toutput_dir\tname of the parent dir for output segment slice(s)");
        System.err.println("\t-dir segments\tparent dir containing several segments");
        System.err.println("\tseg1 seg2 ...\tlist of segment dirs");
        System.err.println("\t-filter\t\tfilter out URL-s prohibited by current URLFilters");
        System.err.println("\t-normalize\t\tnormalize URL via current URLNormalizers");
        System.err.println("\t-slice NNNN\tcreate many output segments, each containing NNNN URLs");
    }

    /**
     * Output format that routes each merged record to the file matching its
     * segment part and slice. Writers are created lazily, one per
     * (slice, part directory) pair, below
     * {@code <output>/<segmentName>[-<slice>]/<part>/<name>}.
     */
    public static class SegmentOutputFormat
    extends FileOutputFormat<Text, MetaWrapper> {
        /** Cache-key prefix used for records that carry no slice metadata. */
        private static final String DEFAULT_SLICE = "default";

        /**
         * Creates the record writer for one output task.
         *
         * @param fs       file system the part files are written to
         * @param job      job configuration; supplies the output path, the
         *                 segment name and the compression type
         * @param name     file name used inside each part directory
         * @param progress progress callback handed to the underlying writers
         * @return a writer dispatching on the wrapped value's type and part name
         * @throws IOException declared by the interface
         */
        public RecordWriter<Text, MetaWrapper> getRecordWriter(final FileSystem fs, final JobConf job, final String name, final Progressable progress) throws IOException {
            return new RecordWriter<Text, MetaWrapper>(){
                /** Open writers keyed by slice name + part directory name. */
                private final HashMap<String, Closeable> sliceWriters = new HashMap<String, Closeable>();
                private final String segmentName = job.get("segment.merger.segmentName");

                /**
                 * Appends the wrapped value to the writer for its part and slice.
                 * Values of an unrecognized type are ignored; a CrawlDatum from an
                 * unrecognized part is an error.
                 */
                public void write(Text key, MetaWrapper wrapper) throws IOException {
                    SegmentPart sp = SegmentPart.parse(wrapper.getMeta(SegmentMerger.SEGMENT_PART_KEY));
                    Writable o = wrapper.get();
                    String slice = wrapper.getMeta(SegmentMerger.SEGMENT_SLICE_KEY);
                    if (o instanceof CrawlDatum) {
                        if (sp.partName.equals("crawl_generate")) {
                            this.ensureSequenceFile(slice, "crawl_generate").append(key, o);
                        } else if (sp.partName.equals("crawl_fetch")) {
                            this.ensureMapFile(slice, "crawl_fetch", CrawlDatum.class).append(key, o);
                        } else if (sp.partName.equals("crawl_parse")) {
                            this.ensureSequenceFile(slice, "crawl_parse").append(key, o);
                        } else {
                            throw new IOException("Cannot determine segment part: " + sp.partName);
                        }
                    } else if (o instanceof Content) {
                        this.ensureMapFile(slice, "content", Content.class).append(key, o);
                    } else if (o instanceof ParseData) {
                        // Stamp the parse data with the name of the segment it is written to.
                        String targetSegment = slice == null ? this.segmentName : this.segmentName + "-" + slice;
                        ((ParseData)o).getContentMeta().set("nutch.segment.name", targetSegment);
                        this.ensureMapFile(slice, "parse_data", ParseData.class).append(key, o);
                    } else if (o instanceof ParseText) {
                        this.ensureMapFile(slice, "parse_text", ParseText.class).append(key, o);
                    }
                }

                /** Cache key for the writer of the given slice (may be null) and part directory. */
                private String writerKey(String slice, String dirName) {
                    return (slice == null ? SegmentOutputFormat.DEFAULT_SLICE : slice) + dirName;
                }

                /**
                 * Output file for the given slice and part directory. Records without
                 * slice metadata go to the plain segment directory. The decision is
                 * made on the null check itself rather than by comparing string
                 * references.
                 */
                private Path writerPath(String slice, String dirName) {
                    String segmentDir = slice == null ? this.segmentName : this.segmentName + "-" + slice;
                    Path out = FileOutputFormat.getOutputPath(job);
                    return new Path(new Path(new Path(out, segmentDir), dirName), name);
                }

                /** Returns the cached sequence-file writer for the slice/part, creating it on first use. */
                private SequenceFile.Writer ensureSequenceFile(String slice, String dirName) throws IOException {
                    String cacheKey = this.writerKey(slice, dirName);
                    SequenceFile.Writer res = (SequenceFile.Writer)this.sliceWriters.get(cacheKey);
                    if (res != null) {
                        return res;
                    }
                    Path wname = this.writerPath(slice, dirName);
                    res = SequenceFile.createWriter(fs, (Configuration)job, wname, Text.class, CrawlDatum.class, SequenceFileOutputFormat.getOutputCompressionType(job), progress);
                    this.sliceWriters.put(cacheKey, res);
                    return res;
                }

                /** Returns the cached map-file writer for the slice/part, creating it on first use. */
                private MapFile.Writer ensureMapFile(String slice, String dirName, Class<? extends Writable> clazz) throws IOException {
                    String cacheKey = this.writerKey(slice, dirName);
                    MapFile.Writer res = (MapFile.Writer)this.sliceWriters.get(cacheKey);
                    if (res != null) {
                        return res;
                    }
                    Path wname = this.writerPath(slice, dirName);
                    SequenceFile.CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
                    // Parse text always uses record compression, whatever the job asks for.
                    if (clazz.isAssignableFrom(ParseText.class)) {
                        compType = SequenceFile.CompressionType.RECORD;
                    }
                    res = new MapFile.Writer((Configuration)job, fs, wname.toString(), Text.class, clazz, compType, progress);
                    this.sliceWriters.put(cacheKey, res);
                    return res;
                }

                /**
                 * Closes every writer that was opened. A failure to close one writer
                 * does not prevent the others from being closed; the first failure is
                 * rethrown once all writers have been attempted.
                 */
                public void close(Reporter reporter) throws IOException {
                    IOException firstFailure = null;
                    for (Closeable writer : this.sliceWriters.values()) {
                        try {
                            writer.close();
                        }
                        catch (IOException e) {
                            if (firstFailure == null) {
                                firstFailure = e;
                            } else {
                                SegmentMerger.LOG.warn("Additional failure while closing segment writers", e);
                            }
                        }
                    }
                    this.sliceWriters.clear();
                    if (firstFailure != null) {
                        throw firstFailure;
                    }
                }
            };
        }
    }

    /**
     * Input format that reads any segment part as a sequence file and wraps
     * every value in a {@link MetaWrapper} tagged with the {@link SegmentPart}
     * derived from the split.
     */
    public static class ObjectInputFormat
    extends SequenceFileInputFormat<Text, MetaWrapper> {
        /**
         * Creates a reader for the given split.
         *
         * @param split    the split to read; must be a {@link FileSplit}
         * @param job      job configuration
         * @param reporter receives the split description as status
         * @return a reader producing (key, wrapped value) pairs
         * @throws IOException      if the value class of the file cannot be instantiated
         * @throws RuntimeException if the segment part cannot be derived from
         *                          the split or the reader cannot be created
         */
        public RecordReader<Text, MetaWrapper> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
            reporter.setStatus(split.toString());
            final FileSplit fSplit = (FileSplit)split;
            String part;
            try {
                part = SegmentPart.get(fSplit).toString();
            }
            catch (IOException e) {
                throw new RuntimeException("Cannot identify segment:", e);
            }
            final String spString = part;
            // One value instance is reused for every record of the split.
            final Writable w = ObjectInputFormat.newValueInstance(fSplit, job);
            final SequenceFileRecordReader<Text, Writable> splitReader = new SequenceFileRecordReader<Text, Writable>((Configuration)job, fSplit);
            try {
                return new SequenceFileRecordReader<Text, MetaWrapper>((Configuration)job, fSplit){

                    public synchronized boolean next(Text key, MetaWrapper wrapper) throws IOException {
                        FileInputFormat.LOG.debug("Running OIF.next()");
                        boolean res = splitReader.next(key, w);
                        wrapper.set(w);
                        wrapper.setMeta(SegmentMerger.SEGMENT_PART_KEY, spString);
                        return res;
                    }

                    // Records are read through splitReader, so position and
                    // progress have to come from it as well.
                    public synchronized long getPos() throws IOException {
                        return splitReader.getPos();
                    }

                    public float getProgress() throws IOException {
                        return splitReader.getProgress();
                    }

                    public synchronized void close() throws IOException {
                        try {
                            splitReader.close();
                        }
                        finally {
                            // The superclass constructor created a reader of its
                            // own over the same split; release it as well.
                            super.close();
                        }
                    }

                    public MetaWrapper createValue() {
                        return new MetaWrapper();
                    }
                };
            }
            catch (IOException e) {
                // Do not leak the delegate when the wrapper cannot be built.
                try {
                    splitReader.close();
                }
                catch (IOException ignored) {
                    // best effort; the original failure is reported below
                }
                throw new RuntimeException("Cannot create RecordReader: ", e);
            }
        }

        /**
         * Opens the split's file just long enough to instantiate its declared
         * value class. Failures to close the probe reader are ignored.
         */
        private static Writable newValueInstance(FileSplit fSplit, JobConf job) throws IOException {
            SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get((Configuration)job), fSplit.getPath(), (Configuration)job);
            try {
                return (Writable)reader.getValueClass().newInstance();
            }
            catch (Exception e) {
                // Same message as before, but keep the cause for diagnosis.
                throw new IOException(e.toString(), e);
            }
            finally {
                try {
                    reader.close();
                }
                catch (Exception ignored) {
                    // best effort: the reader was only needed to learn the value class
                }
            }
        }
    }
}

