/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Random;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WebGraph
extends Configured
implements Tool {
    /** Logger used by this tool; the nested {@code OutlinkDb} class also logs through it. */
    public static final Logger LOG = LoggerFactory.getLogger(WebGraph.class);
    /** Name of the lock file created directly under the webgraph directory while an update runs. */
    public static final String LOCK_NAME = ".locked";
    /** Sub-directory of the webgraph directory into which the inlink database is installed. */
    public static final String INLINK_DIR = "inlinks";
    /** Sub-directory of the webgraph directory that holds the current outlink database. */
    public static final String OUTLINK_DIR = "outlinks/current";
    /**
     * Sub-directory used as the backup slot for the outlink database during installation; it is
     * deleted afterwards when {@code db.preserve.backup} is {@code false}.
     */
    public static final String OLD_OUTLINK_DIR = "outlinks/old";
    /** Sub-directory of the webgraph directory into which the node database is installed. */
    public static final String NODE_DIR = "nodes";

    /**
     * Creates or updates the three databases of a webgraph (outlinks, inlinks, nodes) by running
     * one MapReduce job per database, each writing to a temporary directory that is then
     * installed over the real one.
     *
     * <p>A lock file ({@link #LOCK_NAME}) is created under {@code webGraphDb} for the duration of
     * the update. It is released in a {@code finally} block, so it is removed on every exit path
     * once it has been acquired, including failures that happen outside the individual job runs.
     *
     * @param webGraphDb root directory of the webgraph; created if it does not exist
     * @param segments segments whose {@code parse_data} (and, when {@code link.delete.gone} is
     *     set, {@code crawl_fetch}) are fed into the outlink job; may be {@code null}
     * @param normalize value stored under {@code webgraph.url.normalizers} for the outlink job
     * @param filter value stored under {@code webgraph.url.filters} for the outlink job
     * @throws IOException if a filesystem operation or one of the jobs fails
     */
    public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("WebGraphDb: starting at " + sdf.format(start));
            LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
            LOG.info("WebGraphDb: URL normalize: " + normalize);
            LOG.info("WebGraphDb: URL filter: " + filter);
        }
        Configuration conf = this.getConf();
        FileSystem fs = FileSystem.get(conf);
        Path lock = new Path(webGraphDb, LOCK_NAME);
        if (!fs.exists(webGraphDb)) {
            fs.mkdirs(webGraphDb);
        }
        // Acquire outside the try: if acquisition itself fails (presumably because another run
        // holds the lock - confirm against LockUtil) we must not remove someone else's lock.
        LockUtil.createLockFile(fs, lock, false);
        try {
            Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
            Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
            Path nodeDb = new Path(webGraphDb, NODE_DIR);
            WebGraph.updateOutlinkDb(conf, fs, webGraphDb, outlinkDb, segments, normalize, filter);
            WebGraph.updateInlinkDb(conf, fs, outlinkDb, inlinkDb);
            WebGraph.updateNodeDb(conf, fs, outlinkDb, inlinkDb, nodeDb);
        }
        finally {
            // Always release the lock once it was acquired, whatever went wrong in between.
            LockUtil.removeLockFile(fs, lock);
        }
        long end = System.currentTimeMillis();
        LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /** Returns {@code db} with a "-" and a random non-negative int appended, used as job output. */
    private static Path newTempPath(Path db) {
        return new Path(db + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    }

    /** Removes a job's temporary output (if present), logs the failure and leaves rethrowing to the caller. */
    private static void cleanUpFailedJob(FileSystem fs, Path tempDb, IOException failure) throws IOException {
        if (fs.exists(tempDb)) {
            fs.delete(tempDb, true);
        }
        LOG.error(StringUtils.stringifyException(failure));
    }

    /** Runs the outlink job over the segments plus the existing outlink db and installs its output. */
    private static void updateOutlinkDb(Configuration conf, FileSystem fs, Path webGraphDb, Path outlinkDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
        Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);
        if (!fs.exists(outlinkDb)) {
            fs.mkdirs(outlinkDb);
        }
        Path tempOutlinkDb = WebGraph.newTempPath(outlinkDb);
        NutchJob outlinkJob = new NutchJob(conf);
        outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
        boolean deleteGone = conf.getBoolean("link.delete.gone", false);
        boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
        if (deleteGone) {
            LOG.info("OutlinkDb: deleting gone links");
        }
        if (segments != null) {
            for (Path segment : segments) {
                Path parseData = new Path(segment, "parse_data");
                if (fs.exists(parseData)) {
                    LOG.info("OutlinkDb: adding input: " + parseData);
                    FileInputFormat.addInputPath(outlinkJob, parseData);
                }
                if (!deleteGone) {
                    continue;
                }
                // crawl_fetch is only read when gone links are to be deleted.
                Path crawlFetch = new Path(segment, "crawl_fetch");
                if (fs.exists(crawlFetch)) {
                    LOG.info("OutlinkDb: adding input: " + crawlFetch);
                    FileInputFormat.addInputPath(outlinkJob, crawlFetch);
                }
            }
        }
        // The existing outlink db is an input too, so previous links are merged with new ones.
        LOG.info("OutlinkDb: adding input: " + outlinkDb);
        FileInputFormat.addInputPath(outlinkJob, outlinkDb);
        outlinkJob.setBoolean("webgraph.url.normalizers", normalize);
        outlinkJob.setBoolean("webgraph.url.filters", filter);
        outlinkJob.setInputFormat(SequenceFileInputFormat.class);
        outlinkJob.setMapperClass(OutlinkDb.class);
        outlinkJob.setReducerClass(OutlinkDb.class);
        outlinkJob.setMapOutputKeyClass(Text.class);
        outlinkJob.setMapOutputValueClass(NutchWritable.class);
        outlinkJob.setOutputKeyClass(Text.class);
        outlinkJob.setOutputValueClass(LinkDatum.class);
        FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
        outlinkJob.setOutputFormat(MapFileOutputFormat.class);
        outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        try {
            LOG.info("OutlinkDb: running");
            JobClient.runJob(outlinkJob);
            LOG.info("OutlinkDb: installing " + outlinkDb);
            FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
            FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
            if (!preserveBackup && fs.exists(oldOutlinkDb)) {
                fs.delete(oldOutlinkDb, true);
            }
            LOG.info("OutlinkDb: finished");
        }
        catch (IOException e) {
            WebGraph.cleanUpFailedJob(fs, tempOutlinkDb, e);
            throw e;
        }
    }

    /** Runs the inlink job over the outlink db and installs its output as the inlink db. */
    private static void updateInlinkDb(Configuration conf, FileSystem fs, Path outlinkDb, Path inlinkDb) throws IOException {
        Path tempInlinkDb = WebGraph.newTempPath(inlinkDb);
        NutchJob inlinkJob = new NutchJob(conf);
        inlinkJob.setJobName("Inlinkdb " + inlinkDb);
        LOG.info("InlinkDb: adding input: " + outlinkDb);
        FileInputFormat.addInputPath(inlinkJob, outlinkDb);
        inlinkJob.setInputFormat(SequenceFileInputFormat.class);
        inlinkJob.setMapperClass(InlinkDb.class);
        inlinkJob.setMapOutputKeyClass(Text.class);
        inlinkJob.setMapOutputValueClass(LinkDatum.class);
        inlinkJob.setOutputKeyClass(Text.class);
        inlinkJob.setOutputValueClass(LinkDatum.class);
        FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
        inlinkJob.setOutputFormat(MapFileOutputFormat.class);
        inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        try {
            LOG.info("InlinkDb: running");
            JobClient.runJob(inlinkJob);
            LOG.info("InlinkDb: installing " + inlinkDb);
            FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
            LOG.info("InlinkDb: finished");
        }
        catch (IOException e) {
            WebGraph.cleanUpFailedJob(fs, tempInlinkDb, e);
            throw e;
        }
    }

    /** Runs the node job over the outlink and inlink dbs and installs its output as the node db. */
    private static void updateNodeDb(Configuration conf, FileSystem fs, Path outlinkDb, Path inlinkDb, Path nodeDb) throws IOException {
        Path tempNodeDb = WebGraph.newTempPath(nodeDb);
        NutchJob nodeJob = new NutchJob(conf);
        nodeJob.setJobName("NodeDb " + nodeDb);
        LOG.info("NodeDb: adding input: " + outlinkDb);
        LOG.info("NodeDb: adding input: " + inlinkDb);
        FileInputFormat.addInputPath(nodeJob, outlinkDb);
        FileInputFormat.addInputPath(nodeJob, inlinkDb);
        nodeJob.setInputFormat(SequenceFileInputFormat.class);
        nodeJob.setReducerClass(NodeDb.class);
        nodeJob.setMapOutputKeyClass(Text.class);
        nodeJob.setMapOutputValueClass(LinkDatum.class);
        nodeJob.setOutputKeyClass(Text.class);
        nodeJob.setOutputValueClass(Node.class);
        FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
        nodeJob.setOutputFormat(MapFileOutputFormat.class);
        nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        try {
            LOG.info("NodeDb: running");
            JobClient.runJob(nodeJob);
            LOG.info("NodeDb: installing " + nodeDb);
            FSUtils.replace(fs, nodeDb, tempNodeDb, true);
            LOG.info("NodeDb: finished");
        }
        catch (IOException e) {
            WebGraph.cleanUpFailedJob(fs, tempNodeDb, e);
            throw e;
        }
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner} with a configuration
     * obtained from {@link NutchConfiguration#create()} and exits the JVM with the tool's
     * return code.
     *
     * @param args command-line arguments, passed on to the tool
     * @throws Exception if the tool runner fails
     */
    public static void main(String[] args) throws Exception {
        Configuration nutchConf = NutchConfiguration.create();
        WebGraph tool = new WebGraph();
        int exitCode = ToolRunner.run(nutchConf, tool, args);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs {@link #createWebGraph}.
     *
     * <p>Recognised options: {@code -help}, {@code -webgraphdb <dir>}, {@code -segment <seg>...},
     * {@code -segmentDir <dir>...}, {@code -normalize}, {@code -filter}. {@code -webgraphdb} and
     * at least one of {@code -segment} / {@code -segmentDir} are required. Segments named with
     * {@code -segment} and the sub-directories of every {@code -segmentDir} value are all used;
     * giving both options no longer discards the {@code -segment} values.
     *
     * @param args command-line arguments
     * @return {@code 0} on success, {@code -1} when usage was printed (help requested or required
     *     options missing), {@code -2} when parsing or the update itself failed
     * @throws Exception declared by {@link Tool#run}; failures are caught, logged and reported
     *     through the return value
     */
    @Override
    public int run(String[] args) throws Exception {
        Options options = new Options();

        // -help
        OptionBuilder.withArgName("help");
        OptionBuilder.withDescription("show this help message");
        options.addOption(OptionBuilder.create("help"));

        // -webgraphdb <dir>
        OptionBuilder.withArgName("webgraphdb");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the web graph database to use");
        options.addOption(OptionBuilder.create("webgraphdb"));

        // -segment <seg>...
        OptionBuilder.withArgName("segment");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("the segment(s) to use");
        options.addOption(OptionBuilder.create("segment"));

        // -segmentDir <dir>...
        OptionBuilder.withArgName("segmentDir");
        OptionBuilder.hasArgs();
        OptionBuilder.withDescription("the segment directory to use");
        options.addOption(OptionBuilder.create("segmentDir"));

        // -normalize
        OptionBuilder.withArgName("normalize");
        OptionBuilder.withDescription("whether to use URLNormalizers on the URL's in the segment");
        options.addOption(OptionBuilder.create("normalize"));

        // -filter
        OptionBuilder.withArgName("filter");
        OptionBuilder.withDescription("whether to use URLFilters on the URL's in the segment");
        options.addOption(OptionBuilder.create("filter"));

        GnuParser parser = new GnuParser();
        try {
            CommandLine line = parser.parse(options, args);
            boolean hasSegments = line.hasOption("segment");
            boolean hasSegmentDirs = line.hasOption("segmentDir");
            if (line.hasOption("help") || !line.hasOption("webgraphdb") || (!hasSegments && !hasSegmentDirs)) {
                new HelpFormatter().printHelp("WebGraph", options);
                return -1;
            }
            String webGraphDb = line.getOptionValue("webgraphdb");

            // Collect segments from both options instead of letting -segmentDir overwrite -segment.
            ArrayList<Path> segPaths = new ArrayList<Path>();
            if (hasSegments) {
                String[] segments = line.getOptionValues("segment");
                if (segments != null) {
                    for (String segment : segments) {
                        segPaths.add(new Path(segment));
                    }
                }
            }
            if (hasSegmentDirs) {
                String[] segmentDirs = line.getOptionValues("segmentDir");
                if (segmentDirs != null) {
                    for (String segmentDir : segmentDirs) {
                        Path dir = new Path(segmentDir);
                        FileSystem fs = dir.getFileSystem(this.getConf());
                        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
                        Path[] found = HadoopFSUtil.getPaths(fstats);
                        // NOTE(review): getPaths may return null for a null listing - confirm.
                        if (found != null) {
                            for (Path segment : found) {
                                segPaths.add(segment);
                            }
                        }
                    }
                }
            }

            boolean normalize = line.hasOption("normalize");
            boolean filter = line.hasOption("filter");
            this.createWebGraph(new Path(webGraphDb), segPaths.toArray(new Path[segPaths.size()]), normalize, filter);
            return 0;
        }
        catch (Exception e) {
            LOG.error("WebGraph: " + StringUtils.stringifyException(e));
            return -2;
        }
    }

    private static class NodeDb
    extends Configured
    implements Reducer<Text, LinkDatum, Text, Node> {
        private NodeDb() {
        }

        public void configure(JobConf conf) {
        }

        public void close() {
        }

        public void reduce(Text key, Iterator<LinkDatum> values, OutputCollector<Text, Node> output, Reporter reporter) throws IOException {
            Node node = new Node();
            int numInlinks = 0;
            int numOutlinks = 0;
            while (values.hasNext()) {
                LinkDatum next = values.next();
                if (next.getLinkType() == 1) {
                    ++numInlinks;
                    continue;
                }
                if (next.getLinkType() != 2) continue;
                ++numOutlinks;
            }
            node.setNumInlinks(numInlinks);
            node.setNumOutlinks(numOutlinks);
            node.setInlinkScore(0.0f);
            output.collect((Object)key, (Object)node);
        }
    }

    private static class InlinkDb
    extends Configured
    implements Mapper<Text, LinkDatum, Text, LinkDatum> {
        private long timestamp;

        private InlinkDb() {
        }

        public void configure(JobConf conf) {
            this.timestamp = System.currentTimeMillis();
        }

        public void close() {
        }

        public void map(Text key, LinkDatum datum, OutputCollector<Text, LinkDatum> output, Reporter reporter) throws IOException {
            String fromUrl = key.toString();
            String toUrl = datum.getUrl();
            String anchor = datum.getAnchor();
            LinkDatum inlink = new LinkDatum(fromUrl, anchor, this.timestamp);
            inlink.setLinkType((byte)1);
            output.collect((Object)new Text(toUrl), (Object)inlink);
        }
    }

    public static class OutlinkDb
    extends Configured
    implements Mapper<Text, Writable, Text, NutchWritable>,
    Reducer<Text, NutchWritable, Text, LinkDatum> {
        public static final String URL_NORMALIZING = "webgraph.url.normalizers";
        public static final String URL_FILTERING = "webgraph.url.filters";
        private boolean ignoreDomain = true;
        private boolean ignoreHost = true;
        private boolean limitPages = true;
        private boolean limitDomains = true;
        private boolean normalize = false;
        private boolean filter = false;
        private URLNormalizers urlNormalizers;
        private URLFilters filters;
        private JobConf conf;

        private String normalizeUrl(String url) {
            if (!this.normalize) {
                return url;
            }
            String normalized = null;
            if (this.urlNormalizers != null) {
                try {
                    normalized = this.urlNormalizers.normalize(url, "default");
                    normalized = normalized.trim();
                }
                catch (Exception e) {
                    LOG.warn("Skipping " + url + ":" + e);
                    normalized = null;
                }
            }
            return normalized;
        }

        private String filterUrl(String url) {
            if (!this.filter) {
                return url;
            }
            try {
                url = this.filters.filter(url);
            }
            catch (Exception e) {
                url = null;
            }
            return url;
        }

        private long getFetchTime(ParseData data) {
            long fetchTime = System.currentTimeMillis();
            String fetchTimeStr = data.getContentMeta().get("_ftk_");
            try {
                fetchTime = Long.parseLong(fetchTimeStr);
            }
            catch (Exception e) {
                fetchTime = System.currentTimeMillis();
            }
            return fetchTime;
        }

        public OutlinkDb() {
        }

        public OutlinkDb(Configuration conf) {
            this.setConf(conf);
        }

        public void configure(JobConf conf) {
            this.conf = conf;
            this.ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
            this.ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
            this.limitPages = conf.getBoolean("link.ignore.limit.page", true);
            this.limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
            this.normalize = conf.getBoolean(URL_NORMALIZING, false);
            this.filter = conf.getBoolean(URL_FILTERING, false);
            if (this.normalize) {
                this.urlNormalizers = new URLNormalizers((Configuration)conf, "default");
            }
            if (this.filter) {
                this.filters = new URLFilters((Configuration)conf);
            }
        }

        public void map(Text key, Writable value, OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
            LinkDatum datum;
            String linkDatumUrl;
            String url = this.normalizeUrl(key.toString());
            if (url == null) {
                return;
            }
            if (this.filterUrl(url) == null) {
                return;
            }
            key.set(url);
            if (value instanceof CrawlDatum) {
                CrawlDatum datum2 = (CrawlDatum)value;
                if (datum2.getStatus() == 35 || datum2.getStatus() == 36 || datum2.getStatus() == 37) {
                    output.collect((Object)key, (Object)new NutchWritable((Writable)new BooleanWritable(true)));
                }
            } else if (value instanceof ParseData) {
                ParseData data = (ParseData)value;
                long fetchTime = this.getFetchTime(data);
                Outlink[] outlinkAr = data.getOutlinks();
                LinkedHashMap<String, String> outlinkMap = new LinkedHashMap<String, String>();
                if (outlinkAr != null && outlinkAr.length > 0) {
                    for (int i = 0; i < outlinkAr.length; ++i) {
                        Outlink outlink = outlinkAr[i];
                        String toUrl = this.normalizeUrl(outlink.getToUrl());
                        if (this.filterUrl(toUrl) == null) continue;
                        boolean existingUrl = outlinkMap.containsKey(toUrl);
                        if (toUrl == null || existingUrl && (!existingUrl || outlinkMap.get(toUrl) != null)) continue;
                        outlinkMap.put(toUrl, outlink.getAnchor());
                    }
                }
                for (String outlinkUrl : outlinkMap.keySet()) {
                    String anchor = (String)outlinkMap.get(outlinkUrl);
                    LinkDatum datum3 = new LinkDatum(outlinkUrl, anchor, fetchTime);
                    output.collect((Object)key, (Object)new NutchWritable(datum3));
                }
            } else if (value instanceof LinkDatum && this.filterUrl(linkDatumUrl = this.normalizeUrl((datum = (LinkDatum)value).getUrl())) != null) {
                datum.setUrl(linkDatumUrl);
                output.collect((Object)key, (Object)new NutchWritable(datum));
            }
        }

        public void reduce(Text key, Iterator<NutchWritable> values, OutputCollector<Text, LinkDatum> output, Reporter reporter) throws IOException {
            long mostRecent = 0L;
            ArrayList<Writable> outlinkList = new ArrayList<Writable>();
            while (values.hasNext()) {
                BooleanWritable delete;
                Writable value = values.next().get();
                if (value instanceof LinkDatum) {
                    LinkDatum next = (LinkDatum)value;
                    long timestamp = next.getTimestamp();
                    if (mostRecent == 0L || mostRecent < timestamp) {
                        mostRecent = timestamp;
                    }
                    outlinkList.add(WritableUtils.clone((Writable)next, (Configuration)this.conf));
                    reporter.incrCounter("WebGraph.outlinks", "added links", 1L);
                    continue;
                }
                if (!(value instanceof BooleanWritable) || !(delete = (BooleanWritable)value).get()) continue;
                reporter.incrCounter("WebGraph.outlinks", "removed links", 1L);
                return;
            }
            String url = key.toString();
            String domain = URLUtil.getDomainName(url);
            String host = URLUtil.getHost(url);
            HashSet<String> domains = new HashSet<String>();
            HashSet<String> pages = new HashSet<String>();
            for (LinkDatum linkDatum : outlinkList) {
                String toUrl = linkDatum.getUrl();
                String toDomain = URLUtil.getDomainName(toUrl);
                String toHost = URLUtil.getHost(toUrl);
                String toPage = URLUtil.getPage(toUrl);
                linkDatum.setLinkType((byte)2);
                if (linkDatum.getTimestamp() != mostRecent || this.limitPages && (!this.limitPages || pages.contains(toPage)) || this.limitDomains && (!this.limitDomains || domains.contains(toDomain)) || this.ignoreHost && (!this.ignoreHost || toHost.equalsIgnoreCase(host)) || this.ignoreDomain && (!this.ignoreDomain || toDomain.equalsIgnoreCase(domain))) continue;
                output.collect((Object)key, (Object)linkDatum);
                pages.add(toPage);
                domains.add(toDomain);
            }
        }

        public void close() {
        }
    }
}

