/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDbMerger;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builds and maintains the link database: the inverted view of the outlinks
 * recorded in the {@code parse_data} of crawl segments.
 *
 * <p>The class plays two roles:
 * <ul>
 *   <li>As a {@link Mapper} it turns every outlink of a parsed page into an
 *       {@link Inlinks} record keyed by the <em>target</em> URL.</li>
 *   <li>As a {@link Tool} it configures and runs that job (with
 *       {@link LinkDbMerger} as combiner and reducer), optionally merges the
 *       result with an existing database, and installs it under
 *       {@code <linkdb>/current}.</li>
 * </ul>
 */
public class LinkDb
extends Configured
implements Tool,
Mapper<Text, ParseData, Text, Inlinks> {
    public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
    public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links";
    public static final String CURRENT_NAME = "current";
    public static final String LOCK_NAME = ".locked";

    /** Job property: apply URL filters in the map phase. */
    private static final String URL_FILTERING_KEY = "linkdb.url.filters";
    /** Job property: apply URL normalizers in the map phase. */
    private static final String URL_NORMALIZING_KEY = "linkdb.url.normalizer";
    /** Scope name handed to the URL normalizers. */
    private static final String NORMALIZER_SCOPE = "linkdb";
    /** Name of the backup directory used while swapping in a new database. */
    private static final String OLD_NAME = "old";

    private int maxAnchorLength;
    private boolean ignoreInternalLinks;
    /** {@code null} unless filtering was enabled for the map phase. */
    private URLFilters urlFilters;
    /** {@code null} unless normalization was enabled for the map phase. */
    private URLNormalizers urlNormalizers;

    public LinkDb() {
    }

    public LinkDb(Configuration conf) {
        this.setConf(conf);
    }

    /**
     * Reads the mapper settings from the job configuration.
     *
     * @param job job configuration; supplies the maximum anchor length
     *            ({@code db.max.anchor.length}, default 100), the internal-link
     *            switch ({@link #IGNORE_INTERNAL_LINKS}, default {@code true})
     *            and whether filters / normalizers are active in the map phase
     */
    public void configure(JobConf job) {
        this.maxAnchorLength = job.getInt("db.max.anchor.length", 100);
        this.ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
        if (job.getBoolean(URL_FILTERING_KEY, false)) {
            this.urlFilters = new URLFilters(job);
        }
        if (job.getBoolean(URL_NORMALIZING_KEY, false)) {
            this.urlNormalizers = new URLNormalizers(job, NORMALIZER_SCOPE);
        }
    }

    /** Nothing to release. */
    public void close() {
    }

    /**
     * Emits one {@code (toUrl, Inlinks{fromUrl, anchor})} pair per accepted
     * outlink of the page identified by {@code key}.
     *
     * <p>The page is skipped entirely when its own URL is rejected by the
     * normalizers or filters. When internal links are ignored, outlinks whose
     * host is unparsable or equal to the source host are dropped. Anchors are
     * truncated to the configured maximum length.
     *
     * @param key       URL of the parsed page (the link source)
     * @param parseData parse result supplying the outlinks
     * @param output    collector receiving the inverted links
     * @param reporter  unused
     * @throws IOException if the collector fails
     */
    public void map(Text key, ParseData parseData, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
        String fromUrl = key.toString();
        // The source host is taken from the raw key, before normalization.
        String fromHost = this.getHost(fromUrl);
        fromUrl = this.normalizeAndFilter(fromUrl);
        if (fromUrl == null) {
            return;
        }
        Outlink[] outlinks = parseData.getOutlinks();
        // One Inlinks instance is reused (cleared) for every emitted record.
        Inlinks inlinks = new Inlinks();
        for (Outlink outlink : outlinks) {
            String toUrl = outlink.getToUrl();
            if (this.ignoreInternalLinks) {
                String toHost = this.getHost(toUrl);
                if (toHost == null || toHost.equals(fromHost)) {
                    continue;
                }
            }
            toUrl = this.normalizeAndFilter(toUrl);
            if (toUrl == null) {
                continue;
            }
            inlinks.clear();
            String anchor = outlink.getAnchor();
            if (anchor.length() > this.maxAnchorLength) {
                anchor = anchor.substring(0, this.maxAnchorLength);
            }
            inlinks.add(new Inlink(fromUrl, anchor));
            output.collect(new Text(toUrl), inlinks);
        }
    }

    /**
     * Runs {@code url} through the configured normalizers and then the
     * configured filters; either step is skipped when it is not configured.
     *
     * @param url URL to process
     * @return the resulting URL, or {@code null} if a step rejected it or
     *         threw (a warning is logged in the latter case)
     */
    private String normalizeAndFilter(String url) {
        String result = url;
        if (this.urlNormalizers != null) {
            try {
                result = this.urlNormalizers.normalize(result, NORMALIZER_SCOPE);
            }
            catch (Exception e) {
                LOG.warn("Skipping " + result + ":" + e);
                return null;
            }
        }
        if (result != null && this.urlFilters != null) {
            try {
                result = this.urlFilters.filter(result);
            }
            catch (Exception e) {
                LOG.warn("Skipping " + result + ":" + e);
                return null;
            }
        }
        return result;
    }

    /**
     * Extracts the lower-cased host of a URL.
     *
     * @param url URL string
     * @return the host, or {@code null} if {@code url} is not a valid URL
     */
    private String getHost(String url) {
        try {
            // Locale.ROOT keeps host comparison independent of the JVM's
            // default locale (e.g. Turkish dotless-i case mapping).
            return new URL(url).getHost().toLowerCase(Locale.ROOT);
        }
        catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Inverts the links of every segment directory found under
     * {@code segmentsDir}.
     *
     * @param linkDb      link database directory to create or update
     * @param segmentsDir parent directory whose sub-directories are segments
     * @param normalize   whether to normalize URLs
     * @param filter      whether to filter URLs
     * @param force       passed to the lock creation; forces the update even
     *                    if the database appears locked
     * @throws IOException if listing the segments or the inversion fails
     */
    public void invert(Path linkDb, Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
        FileSystem fs = FileSystem.get(this.getConf());
        FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        this.invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
    }

    /**
     * Inverts the links of the given segments and installs the result as the
     * current link database, merging with an existing one if present.
     *
     * <p>The database is locked for the duration of the update. On any
     * failure the lock and all temporary job output are removed before the
     * exception is propagated.
     *
     * @param linkDb    link database directory to create or update
     * @param segments  segment directories; their {@code parse_data} is read
     * @param normalize whether to normalize URLs
     * @param filter    whether to filter URLs
     * @param force     passed to the lock creation; forces the update even if
     *                  the database appears locked
     * @throws IOException if locking, a job, or the installation fails
     */
    public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
        JobConf job = LinkDb.createJob(this.getConf(), linkDb, normalize, filter);
        Path lock = new Path(linkDb, LOCK_NAME);
        FileSystem fs = FileSystem.get(this.getConf());
        LockUtil.createLockFile(fs, lock, force);
        Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("LinkDb: starting at " + sdf.format(start));
            LOG.info("LinkDb: linkdb: " + linkDb);
            LOG.info("LinkDb: URL normalize: " + normalize);
            LOG.info("LinkDb: URL filter: " + filter);
            if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
                LOG.info("LinkDb: internal links will be ignored.");
            }
        }
        for (Path segment : segments) {
            if (LOG.isInfoEnabled()) {
                LOG.info("LinkDb: adding segment: " + segment);
            }
            FileInputFormat.addInputPath(job, new Path(segment, "parse_data"));
        }

        Path invertedOutput = FileOutputFormat.getOutputPath(job);
        Path mergedOutput = null;
        boolean succeeded = false;
        try {
            JobClient.runJob(job);
            if (fs.exists(currentLinkDb)) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
                }
                job = LinkDbMerger.createMergeJob(this.getConf(), linkDb, normalize, filter);
                mergedOutput = FileOutputFormat.getOutputPath(job);
                FileInputFormat.addInputPath(job, currentLinkDb);
                FileInputFormat.addInputPath(job, invertedOutput);
                JobClient.runJob(job);
                // The intermediate inversion result is no longer needed.
                fs.delete(invertedOutput, true);
            }
            // On success install() also releases the lock.
            LinkDb.install(job, linkDb);
            succeeded = true;
        }
        finally {
            if (!succeeded) {
                // Best-effort cleanup; must not mask the exception in flight.
                LinkDb.removeLockQuietly(fs, lock);
                LinkDb.deleteQuietly(fs, invertedOutput);
                LinkDb.deleteQuietly(fs, mergedOutput);
            }
        }
        long end = System.currentTimeMillis();
        LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /** Removes the lock file, logging instead of throwing on failure. */
    private static void removeLockQuietly(FileSystem fs, Path lock) {
        try {
            LockUtil.removeLockFile(fs, lock);
        }
        catch (Exception e) {
            LOG.warn("LinkDb: could not remove lock " + lock + ": " + e);
        }
    }

    /** Recursively deletes {@code path} if it exists, logging instead of throwing on failure. */
    private static void deleteQuietly(FileSystem fs, Path path) {
        if (path == null) {
            return;
        }
        try {
            if (fs.exists(path)) {
                fs.delete(path, true);
            }
        }
        catch (Exception e) {
            LOG.warn("LinkDb: could not delete " + path + ": " + e);
        }
    }

    /**
     * Creates the inversion job writing to a randomly named temporary
     * directory ({@code linkdb-<random>}).
     *
     * <p>Normalization and filtering are enabled for the map phase only when
     * no {@code current} database exists. Only in that case is the merge job
     * skipped by {@link #invert(Path, Path[], boolean, boolean, boolean)};
     * otherwise the flags are handed to the merge job instead.
     *
     * @param config    base configuration
     * @param linkDb    link database directory (used for the job name and the
     *                  existence check)
     * @param normalize whether URL normalization was requested
     * @param filter    whether URL filtering was requested
     * @return the configured job
     */
    private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
        Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        NutchJob job = new NutchJob(config);
        job.setJobName("linkdb " + linkDb);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(LinkDb.class);
        job.setCombinerClass(LinkDbMerger.class);
        if (normalize || filter) {
            try {
                FileSystem fs = FileSystem.get(config);
                // Test the same path invert() uses to decide on merging. A
                // linkdb directory can exist without "current" (e.g. left by
                // an earlier failed run), and then no merge job will run.
                if (!fs.exists(new Path(linkDb, CURRENT_NAME))) {
                    job.setBoolean(URL_FILTERING_KEY, filter);
                    job.setBoolean(URL_NORMALIZING_KEY, normalize);
                }
            }
            catch (Exception e) {
                LOG.warn("LinkDb createJob: " + e);
            }
        }
        job.setReducerClass(LinkDbMerger.class);
        FileOutputFormat.setOutputPath(job, newLinkDb);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setBoolean("mapred.output.compress", true);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Inlinks.class);
        return job;
    }

    /**
     * Moves the output of {@code job} to {@code <linkDb>/current}, keeping the
     * previous database in {@code <linkDb>/old} until the swap has succeeded,
     * then removes the backup and the lock file.
     *
     * @param job    finished job whose output path holds the new database
     * @param linkDb link database directory
     * @throws IOException if a file system operation fails or a rename is
     *                     refused; in the latter case the previous database is
     *                     moved back to {@code current} when possible. The
     *                     lock file is not removed on failure.
     */
    public static void install(JobConf job, Path linkDb) throws IOException {
        Path newLinkDb = FileOutputFormat.getOutputPath(job);
        FileSystem fs = new JobClient(job).getFs();
        Path old = new Path(linkDb, OLD_NAME);
        Path current = new Path(linkDb, CURRENT_NAME);
        boolean backedUp = false;
        if (fs.exists(current)) {
            if (fs.exists(old)) {
                fs.delete(old, true);
            }
            if (!fs.rename(current, old)) {
                throw new IOException("LinkDb: could not move " + current + " to " + old);
            }
            backedUp = true;
        }
        fs.mkdirs(linkDb);
        if (!fs.rename(newLinkDb, current)) {
            // Never delete the backup when the new database was not installed.
            if (backedUp && !fs.rename(old, current)) {
                LOG.warn("LinkDb: could not restore " + old + " to " + current);
            }
            throw new IOException("LinkDb: could not move " + newLinkDb + " to " + current);
        }
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args);
        System.exit(res);
    }

    /** Prints the command-line synopsis to standard error. */
    private static void printUsage() {
        System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
        System.err.println("\tlinkdb\toutput LinkDb to create or update");
        System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
        System.err.println("\tseg1 seg2 ...\t list of segment directories");
        System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
        System.err.println("\t-noNormalize\tdon't normalize link URLs");
        System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
    }

    /**
     * Command-line entry point; see {@link #printUsage()} for the syntax.
     *
     * @param args {@code <linkdb>} followed by segments and/or options
     * @return 0 on success, -1 on bad usage or if the inversion failed
     * @throws Exception if listing a {@code -dir} directory fails
     */
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            LinkDb.printUsage();
            return -1;
        }
        FileSystem fs = FileSystem.get(this.getConf());
        Path db = new Path(args[0]);
        ArrayList<Path> segs = new ArrayList<Path>();
        boolean filter = true;
        boolean normalize = true;
        boolean force = false;
        for (int i = 1; i < args.length; ++i) {
            String arg = args[i];
            if (arg.equals("-dir")) {
                if (i + 1 >= args.length) {
                    // "-dir" was the last argument: nothing to list.
                    System.err.println("LinkDb: -dir requires a <segmentsDir> argument");
                    LinkDb.printUsage();
                    return -1;
                }
                FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
                segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
            } else if (arg.equalsIgnoreCase("-noNormalize")) {
                normalize = false;
            } else if (arg.equalsIgnoreCase("-noFilter")) {
                filter = false;
            } else if (arg.equalsIgnoreCase("-force")) {
                force = true;
            } else {
                segs.add(new Path(arg));
            }
        }
        try {
            this.invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
            return 0;
        }
        catch (Exception e) {
            LOG.error("LinkDb: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}

