/*
 * Decompiled with CFR 0.152.
 */
package crawlercommons.robots;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.BaseRobotsParser;
import crawlercommons.robots.SimpleRobotRules;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SimpleRobotRulesParser
extends BaseRobotsParser {
    /** Logger shared by every parser instance. */
    private static final Logger LOGGER = LoggerFactory.getLogger(SimpleRobotRulesParser.class);

    /**
     * Maps each recognized lower-case directive spelling to its directive. The map is filled in
     * by the static initializer and only read afterwards, so the reference is {@code final}.
     */
    private static final Map<String, RobotDirective> DIRECTIVE_PREFIX = new HashMap<String, RobotDirective>();

    /** Matches optional blanks, a colon, optional blanks, then captures the rest of the line. */
    private static final Pattern COLON_DIRECTIVE_DELIMITER;

    /** Matches one or more blanks, then captures the rest of the line. */
    private static final Pattern BLANK_DIRECTIVE_DELIMITER;

    /** Skips a run of non-colon, non-blank characters, then captures the rest of the line. */
    private static final Pattern DIRECTIVE_SUFFIX_PATTERN;

    /** Finds an opening {@code html}, {@code head} or {@code body} tag (case-insensitive). */
    private static final Pattern SIMPLE_HTML_PATTERN;

    /** Finds the text {@code user-agent:} (case-insensitive). */
    private static final Pattern USER_AGENT_PATTERN;

    /** Threshold that caps how many individual warnings are logged for one robots.txt file. */
    private static final int MAX_WARNINGS = 5;

    /** Crawl delay, in milliseconds, above which a site is treated as disallowing everything. */
    private static final long MAX_CRAWL_DELAY = 300000L;

    /** Number of warnings reported since the start of the most recent {@code parseContent} call. */
    private int _numWarnings;

    /**
     * Splits one trimmed, non-empty robots.txt line into a directive and its data.
     *
     * <p>The line is lower-cased and compared against every known directive spelling. When a
     * spelling matches and is followed by a colon delimiter (or, failing that, by blanks), the
     * remainder of the line is returned as the token data. The data is taken from the lower-cased
     * line for every directive except {@code SITEMAP}, which keeps its original case.
     *
     * @param line the line to tokenize, with comments already removed
     * @return a token for the matched directive; otherwise a token carrying the whole original
     *         line, tagged {@code UNKNOWN} when the line itself begins with a colon delimiter and
     *         {@code MISSING} in every other case
     */
    private static RobotToken tokenize(String line) {
        // Locale.ROOT keeps directive matching independent of the JVM's default locale; with a
        // Turkish default locale "DISALLOW" would otherwise lower-case to "dısallow" (dotless i).
        String lowerLine = line.toLowerCase(Locale.ROOT);

        for (Map.Entry<String, RobotDirective> entry : DIRECTIVE_PREFIX.entrySet()) {
            String prefix = entry.getKey();
            if (!lowerLine.startsWith(prefix)) {
                continue;
            }

            RobotDirective directive = entry.getValue();
            int prefixLength = prefix.length();

            // Sitemap URLs are kept in their original case; everything else uses the lowered text.
            String dataPortion = directive == RobotDirective.SITEMAP
                    ? line.substring(prefixLength)
                    : lowerLine.substring(prefixLength);

            if (directive.isPrefix()) {
                // Prefix directives (e.g. "acap-") carry an arbitrary suffix before the delimiter.
                Matcher suffixMatcher = DIRECTIVE_SUFFIX_PATTERN.matcher(dataPortion);
                if (!suffixMatcher.matches()) {
                    continue;
                }
                dataPortion = suffixMatcher.group(1);
            }

            // Prefer "name: value"; fall back to "name value".
            Matcher delimiterMatcher = COLON_DIRECTIVE_DELIMITER.matcher(dataPortion);
            if (!delimiterMatcher.matches()) {
                delimiterMatcher = BLANK_DIRECTIVE_DELIMITER.matcher(dataPortion);
                if (!delimiterMatcher.matches()) {
                    continue;
                }
            }
            return new RobotToken(directive, delimiterMatcher.group(1).trim());
        }

        // NOTE(review): matches() only succeeds when the whole line starts with the colon
        // delimiter, so "foo: bar" is reported as MISSING rather than UNKNOWN - confirm intent.
        Matcher colonMatcher = COLON_DIRECTIVE_DELIMITER.matcher(lowerLine);
        if (colonMatcher.matches()) {
            return new RobotToken(RobotDirective.UNKNOWN, line);
        }
        return new RobotToken(RobotDirective.MISSING, line);
    }

    /**
     * Builds the rules to apply when robots.txt could not be fetched.
     *
     * <p>A 4xx status yields allow-all rules. Every other non-2xx status yields allow-none rules
     * with visits deferred.
     *
     * @param httpStatusCode the HTTP status returned for the robots.txt request
     * @return the rules to use in place of a parsed robots.txt
     * @throws IllegalStateException if {@code httpStatusCode} is in the 2xx range
     */
    @Override
    public BaseRobotRules failedFetch(int httpStatusCode) {
        boolean isSuccess = httpStatusCode >= 200 && httpStatusCode < 300;
        if (isSuccess) {
            throw new IllegalStateException("Can't use status code constructor with 2xx response");
        }

        boolean isClientError = httpStatusCode >= 400 && httpStatusCode < 500;
        if (isClientError) {
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
        }

        // Redirects, server errors and anything else: block for now and ask to retry later.
        SimpleRobotRules deferred = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);
        deferred.setDeferVisits(true);
        return deferred;
    }

    /**
     * Parses the raw bytes of a robots.txt response into rules for the given robot.
     *
     * <p>Null or empty content yields allow-all rules. A leading UTF-8 or UTF-16 byte-order mark
     * selects the charset and is skipped; without one the bytes are decoded as US-ASCII. Content
     * that looks like HTML but contains no {@code user-agent:} text is treated as "no robots.txt"
     * (allow-all); HTML that does contain it is parsed with tags stripped from each line. If the
     * resulting crawl delay exceeds {@code MAX_CRAWL_DELAY}, allow-none rules are returned.
     *
     * @param url the robots.txt URL, used in log messages
     * @param content the response body; may be {@code null} or empty
     * @param contentType the response content type; may be {@code null}
     * @param robotName the crawler name(s) to match, comma-separated for several names
     * @return the parsed rules
     * @throws NullPointerException if {@code robotName} is {@code null} and content is non-empty
     */
    @Override
    public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotName) {
        this._numWarnings = 0;
        if (content == null || content.length == 0) {
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
        }

        // Pick the charset from a leading byte-order mark, if present, and skip past it.
        int offset = 0;
        String encoding = "us-ascii";
        int totalLen = content.length;
        if (totalLen >= 3 && content[0] == (byte) 0xEF && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
            offset = 3;
            encoding = "UTF-8";
        } else if (totalLen >= 2 && content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
            offset = 2;
            encoding = "UTF-16LE";
        } else if (totalLen >= 2 && content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
            offset = 2;
            encoding = "UTF-16BE";
        }
        int bytesLen = totalLen - offset;

        String contentAsStr;
        try {
            contentAsStr = new String(content, offset, bytesLen, encoding);
        } catch (UnsupportedEncodingException e) {
            // Keep the original exception as the cause so the failure stays diagnosable.
            throw new RuntimeException("Impossible unsupported encoding exception for " + encoding, e);
        }

        boolean isHtmlType = contentType != null && contentType.toLowerCase(Locale.ROOT).startsWith("text/html");
        boolean hasHTML = false;
        if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
            if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
                LOGGER.trace("Found non-robots.txt HTML file: {}", url);
                return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
            }
            if (isHtmlType) {
                LOGGER.debug("HTML content type returned for robots.txt file: {}", url);
            } else {
                LOGGER.debug("Found HTML in robots.txt file: {}", url);
            }
            hasHTML = true;
        }

        // Compile the tag-stripping regex once instead of once per line.
        Pattern htmlTagPattern = hasHTML ? Pattern.compile("<[^>]+>") : null;

        StringTokenizer lineParser = new StringTokenizer(contentAsStr, "\n\r\u0085\u2028\u2029");
        // Locale.ROOT: robot-name matching must not depend on the JVM's default locale.
        ParseState parseState = new ParseState(url, robotName.toLowerCase(Locale.ROOT));
        boolean keepGoing = true;
        while (keepGoing && lineParser.hasMoreTokens()) {
            String line = lineParser.nextToken();
            if (htmlTagPattern != null) {
                line = htmlTagPattern.matcher(line).replaceAll("");
            }

            // Drop trailing comments.
            int hashPos = line.indexOf('#');
            if (hashPos >= 0) {
                line = line.substring(0, hashPos);
            }

            line = line.trim();
            if (line.length() == 0) {
                continue;
            }

            RobotToken token = SimpleRobotRulesParser.tokenize(line);
            switch (token.getDirective()) {
                case USER_AGENT: {
                    keepGoing = this.handleUserAgent(parseState, token);
                    break;
                }
                case DISALLOW: {
                    keepGoing = this.handleDisallow(parseState, token);
                    break;
                }
                case ALLOW: {
                    keepGoing = this.handleAllow(parseState, token);
                    break;
                }
                case CRAWL_DELAY: {
                    keepGoing = this.handleCrawlDelay(parseState, token);
                    break;
                }
                case SITEMAP: {
                    keepGoing = this.handleSitemap(parseState, token);
                    break;
                }
                case HTTP: {
                    keepGoing = this.handleHttp(parseState, token);
                    break;
                }
                case UNKNOWN: {
                    this.reportWarning("Unknown directive in robots.txt file: " + line, url);
                    parseState.setFinishedAgentFields(true);
                    break;
                }
                case MISSING: {
                    this.reportWarning(String.format("Unknown line in robots.txt file (size %d): %s", content.length, line), url);
                    parseState.setFinishedAgentFields(true);
                    break;
                }
                default: {
                    // Other recognized directives are accepted but have no effect here.
                    break;
                }
            }
        }

        SimpleRobotRules result = parseState.getRobotRules();
        if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
            LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: {}", url);
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);
        }
        return result;
    }

    /**
     * Counts a parse warning and logs it, subject to a per-file cap.
     *
     * <p>The first warning for a file is preceded by a header line naming the URL. Individual
     * messages are logged only while the running count is below {@code MAX_WARNINGS}; later
     * warnings are still counted but not logged.
     *
     * @param msg the warning text
     * @param url the robots.txt URL being processed
     */
    private void reportWarning(String msg, String url) {
        ++this._numWarnings;
        if (this._numWarnings == 1) {
            LOGGER.warn("Problem processing robots.txt for {}", url);
        }
        if (this._numWarnings < MAX_WARNINGS) {
            LOGGER.warn("\t{}", msg);
        }
    }

    /**
     * Processes a {@code User-agent} line and decides whether the rules that follow apply to us.
     *
     * <p>Once a real (non-wildcard) name has matched, a further user-agent line either extends
     * the same group (rule lines not yet seen) or signals the end of the relevant section, in
     * which case {@code false} is returned to stop parsing. Otherwise each agent name on the line
     * is compared against the target names: {@code *} enables rule collection the first time it
     * is seen, and a name that is a prefix of any word of a target name marks a real match and
     * discards rules collected so far.
     *
     * @param state the current parse state
     * @param token the user-agent token; its data holds the agent name(s)
     * @return {@code false} when parsing should stop, {@code true} otherwise
     */
    private boolean handleUserAgent(ParseState state, RobotToken token) {
        if (state.isMatchedRealName()) {
            return !state.isFinishedAgentFields();
        }
        if (state.isFinishedAgentFields()) {
            // A user-agent line after rule lines starts a new group.
            state.setFinishedAgentFields(false);
            state.setAddingRules(false);
        }

        // The agent names do not depend on the target name, so split them once.
        String[] agentNames = token.getData().split("[ \t,]");

        for (String rawTargetName : state.getTargetName().split(",")) {
            String[] targetNameSplits = rawTargetName.trim().split(" ");
            for (String agentName : agentNames) {
                if (agentName.length() == 0) {
                    // Splitting "a, b" (or an empty value) yields empty strings. Every string
                    // starts with "", so without this check such a line would match any robot.
                    continue;
                }
                if (agentName.equals("*") && !state.isMatchedWildcard()) {
                    state.setMatchedWildcard(true);
                    state.setAddingRules(true);
                    continue;
                }
                for (String targetName : targetNameSplits) {
                    if (targetName.startsWith(agentName)) {
                        // A specific match overrides anything gathered under the wildcard.
                        state.setMatchedRealName(true);
                        state.setAddingRules(true);
                        state.clearRules();
                        break;
                    }
                }
            }
        }
        return true;
    }

    /**
     * Processes a {@code Disallow} line.
     *
     * <p>Marks the agent-name section as finished. When rules are being collected, the path is
     * URL-decoded and added as a disallow rule; an empty path clears all rules collected so far.
     * If decoding fails, a warning is reported and the undecoded path is used, so a malformed
     * escape cannot silently drop the restriction (this mirrors {@code handleAllow}).
     *
     * @param state the current parse state
     * @param token the disallow token; its data holds the path
     * @return always {@code true}
     */
    private boolean handleDisallow(ParseState state, RobotToken token) {
        state.setFinishedAgentFields(true);
        if (!state.isAddingRules()) {
            return true;
        }

        String path = token.getData();
        try {
            path = URLDecoder.decode(path, "UTF-8");
        } catch (Exception e) {
            // Keep the raw path: honoring an odd-looking rule is safer than ignoring it.
            this.reportWarning("Error parsing robots rules - can't decode path: " + path, state.getUrl());
        }

        if (path.length() == 0) {
            state.clearRules();
        } else {
            state.addRule(path, false);
        }
        return true;
    }

    /**
     * Processes an {@code Allow} line.
     *
     * <p>Marks the agent-name section as finished. When rules are being collected, the path is
     * URL-decoded (falling back to the undecoded text, with a warning, if decoding fails) and
     * added as an allow rule; an empty path clears all rules collected so far.
     *
     * @param state the current parse state
     * @param token the allow token; its data holds the path
     * @return always {@code true}
     */
    private boolean handleAllow(ParseState state, RobotToken token) {
        state.setFinishedAgentFields(true);
        if (state.isAddingRules()) {
            String rawPath = token.getData();
            String rulePath = rawPath;
            try {
                rulePath = URLDecoder.decode(rawPath, "UTF-8");
            }
            catch (Exception e) {
                this.reportWarning("Error parsing robots rules - can't decode path: " + rawPath, state.getUrl());
            }

            boolean emptyPath = rulePath.length() == 0;
            if (emptyPath) {
                state.clearRules();
            } else {
                state.addRule(rulePath, true);
            }
        }
        return true;
    }

    /**
     * Processes a {@code Crawl-delay} line, storing the delay in milliseconds.
     *
     * <p>Marks the agent-name section as finished. When rules are being collected and the value
     * is non-empty, a value containing {@code '.'} is parsed as fractional seconds and rounded to
     * the nearest millisecond; any other value is parsed as whole seconds. Whole-second values
     * are parsed as {@code long} and the conversion to milliseconds saturates, so a value beyond
     * the {@code int} range is kept as a very large delay instead of being discarded. Values
     * that cannot be parsed produce a warning and leave the delay unchanged.
     *
     * @param state the current parse state
     * @param token the crawl-delay token; its data holds the delay in seconds
     * @return always {@code true}
     */
    private boolean handleCrawlDelay(ParseState state, RobotToken token) {
        state.setFinishedAgentFields(true);
        if (!state.isAddingRules()) {
            return true;
        }

        String delayString = token.getData();
        if (delayString.length() == 0) {
            return true;
        }

        try {
            long delayMillis;
            if (delayString.indexOf('.') != -1) {
                delayMillis = Math.round(Double.parseDouble(delayString) * 1000.0);
            } else {
                long delaySeconds = Long.parseLong(delayString);
                // Saturate instead of overflowing when converting seconds to milliseconds.
                if (delaySeconds > Long.MAX_VALUE / 1000L) {
                    delayMillis = Long.MAX_VALUE;
                } else if (delaySeconds < Long.MIN_VALUE / 1000L) {
                    delayMillis = Long.MIN_VALUE;
                } else {
                    delayMillis = delaySeconds * 1000L;
                }
            }
            state.setCrawlDelay(delayMillis);
        } catch (Exception e) {
            this.reportWarning("Error parsing robots rules - can't decode crawl delay: " + delayString, state.getUrl());
        }
        return true;
    }

    /**
     * Processes a {@code Sitemap} line.
     *
     * <p>The value is recorded as a sitemap only when it parses as both a {@code URL} and a
     * {@code URI} and each reports a non-empty host. A value that fails to parse produces a
     * warning; a value that parses but has no host is ignored without one.
     *
     * @param state the current parse state
     * @param token the sitemap token; its data holds the sitemap URL
     * @return always {@code true}
     */
    private boolean handleSitemap(ParseState state, RobotToken token) {
        String sitemap = token.getData();
        try {
            String urlHost = new URL(sitemap).getHost();
            boolean urlHasHost = urlHost != null && urlHost.length() > 0;
            if (urlHasHost) {
                // Only consult URI once URL has reported a host.
                String uriHost = new URI(sitemap).getHost();
                boolean uriHasHost = uriHost != null && uriHost.length() > 0;
                if (uriHasHost) {
                    state.addSitemap(sitemap);
                }
            }
        }
        catch (Exception e) {
            this.reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
        }
        return true;
    }

    /**
     * Processes a line that starts with a bare {@code http} URL instead of a directive name.
     *
     * <p>If the remainder contains {@code sitemap}, the line is handled as a sitemap directive
     * for {@code "http:"} plus the remainder; otherwise a warning is reported.
     *
     * @param state the current parse state
     * @param token the token whose data is the text that followed the {@code http:} delimiter
     * @return always {@code true}
     */
    private boolean handleHttp(ParseState state, RobotToken token) {
        String urlFragment = token.getData();
        if (!urlFragment.contains("sitemap")) {
            this.reportWarning("Found raw non-sitemap URL: http:" + urlFragment, state.getUrl());
            return true;
        }
        RobotToken sitemapToken = new RobotToken(RobotDirective.SITEMAP, "http:" + urlFragment);
        return this.handleSitemap(state, sitemapToken);
    }

    /**
     * Returns how many warnings have been reported since the most recent {@code parseContent}
     * call began.
     *
     * @return the warning count
     */
    public int getNumWarnings() {
        return _numWarnings;
    }

    // Builds the directive lookup table and compiles the shared patterns.
    static {
        // Canonical spellings: enum name, lower-cased, with '_' replaced by '-'. Locale.ROOT is
        // required here; under a Turkish default locale "DISALLOW" would lower-case to
        // "dısallow" and no standard robots.txt line would ever match.
        for (RobotDirective directive : RobotDirective.values()) {
            if (directive.isSpecial()) {
                continue;
            }
            String prefix = directive.name().toLowerCase(Locale.ROOT).replace('_', '-');
            DIRECTIVE_PREFIX.put(prefix, directive);
        }

        // Additional spellings accepted for user-agent, disallow and crawl-delay.
        DIRECTIVE_PREFIX.put("useragent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("useg-agent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("ser-agent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("desallow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dissalow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dssalow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dsallow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("crawl delay", RobotDirective.CRAWL_DELAY);

        COLON_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]*:[ \t]*(.*)");
        BLANK_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]+(.*)");
        DIRECTIVE_SUFFIX_PATTERN = Pattern.compile("[^: \t]+(.*)");
        SIMPLE_HTML_PATTERN = Pattern.compile("(?is)<(html|head|body)\\s*>");
        USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
    }

    /** Immutable pairing of a directive with the data that followed it on a line. */
    private static class RobotToken {
        private final RobotDirective _directive;
        private final String _data;

        /**
         * @param directive the directive this token represents
         * @param data the text associated with the directive
         */
        public RobotToken(RobotDirective directive, String data) {
            this._directive = directive;
            this._data = data;
        }

        /** @return the directive this token represents */
        public RobotDirective getDirective() {
            return this._directive;
        }

        /** @return the text associated with the directive */
        public String getData() {
            return this._data;
        }
    }

    /**
     * Mutable bookkeeping for a single parse: the matching flags plus the rules being built.
     * The URL, target name and rule set are fixed at construction.
     */
    private static class ParseState {
        /** Set once a user-agent line named the target robot explicitly. */
        private boolean _matchedRealName;
        /** Set once a {@code *} user-agent line has been seen. */
        private boolean _matchedWildcard;
        /** Whether rule lines currently being read should be recorded. */
        private boolean _addingRules;
        /** Whether a non-user-agent line has been seen since the last user-agent line. */
        private boolean _finishedAgentFields;
        private final String _url;
        private final String _targetName;
        private final SimpleRobotRules _curRules;

        /**
         * @param url the robots.txt URL, kept for warning messages
         * @param targetName the robot name(s) to match against user-agent lines
         */
        public ParseState(String url, String targetName) {
            this._url = url;
            this._targetName = targetName;
            this._curRules = new SimpleRobotRules();
        }

        /** @return the robot name(s) given at construction */
        public String getTargetName() {
            return this._targetName;
        }

        public boolean isMatchedRealName() {
            return this._matchedRealName;
        }

        public void setMatchedRealName(boolean matchedRealName) {
            this._matchedRealName = matchedRealName;
        }

        public boolean isMatchedWildcard() {
            return this._matchedWildcard;
        }

        public void setMatchedWildcard(boolean matchedWildcard) {
            this._matchedWildcard = matchedWildcard;
        }

        public boolean isAddingRules() {
            return this._addingRules;
        }

        public void setAddingRules(boolean addingRules) {
            this._addingRules = addingRules;
        }

        public boolean isFinishedAgentFields() {
            return this._finishedAgentFields;
        }

        public void setFinishedAgentFields(boolean finishedAgentFields) {
            this._finishedAgentFields = finishedAgentFields;
        }

        /** Delegates to {@code clearRules()} on the rule set being built. */
        public void clearRules() {
            this._curRules.clearRules();
        }

        /**
         * Adds a rule to the rule set being built.
         *
         * @param prefix the path prefix the rule applies to
         * @param allow {@code true} for an allow rule, {@code false} for a disallow rule
         */
        public void addRule(String prefix, boolean allow) {
            this._curRules.addRule(prefix, allow);
        }

        /** @param delay the crawl delay to store on the rule set being built */
        public void setCrawlDelay(long delay) {
            this._curRules.setCrawlDelay(delay);
        }

        /** @return the rule set being built (the live instance, not a copy) */
        public SimpleRobotRules getRobotRules() {
            return this._curRules;
        }

        /** @return the robots.txt URL given at construction */
        public String getUrl() {
            return this._url;
        }

        /** @param sitemap the sitemap URL to record on the rule set being built */
        public void addSitemap(String sitemap) {
            this._curRules.addSitemap(sitemap);
        }
    }

    /**
     * Directives the tokenizer can produce. Each constant carries two flags: whether its name is
     * only the leading part of a longer directive name, and whether it is a "special" marker
     * that is excluded from the directive lookup table.
     */
    private static enum RobotDirective {
        USER_AGENT,
        DISALLOW,
        ALLOW,
        CRAWL_DELAY,
        SITEMAP,
        HOST,
        NO_INDEX,
        ACAP_(true, false),
        REQUEST_RATE,
        VISIT_TIME,
        ROBOT_VERSION,
        COMMENT,
        HTTP,
        UNKNOWN(false, true),
        MISSING(false, true);

        private final boolean _prefix;
        private final boolean _special;

        /** Creates an ordinary directive: neither a prefix nor special. */
        private RobotDirective() {
            this(false, false);
        }

        /**
         * @param isPrefix whether the constant names only the start of a directive
         * @param isSpecial whether the constant is a marker rather than a real directive
         */
        private RobotDirective(boolean isPrefix, boolean isSpecial) {
            this._prefix = isPrefix;
            this._special = isSpecial;
        }

        /** @return {@code true} for marker constants that have no textual spelling */
        public boolean isSpecial() {
            return this._special;
        }

        /** @return {@code true} when the constant names only the start of a directive */
        public boolean isPrefix() {
            return this._prefix;
        }
    }
}

