/*
 * Decompiled with CFR 0.152.
 */
package crawlercommons.sitemaps;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * Parses sitemap content (XML urlset, sitemap index, plain text, gzip-compressed
 * XML, Atom and RSS) into {@link SiteMap} / {@link SiteMapIndex} objects.
 *
 * <p>In strict mode (the default) URLs that do not start with the sitemap's base
 * URL are dropped; in lenient mode they are kept but flagged as not valid.
 */
public class SiteMapParser {
    public static final Logger LOG = LoggerFactory.getLogger(SiteMapParser.class);

    /**
     * Upper bound on the number of entries read from text sitemaps, sitemap
     * indexes and Atom/RSS feeds. XML urlset documents are not truncated.
     */
    private static final int MAX_URLS = 50000;

    /** 0xA00000 = 10 MiB. Not enforced by this class; exposed for callers. */
    public static int MAX_BYTES_ALLOWED = 0xA00000;

    /** Charset used to decode raw sitemap bytes, independent of the platform default. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Content-type fragments that select the XML parser. */
    private static final String[] XML_CONTENT_TYPES = {
        "text/xml", "application/xml", "application/x-xml", "application/atom+xml", "application/rss+xml"
    };

    /** Content-type fragments that select the gzip parser. */
    private static final String[] GZIP_CONTENT_TYPES = {
        "application/gzip", "application/x-gzip", "application/x-gunzip", "application/gzipped",
        "application/gzip-compressed", "application/x-compress", "gzip/document", "application/octet-stream"
    };

    /** Parser features switched off so untrusted XML cannot pull in external entities or DTDs. */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private boolean strict;

    /** Creates a strict parser. */
    public SiteMapParser() {
        this(true);
    }

    /**
     * @param strict when {@code true}, URLs outside the sitemap's base URL are
     *               discarded instead of being added as invalid entries
     */
    public SiteMapParser(boolean strict) {
        this.strict = strict;
    }

    /** @return whether URLs outside the sitemap's base URL are discarded */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * Parses {@code content} using the URL of an already known sitemap.
     *
     * <p>The returned object carries the last-modified date of {@code sitemap};
     * the passed-in {@code sitemap} itself is marked as processed.
     *
     * @param contentType content type reported for {@code content}
     * @param content     raw bytes of the sitemap
     * @param sitemap     sitemap whose URL and last-modified date are reused
     * @return the newly parsed sitemap
     * @throws UnknownFormatException if the format cannot be determined or parsed
     * @throws IOException            if the content cannot be read
     */
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, AbstractSiteMap sitemap) throws UnknownFormatException, IOException {
        AbstractSiteMap parsed = this.parseSiteMap(contentType, content, sitemap.getUrl());
        parsed.setLastModified(sitemap.getLastModified());
        sitemap.setProcessed(true);
        return parsed;
    }

    /**
     * Parses {@code content}, choosing the parser from the URL's path extension
     * or the content type. XML is tried first, then plain text, then gzip.
     *
     * @param contentType content type reported for {@code content}; {@code null}
     *                    is treated as "no content type" so the extension decides
     * @param content     raw bytes of the sitemap
     * @param url         location the sitemap was fetched from
     * @return the parsed sitemap or sitemap index
     * @throws UnknownFormatException if neither extension nor content type is recognised,
     *                                or the XML cannot be parsed
     * @throws IOException            if the content cannot be read
     */
    public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException {
        // A missing content type must not prevent dispatching on the extension.
        String type = contentType == null ? "" : contentType;
        String path = url.getPath();
        if (path.endsWith(".xml") || containsAny(type, XML_CONTENT_TYPES)) {
            return this.processXml(url, content);
        }
        if (path.endsWith(".txt") || type.contains("text/plain")) {
            return this.processText(content, url.toString());
        }
        if (path.endsWith(".gz") || containsAny(type, GZIP_CONTENT_TYPES)) {
            return this.processGzip(url, content);
        }
        throw new UnknownFormatException("Unknown format " + contentType + " at " + url);
    }

    /** Returns true if {@code value} contains at least one of {@code fragments}. */
    private static boolean containsAny(String value, String[] fragments) {
        for (String fragment : fragments) {
            if (value.contains(fragment)) {
                return true;
            }
        }
        return false;
    }

    /** Wraps raw XML bytes (BOM stripped, decoded as UTF-8) and hands them to the XML parser. */
    private AbstractSiteMap processXml(URL sitemapUrl, byte[] xmlContent) throws UnknownFormatException {
        BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(xmlContent));
        InputSource is = new InputSource();
        // Explicit charset: the platform default would corrupt non-ASCII URLs on non-UTF-8 JVMs.
        is.setCharacterStream(new BufferedReader(new InputStreamReader(bomIs, UTF_8)));
        return this.processXml(sitemapUrl, is);
    }

    /**
     * Parses a text sitemap: one URL per line, empty lines skipped, malformed
     * lines logged and ignored, at most {@link #MAX_URLS} entries accepted.
     */
    private SiteMap processText(byte[] content, String sitemapUrl) throws IOException {
        LOG.debug("Processing textual Sitemap");
        SiteMap textSiteMap = new SiteMap(sitemapUrl);
        textSiteMap.setType(AbstractSiteMap.SitemapType.TEXT);
        BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(content));
        BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, UTF_8));
        String line;
        int i = 1;
        // Stop reading once the cap is reached instead of scanning the remaining lines.
        while (i <= MAX_URLS && (line = reader.readLine()) != null) {
            if (line.length() == 0) {
                continue;
            }
            try {
                URL url = new URL(line);
                boolean valid = this.urlIsLegal(textSiteMap.getBaseUrl(), url.toString());
                if (!valid && this.strict) {
                    continue;
                }
                LOG.debug("  {}. {}", i, url);
                ++i;
                textSiteMap.addSiteMapUrl(new SiteMapURL(url, valid));
            } catch (MalformedURLException e) {
                LOG.debug("Bad URL [{}].", line);
            }
        }
        textSiteMap.setProcessed(true);
        return textSiteMap;
    }

    /** Decompresses gzip content and parses the result as XML. */
    private AbstractSiteMap processGzip(URL url, byte[] response) throws MalformedURLException, IOException, UnknownFormatException {
        LOG.debug("Processing gzip");
        String xmlUrl = url.toString().replaceFirst("\\.gz$", "");
        LOG.debug("XML url = {}", xmlUrl);
        BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(new ByteArrayInputStream(response)));
        try {
            InputSource in = new InputSource(decompressed);
            in.setSystemId(xmlUrl);
            return this.processXml(url, in);
        } finally {
            // Close the stream even when parsing fails.
            decompressed.close();
        }
    }

    /**
     * Parses the XML and dispatches on its content: sitemap index, urlset, or
     * (when a {@code link} element is present) Atom/RSS.
     *
     * @throws UnknownFormatException if the XML cannot be parsed or matches none of the formats
     */
    private AbstractSiteMap processXml(URL sitemapUrl, InputSource is) throws UnknownFormatException {
        Document doc;
        try {
            doc = newDocumentBuilderFactory().newDocumentBuilder().parse(is);
        } catch (Exception e) {
            // The rethrown exception carries only a message, so log the cause here.
            LOG.debug("Error parsing XML for " + sitemapUrl, e);
            throw new UnknownFormatException("Error parsing XML for " + sitemapUrl);
        }
        if (doc.getElementsByTagName("sitemapindex").getLength() > 0) {
            return this.parseSitemapIndex(sitemapUrl, doc.getElementsByTagName("sitemap"));
        }
        if (doc.getElementsByTagName("urlset").getLength() > 0) {
            return this.parseXmlSitemap(sitemapUrl, doc);
        }
        if (doc.getElementsByTagName("link").getLength() > 0) {
            return this.parseSyndicationFormat(sitemapUrl, doc);
        }
        throw new UnknownFormatException("Unknown XML format for " + sitemapUrl);
    }

    /**
     * Creates a factory with external entity resolution and external DTD
     * loading disabled, since sitemap content is untrusted input. Features the
     * underlying parser does not know are skipped rather than failing the parse.
     */
    private static DocumentBuilderFactory newDocumentBuilderFactory() {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                dbf.setFeature(feature, false);
            } catch (ParserConfigurationException e) {
                LOG.debug("XML parser does not support feature " + feature, e);
            }
        }
        return dbf;
    }

    /** Builds a sitemap from every {@code url} element of an XML urlset document. */
    private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {
        SiteMap sitemap = new SiteMap(sitemapUrl);
        sitemap.setType(AbstractSiteMap.SitemapType.XML);
        NodeList list = doc.getElementsByTagName("url");
        for (int i = 0; i < list.getLength(); ++i) {
            Node n = list.item(i);
            if (n.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element elem = (Element) n;
            this.addUrl(sitemap, this.getElementValue(elem, "loc"), this.getElementValue(elem, "lastmod"),
                    this.getElementValue(elem, "changefreq"), this.getElementValue(elem, "priority"), i + 1);
        }
        sitemap.setProcessed(true);
        return sitemap;
    }

    /** Builds a sitemap index from {@code sitemap} elements, reading at most {@link #MAX_URLS} nodes. */
    private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {
        LOG.debug("Parsing Sitemap Index");
        SiteMapIndex sitemapIndex = new SiteMapIndex(url);
        sitemapIndex.setType(AbstractSiteMap.SitemapType.INDEX);
        for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; ++i) {
            Node node = nodeList.item(i);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element elem = (Element) node;
            String loc = this.getElementValue(elem, "loc");
            if (loc == null) {
                // No usable <loc> child: fall back to the element's own text.
                loc = elem.getTextContent().trim();
            }
            try {
                URL sitemapUrl = new URL(loc);
                Date lastModified = SiteMap.convertToDate(this.getElementValue(elem, "lastmod"));
                SiteMap s = new SiteMap(sitemapUrl, lastModified);
                sitemapIndex.addSitemap(s);
                LOG.debug("  {}. {}", i + 1, s);
            } catch (MalformedURLException e) {
                LOG.debug("Bad url: [{}]", loc);
            }
        }
        sitemapIndex.setProcessed(true);
        return sitemapIndex;
    }

    /**
     * Parses an Atom ({@code feed}) or RSS ({@code rss}) document.
     *
     * @throws UnknownFormatException if the document contains neither root element
     */
    private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException {
        SiteMap sitemap = new SiteMap(sitemapUrl);
        NodeList feeds = doc.getElementsByTagName("feed");
        if (feeds.getLength() > 0) {
            this.parseAtom(sitemap, (Element) feeds.item(0), doc);
            sitemap.setProcessed(true);
            return sitemap;
        }
        if (doc.getElementsByTagName("rss").getLength() > 0) {
            this.parseRSS(sitemap, doc);
            sitemap.setProcessed(true);
            return sitemap;
        }
        throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
    }

    /**
     * Adds the {@code link/@href} of each Atom {@code entry} to the sitemap; the
     * feed's {@code modified} value is used as last-modified for every entry.
     */
    private void parseAtom(SiteMap sitemap, Element elem, Document doc) {
        LOG.debug("Parsing Atom XML");
        sitemap.setType(AbstractSiteMap.SitemapType.ATOM);
        String lastMod = this.getElementValue(elem, "modified");
        LOG.debug("lastMod={}", lastMod);
        NodeList list = doc.getElementsByTagName("entry");
        for (int i = 0; i < list.getLength() && i < MAX_URLS; ++i) {
            Node n = list.item(i);
            if (n.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            String href = this.getElementAttributeValue((Element) n, "link", "href");
            LOG.debug("href={}", href);
            this.addUrl(sitemap, href, lastMod, null, null, i + 1);
        }
    }

    /**
     * Adds the {@code link} of each RSS {@code item} to the sitemap; the first
     * channel's {@code pubDate} is used as last-modified for every item.
     */
    private void parseRSS(SiteMap sitemap, Document doc) {
        LOG.debug("Parsing RSS doc");
        sitemap.setType(AbstractSiteMap.SitemapType.RSS);
        // May be null for a channel-less document; getElementValue tolerates that.
        Element channel = (Element) doc.getElementsByTagName("channel").item(0);
        String lastMod = this.getElementValue(channel, "pubDate");
        LOG.debug("lastMod={}", lastMod);
        NodeList list = doc.getElementsByTagName("item");
        for (int i = 0; i < list.getLength() && i < MAX_URLS; ++i) {
            Node n = list.item(i);
            if (n.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            String link = this.getElementValue((Element) n, "link");
            LOG.debug("link={}", link);
            this.addUrl(sitemap, link, lastMod, null, null, i + 1);
        }
    }

    /**
     * Validates {@code location} and adds it to {@code sitemap}. Malformed
     * locations are logged and skipped; in strict mode locations outside the
     * sitemap's base URL are skipped as well.
     *
     * @param position 1-based index of the entry, used only for debug output
     */
    private void addUrl(SiteMap sitemap, String location, String lastMod, String changeFreq, String priority, int position) {
        try {
            URL url = new URL(location);
            boolean valid = this.urlIsLegal(sitemap.getBaseUrl(), url.toString());
            if (!valid && this.strict) {
                return;
            }
            SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid);
            sitemap.addSiteMapUrl(sUrl);
            LOG.debug("  {}. {}", position, sUrl);
        } catch (MalformedURLException e) {
            LOG.debug("Bad url: [{}]", location);
        }
    }

    /**
     * Returns the trimmed value of the first child node of the first descendant
     * named {@code elementName}.
     *
     * @return the value, or {@code null} if {@code elem} is null, no such
     *         descendant exists, it has no children, or its first child has no
     *         node value (e.g. it is itself an element)
     */
    private String getElementValue(Element elem, String elementName) {
        if (elem == null) {
            return null;
        }
        Node match = elem.getElementsByTagName(elementName).item(0);
        if (match == null) {
            return null;
        }
        Node firstChild = match.getFirstChild();
        if (firstChild == null) {
            return null;
        }
        String value = firstChild.getNodeValue();
        return value == null ? null : value.trim();
    }

    /**
     * Returns attribute {@code attributeName} of the first descendant named
     * {@code elementName}, or {@code null} if {@code elem} is null or has no
     * such descendant.
     */
    private String getElementAttributeValue(Element elem, String elementName, String attributeName) {
        if (elem == null) {
            return null;
        }
        Element match = (Element) elem.getElementsByTagName(elementName).item(0);
        return match == null ? null : match.getAttribute(attributeName);
    }

    /**
     * Checks whether {@code testUrl} starts with {@code sitemapBaseUrl},
     * ignoring case. The comparison is locale-independent.
     *
     * @return {@code false} when the base URL is null or longer than {@code testUrl}
     */
    private boolean urlIsLegal(String sitemapBaseUrl, String testUrl) {
        // regionMatches returns false by itself when testUrl is shorter than the base.
        boolean ret = sitemapBaseUrl != null
                && testUrl.regionMatches(true, 0, sitemapBaseUrl, 0, sitemapBaseUrl.length());
        if (LOG.isTraceEnabled()) {
            LOG.trace("urlIsLegal: " + sitemapBaseUrl + " <= " + testUrl + " ? " + ret);
        }
        return ret;
    }
}

