/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse.headings;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * HTML parse filter that copies the text of configured elements (typically
 * headings such as {@code h1}, {@code h2}) from the parsed DOM into the parse
 * metadata.
 *
 * <p>Configuration keys read in {@link #setConf(Configuration)}:
 * <ul>
 *   <li>{@code headings} - the element names to extract; each name is also
 *       used as the metadata key under which its values are stored.</li>
 *   <li>{@code headings.multivalued} - when {@code true}, every matching
 *       element is collected; when {@code false} (the default), only the first
 *       match per element name is collected.</li>
 * </ul>
 */
public class HeadingsParseFilter implements HtmlParseFilter {
    /**
     * Matches runs of whitespace so they can be collapsed to a single space.
     * Left non-final because it is protected and subclasses may rely on being
     * able to replace it.
     */
    protected static Pattern whitespacePattern = Pattern.compile("\\s+");

    private Configuration conf;

    /** Element names to extract; {@code null} when the key is not configured. */
    private String[] headings;

    /** Whether to collect all matching elements or only the first one. */
    private boolean multiValued = false;

    /**
     * Extracts the configured elements from {@code doc} and adds their
     * non-empty text to the parse metadata of the parse registered for the
     * content's URL.
     *
     * @param content the fetched content; only its URL is used here
     * @param parseResult the parse result to enrich
     * @param metaTags unused by this filter
     * @param doc the parsed DOM to search
     * @return the same {@code parseResult} instance that was passed in
     */
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        Parse parse = parseResult.get(content.getUrl());

        // Nothing to do when no element names are configured, and nowhere to
        // store values when there is no parse for this URL.
        if (this.headings == null || parse == null) {
            return parseResult;
        }

        for (String elementName : this.headings) {
            for (String heading : this.getElement(doc, elementName)) {
                if (heading == null) {
                    continue;
                }
                // String is immutable: the trimmed value must be captured,
                // otherwise whitespace-only headings slip through the check
                // below and untrimmed text is stored.
                String trimmed = heading.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }
                parse.getData().getParseMeta().add(elementName, trimmed);
            }
        }
        return parseResult;
    }

    /**
     * Stores the configuration and reads the {@code headings} and
     * {@code headings.multivalued} settings from it.
     *
     * @param conf the configuration to use
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
        this.headings = conf.getStrings("headings");
        this.multiValued = conf.getBoolean("headings.multivalued", false);
    }

    /**
     * @return the configuration last passed to {@link #setConf(Configuration)}
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Collects the text of elements named {@code element} (case-insensitive)
     * found while walking {@code doc}, in the order the walker visits them.
     *
     * @param doc the DOM fragment to search
     * @param element the element name to look for
     * @return the collected text values; at most one entry unless multi-valued
     *         mode is enabled, and never {@code null}
     */
    protected List<String> getElement(DocumentFragment doc, String element) {
        List<String> found = new ArrayList<String>();
        NodeWalker walker = new NodeWalker((Node) doc);
        while (walker.hasNext()) {
            Node currentNode = walker.nextNode();
            boolean isWantedElement = currentNode.getNodeType() == Node.ELEMENT_NODE
                    && element.equalsIgnoreCase(currentNode.getNodeName());
            if (!isWantedElement) {
                continue;
            }
            found.add(HeadingsParseFilter.getNodeValue(currentNode));
            // In single-valued mode the first match wins, even if its text
            // turns out to be empty.
            if (!this.multiValued) {
                break;
            }
        }
        return found;
    }

    /**
     * Concatenates the values of the <em>direct</em> text-node children of
     * {@code node}, collapses every whitespace run to a single space and trims
     * the result. Text nested inside child elements is not included.
     *
     * @param node the node whose direct text content is wanted
     * @return the normalized text; empty when the node has no direct text
     */
    protected static String getNodeValue(Node node) {
        StringBuilder buffer = new StringBuilder();
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); ++i) {
            Node child = children.item(i);
            if (child.getNodeType() == Node.TEXT_NODE) {
                buffer.append(child.getNodeValue());
            }
        }
        Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
        return matcher.replaceAll(" ").trim();
    }
}

