org.apache.manifoldcf.crawler.connectors.webcrawler
Class BasicParseState

java.lang.Object
  extended by org.apache.manifoldcf.crawler.connectors.webcrawler.BasicParseState
Direct Known Subclasses:
ScriptParseState

public class BasicParseState
extends java.lang.Object

This class represents the basic, outermost parse state.


Field Summary
protected static int BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE
           
protected static int BASICPARSESTATE_IN_ATTR_NAME
           
protected static int BASICPARSESTATE_IN_ATTR_VALUE
           
protected static int BASICPARSESTATE_IN_COMMENT
           
protected static int BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE
           
protected static int BASICPARSESTATE_IN_END_TAG_NAME
           
protected static int BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE
           
protected static int BASICPARSESTATE_IN_TAG_NAME
           
protected static int BASICPARSESTATE_IN_TAG_SAW_SLASH
           
protected static int BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE
           
protected static int BASICPARSESTATE_NORMAL
           
protected static int BASICPARSESTATE_SAWCOMMENTDASH
           
protected static int BASICPARSESTATE_SAWDASH
           
protected static int BASICPARSESTATE_SAWEXCLAMATION
           
protected static int BASICPARSESTATE_SAWLEFTBRACKET
           
protected static int BASICPARSESTATE_SAWSECONDCOMMENTDASH
           
protected  java.util.Map currentAttrMap
           
protected  java.lang.String currentAttrName
           
protected  java.lang.StringBuffer currentAttrNameBuffer
           
protected  int currentState
           
protected  java.lang.String currentTagName
           
protected  java.lang.StringBuffer currentTagNameBuffer
           
protected  java.lang.StringBuffer currentValueBuffer
           
protected static java.util.Map mapLookup
           
 
Constructor Summary
BasicParseState()
           
 
Method Summary
 void dealWithCharacter(char thisChar)
          Deal with a character.
 void finishUp()
           
protected static java.lang.String htmlAttributeDecode(java.lang.String input)
          Decode an HTML attribute.
protected static boolean isHTMLWhitespace(char x)
          Is a character HTML whitespace?
protected static java.lang.String mapChunk(java.lang.String input)
          Map an entity reference back to a character
protected  void noteEndTag(java.lang.String tagName)
           
protected  void noteTag(java.lang.String tagName, java.util.Map attributes)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

BASICPARSESTATE_NORMAL

protected static final int BASICPARSESTATE_NORMAL
See Also:
Constant Field Values

BASICPARSESTATE_SAWLEFTBRACKET

protected static final int BASICPARSESTATE_SAWLEFTBRACKET
See Also:
Constant Field Values

BASICPARSESTATE_SAWEXCLAMATION

protected static final int BASICPARSESTATE_SAWEXCLAMATION
See Also:
Constant Field Values

BASICPARSESTATE_SAWDASH

protected static final int BASICPARSESTATE_SAWDASH
See Also:
Constant Field Values

BASICPARSESTATE_IN_COMMENT

protected static final int BASICPARSESTATE_IN_COMMENT
See Also:
Constant Field Values

BASICPARSESTATE_SAWCOMMENTDASH

protected static final int BASICPARSESTATE_SAWCOMMENTDASH
See Also:
Constant Field Values

BASICPARSESTATE_SAWSECONDCOMMENTDASH

protected static final int BASICPARSESTATE_SAWSECONDCOMMENTDASH
See Also:
Constant Field Values

BASICPARSESTATE_IN_TAG_NAME

protected static final int BASICPARSESTATE_IN_TAG_NAME
See Also:
Constant Field Values

BASICPARSESTATE_IN_ATTR_NAME

protected static final int BASICPARSESTATE_IN_ATTR_NAME
See Also:
Constant Field Values

BASICPARSESTATE_IN_ATTR_VALUE

protected static final int BASICPARSESTATE_IN_ATTR_VALUE
See Also:
Constant Field Values

BASICPARSESTATE_IN_TAG_SAW_SLASH

protected static final int BASICPARSESTATE_IN_TAG_SAW_SLASH
See Also:
Constant Field Values

BASICPARSESTATE_IN_END_TAG_NAME

protected static final int BASICPARSESTATE_IN_END_TAG_NAME
See Also:
Constant Field Values

BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE

protected static final int BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE
See Also:
Constant Field Values

BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE

protected static final int BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE
See Also:
Constant Field Values

BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE

protected static final int BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE
See Also:
Constant Field Values

BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE

protected static final int BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE
See Also:
Constant Field Values

currentState

protected int currentState

currentTagNameBuffer

protected java.lang.StringBuffer currentTagNameBuffer

currentAttrNameBuffer

protected java.lang.StringBuffer currentAttrNameBuffer

currentValueBuffer

protected java.lang.StringBuffer currentValueBuffer

currentTagName

protected java.lang.String currentTagName

currentAttrName

protected java.lang.String currentAttrName

currentAttrMap

protected java.util.Map currentAttrMap

mapLookup

protected static final java.util.Map mapLookup
Constructor Detail

BasicParseState

public BasicParseState()
Method Detail

dealWithCharacter

public void dealWithCharacter(char thisChar)
                       throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Deal with a character. Syntax errors in the input must not raise exceptions, since we don't want malformed content to cause difficulty.

Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

noteTag

protected void noteTag(java.lang.String tagName,
                       java.util.Map attributes)
                throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

noteEndTag

protected void noteEndTag(java.lang.String tagName)
                   throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

finishUp

public void finishUp()
              throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

htmlAttributeDecode

protected static java.lang.String htmlAttributeDecode(java.lang.String input)
Decode an HTML attribute.


mapChunk

protected static java.lang.String mapChunk(java.lang.String input)
Map an entity reference back to a character


isHTMLWhitespace

protected static boolean isHTMLWhitespace(char x)
Is a character HTML whitespace?