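[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners CompositeTagScanner.java,1.68,1.69 ImageScanner.java,1.29,1.30 LinkScanner.java,1.53,1.54 ScriptScanner.java,1.40,1.41 TagScanner.java,1.41,1.42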
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv9618/scanners Modified Files: CompositeTagScanner.java ImageScanner.java LinkScanner.java ScriptScanner.java TagScanner.java Log Message: Add bean like accessors for positions on Node, AbstractNode and AbstractNodeDecorator. Handle null page in Cursor. Add smartquotes mode in Lexer and CompositeTagScannerHelper. Add simple name constructor in Attribute. Remove emptyxmltag member, replace with computing accessors in TagNode. Removed ScriptScannerHelper and moved scanning logic to ScriptScanner. Reworked extractImageLocn in ImageScanner Implement extractXMLData in TagScanner. Made virtual tags zero length in TagData. Added push() to IteratorImpl. Added single node constructor to NodeList. Numerous and various test adjustments. Still 133 failures. Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.68 retrieving revision 1.69 diff -C2 -d -r1.68 -r1.69 *** CompositeTagScanner.java 28 Sep 2003 15:33:58 -0000 1.68 --- CompositeTagScanner.java 5 Oct 2003 13:49:52 -0000 1.69 *************** *** 216,229 **** public abstract Tag createTag(TagData tagData, CompositeTagData compositeTagData) throws ParserException; ! public final boolean isTagToBeEndedFor(Tag tag) { ! boolean isEndTag = tag.isEndTag (); ! String tagName = tag.getTagName(); ! if (isEndTag) ! tagName = tagName.substring (1); ! if ( ! ( isEndTag && endTagEnderSet.contains(tagName)) || ! (!isEndTag && tagEnderSet.contains(tagName)) ! ) ! return true; else return false; } --- 216,232 ---- public abstract Tag createTag(TagData tagData, CompositeTagData compositeTagData) throws ParserException; ! public final boolean isTagToBeEndedFor(Tag tag) ! { ! String name; ! boolean ret; ! ! ret = false; ! name = tag.getTagName (); ! if (tag.isEndTag ()) ! 
ret = endTagEnderSet.contains (name); ! else ! ret = tagEnderSet.contains (name); ! ! return (ret); } Index: ImageScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ImageScanner.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** ImageScanner.java 28 Sep 2003 15:33:58 -0000 1.29 --- ImageScanner.java 5 Oct 2003 13:49:52 -0000 1.30 *************** *** 33,36 **** --- 33,38 ---- ////////////////// import java.util.Hashtable; + import java.util.Vector; + import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.tags.ImageTag; *************** *** 66,104 **** this.processor = processor; } ! /** ! * Extract the location of the image, given the string to be parsed, and the url ! * of the html page in which this tag exists. ! * @param tag The tag with the 'SRC' attribute. ! * @param url URL of web page being parsed. ! */ ! public String extractImageLocn (Tag tag,String url) throws ParserException { String ret; - Hashtable table; ret = ""; ! try { ! table = tag.getAttributes (); ! ret = (String)table.get ("SRC"); ! if (null != ret) { ! ret = ParserUtils.removeChars (ret, '\n'); ! ret = ParserUtils.removeChars (ret, '\r'); ! ret = processor.extract (ret, url); } - else - ret = ""; - } - catch (Exception e) - { - throw new ParserException ( - "ImageScanner.extractImageLocn() : " - + "Error in extracting image location, relativeLink = " - + ret - + ", url = " - + url, - e); } return (ret); --- 68,172 ---- this.processor = processor; } ! ! /** ! * Extract the location of the image ! * Given the tag (with attributes), and the url of the html page in which ! * this tag exists, perform best effort to extract the 'intended' URL. ! * Attempts to handle such attributes as: ! * <pre> ! * <IMG SRC=http://www.redgreen.com> - normal ! * <IMG SRC =http://www.redgreen.com> - space between attribute name and equals sign ! 
* <IMG SRC= http://www.redgreen.com> - space between equals sign and attribute value ! * <IMG SRC = http://www.redgreen.com> - space both sides of equals sign ! * </pre> ! * @param tag The tag with the 'SRC' attribute. ! * @param url URL of web page being parsed. ! */ ! public String extractImageLocn (Tag tag, String url) throws ParserException { + Vector attributes; + int size; + Attribute attribute; + String string; + String data; + int state; + String name; String ret; ret = ""; ! state = 0; ! attributes = tag.getAttributesEx (); ! size = attributes.size (); ! for (int i = 0; (i < size) && (state < 3); i++) { ! attribute = (Attribute)attributes.elementAt (i); ! string = attribute.getName (); ! data = attribute.getValue (); ! switch (state) { ! case 0: // looking for 'src' ! if (null != string) ! { ! name = string.toUpperCase (); ! if (name.equals ("SRC")) ! { ! state = 1; ! if (null != data) ! { ! if ("".equals (data)) ! state = 2; // empty attribute, SRC= ! else ! { ! ret = data; ! i = size; // exit fast ! } ! } ! ! } ! else if (name.startsWith ("SRC")) ! { ! // missing equals sign ! ret = string.substring (3); ! state = 0; // go back to searching for SRC ! // because, maybe we found SRCXXX ! // where XXX isn't a URL ! } ! } ! break; ! case 1: // looking for equals sign ! if (null != string) ! { ! if (string.startsWith ("=")) ! { ! state = 2; ! if (1 < string.length ()) ! { ! ret = string.substring (1); ! state = 0; // keep looking ? ! } ! else if (null != data) ! { ! ret = string.substring (1); ! state = 0; // keep looking ? ! } ! } ! } ! break; ! case 2: // looking for a valueless attribute that could be a relative or absolute URL ! if (null != string) ! { ! if (null == data) ! ret = string; ! state = 0; // only check first non-whitespace item ! // not every valid attribute after an equals ! } ! break; ! default: ! 
throw new IllegalStateException ("we're not supposed to in state " + state); } } + ret = ParserUtils.removeChars (ret, '\n'); + ret = ParserUtils.removeChars (ret, '\r'); + ret = processor.extract (ret, url); return (ret); Index: LinkScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/LinkScanner.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** LinkScanner.java 22 Sep 2003 02:40:00 -0000 1.53 --- LinkScanner.java 5 Oct 2003 13:49:53 -0000 1.54 *************** *** 53,57 **** private static final String MATCH_NAME [] = {"A"}; public static final String LINK_SCANNER_ID = "A"; - public static final String DIRTY_TAG_MESSAGE=" is a dirty link tag - the tag was not closed. \nWe encountered an open tag, before the previous end tag was found.\nCorrecting this.."; private LinkProcessor processor; private final static String ENDERS [] = { "TD","TR","FORM","LI","BODY", "HTML" }; --- 53,56 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** ScriptScanner.java 28 Sep 2003 15:33:58 -0000 1.40 --- ScriptScanner.java 5 Oct 2003 13:49:53 -0000 1.41 *************** *** 28,44 **** package org.htmlparser.scanners; ! ///////////////////////// ! // HTML Parser Imports // ! ///////////////////////// ! import org.htmlparser.*; import org.htmlparser.lexer.Lexer; ! import org.htmlparser.parserHelper.*; ! import org.htmlparser.tags.*; ! import org.htmlparser.tags.data.*; ! import org.htmlparser.util.*; /** ! * The HTMLScriptScanner identifies javascript code */ - public class ScriptScanner extends CompositeTagScanner { private static final String SCRIPT_END_TAG = "</SCRIPT>"; --- 28,50 ---- package org.htmlparser.scanners; ! ! 
import java.util.Vector; ! import org.htmlparser.Node; ! import org.htmlparser.Parser; ! import org.htmlparser.RemarkNode; ! import org.htmlparser.StringNode; import org.htmlparser.lexer.Lexer; ! import org.htmlparser.lexer.nodes.NodeFactory; ! import org.htmlparser.tags.ScriptTag; ! import org.htmlparser.tags.Tag; ! import org.htmlparser.tags.data.CompositeTagData; ! import org.htmlparser.tags.data.TagData; ! import org.htmlparser.util.NodeList; ! import org.htmlparser.util.ParserException; ! /** ! * The ScriptScanner handles javascript code. ! * It gathers all interior nodes into one undifferentiated string node. */ public class ScriptScanner extends CompositeTagScanner { private static final String SCRIPT_END_TAG = "</SCRIPT>"; *************** *** 68,84 **** } ! public Tag scan (Tag tag, Lexer lexer) ! throws ParserException { ! try { ! ScriptScannerHelper helper = ! new ScriptScannerHelper(tag, lexer, this); ! return helper.scan(); } ! catch (Exception e) { ! throw new ParserException("Error in ScriptScanner: ",e); } - } /** --- 74,188 ---- } ! /** ! * Scan for script. ! * Accumulates nodes returned from the lexer, until </SCRIPT>, ! * <BODY> or <HTML> is encountered. Replaces the node factory ! * in the lexer with a new Parser to avoid other scanners missing their ! * end tags and accumulating even the </SCRIPT>. ! */ ! public Tag scan (Tag tag, String url, Lexer lexer) ! throws ParserException ! { ! Node node; ! boolean done; ! int position; ! StringNode last; ! Tag end; ! NodeFactory factory; ! TagData data; ! Tag ret; ! ! done = false; ! last = null; ! end = null; ! factory = lexer.getNodeFactory (); ! lexer.setNodeFactory (new Parser ()); // no scanners on a new Parser right? ! try ! { ! do ! { ! position = lexer.getPosition (); ! node = lexer.nextNode (true); ! if (null == node) ! break; ! else ! if (node instanceof Tag) ! if ( ((Tag)node).isEndTag () ! && ((Tag)node).getTagName ().equals (MATCH_NAME[0])) ! { ! end = (Tag)node; ! done = true; ! } ! 
else if (isTagToBeEndedFor ((Tag)node)) ! { ! lexer.setPosition (position); ! done = true; ! } ! else ! { ! // must be a string, even though it looks like a tag ! if (null != last) ! // append it to the previous one ! last.setEndPosition (node.elementEnd ()); ! else ! // TODO: need to remove this cast ! last = (StringNode)lexer.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); ! } ! else if (node instanceof RemarkNode) ! { ! if (null != last) ! last.setEndPosition (node.getEndPosition ()); ! else ! // TODO: need to remove this cast ! last = (StringNode)lexer.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); ! } ! else // StringNode ! { ! if (null != last) ! last.setEndPosition (node.getEndPosition ()); ! else ! // TODO: need to remove this cast ! last = (StringNode)node; ! } ! ! } ! while (!done); ! ! // build new string tag if required ! if (null == last) ! // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer, position, position); ! // build new end tag if required ! if (null == end) ! { ! data = new TagData( ! "/" + tag.getTagName (), ! tag.getEndPosition (), ! new Vector (), ! lexer.getPage ().getUrl (), ! false); ! end = new Tag (data); ! //TODO: use the factory: end = factory.createTagNode (mLexer, last.getEndPosition (), last.getEndPosition () + ! } ! data = new TagData( ! lexer.getPage (), ! tag.elementBegin(), ! end.elementEnd(), ! tag.getAttributesEx (), ! lexer.getPage ().getUrl (), ! tag.isEmptyXmlTag ()); + ret = createTag( + data, + new CompositeTagData(tag, end, new NodeList (last)) + ); } ! finally ! { ! lexer.setNodeFactory (factory); } + return (ret); + } /** *************** *** 87,95 **** * @return String containing the end tag to search for, i.e. </SCRIPT> */ ! public String getEndTag() { return SCRIPT_END_TAG; } - - - } --- 191,197 ---- * @return String containing the end tag to search for, i.e. </SCRIPT> */ ! public String getEndTag() ! 
{ return SCRIPT_END_TAG; } } Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** TagScanner.java 28 Sep 2003 15:33:58 -0000 1.41 --- TagScanner.java 5 Oct 2003 13:49:53 -0000 1.42 *************** *** 43,46 **** --- 43,47 ---- import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; + import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserFeedback; *************** *** 142,188 **** return true; } ! ! // public static String extractXMLData(Node node, String tagName, NodeReader reader) throws ParserException{ ! // try { ! // String xmlData = ""; ! // ! // boolean xmlTagFound = isXMLTagFound(node, tagName); ! // if (xmlTagFound) { ! // try{ ! // do { ! // node = reader.readElement(); ! // if (node!=null) { ! // if (node instanceof StringNode) { ! // StringNode stringNode = (StringNode)node; ! // if (xmlData.length()>0) xmlData+=" "; ! // xmlData += stringNode.getText(); ! // } else if (!(node instanceof org.htmlparser.tags.EndTag)) ! // xmlTagFound = false; ! // } ! // } ! // while (node instanceof StringNode); ! // ! // } ! // ! // catch (Exception e) { ! // throw new ParserException("HTMLTagScanner.extractXMLData() : error while trying to find xml tag",e); ! // } ! // } ! // if (xmlTagFound) { ! // if (node!=null) { ! // if (node instanceof org.htmlparser.tags.EndTag) { ! // org.htmlparser.tags.EndTag endTag = (org.htmlparser.tags.EndTag)node; ! // if (!endTag.getText().equals(tagName)) xmlTagFound = false; ! // } ! // ! // } ! // ! // } ! // if (xmlTagFound) return xmlData; else return null; ! // } ! // catch (Exception e) { ! // throw new ParserException("HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! // } ! 
// } public String getFilter() { --- 143,215 ---- return true; } ! ! /** ! * Pull the text between two matching capitalized 'XML' tags. ! * @deprecated This reads ahead on your iterator and doesn't put them back if it's not an XML tag. ! */ ! public static String extractXMLData (Node node, String tagName, NodeIterator iterator) ! throws ! ParserException ! { ! try ! { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound (node, tagName); ! if (xmlTagFound) ! { ! try ! { ! do ! { ! node = iterator.nextNode (); ! if (node!=null) ! { ! if (node instanceof StringNode) ! { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length ()>0) ! xmlData+=" "; ! xmlData += stringNode.getText (); ! } ! else ! if (!(node instanceof Tag && ((Tag)node).isEndTag ())) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! // check end tag matches start tag ! if (xmlTagFound) ! { ! if (node!=null) ! { ! if (node instanceof Tag && ((Tag)node).isEndTag ()) ! { ! Tag endTag = (Tag)node; ! if (!endTag.getTagName ().equals (tagName)) ! xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) ! return xmlData; ! else ! return null; ! } ! catch (Exception e) ! { ! throw new ParserException ("TagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { |