[Htmlparser-cvs] htmlparser/src/org/htmlparser/scanners BulletListScanner.java,1.19,1.20 BulletScann
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-10-28 03:05:42
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv19975/scanners Modified Files: BulletListScanner.java BulletScanner.java CompositeTagScanner.java FormScanner.java OptionTagScanner.java ScriptScanner.java SelectTagScanner.java TextareaTagScanner.java Log Message: Moved the recursion from the NodeFactory to the CompositeTagScanner where it belongs. Also needed to kick off the recursion in IteratorImpl. The scanner is obtained in a kludgy way -- just 'til tags know their own scanners. Also fixed the other NodeFactory signatures to have a Page rather than a Lexer. Index: BulletListScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BulletListScanner.java,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** BulletListScanner.java 26 Oct 2003 19:46:19 -0000 1.19 --- BulletListScanner.java 28 Oct 2003 03:04:18 -0000 1.20 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; --- 29,32 ---- *************** *** 47,51 **** private static final String [] MATCH_STRING = { "UL", "OL" }; private final static String ENDERS [] = { "BODY", "HTML" }; - private Stack ulli = new Stack(); public BulletListScanner(Parser parser) --- 46,49 ---- *************** *** 57,61 **** { super(filter, MATCH_STRING, ENDERS); ! 
parser.addScanner(new BulletScanner("-bullet")); } *************** *** 80,88 **** return MATCH_STRING; } - - public void beforeScanningStarts() - { - ulli.push(this); - } - } --- 78,80 ---- Index: BulletScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/BulletScanner.java,v retrieving revision 1.24 retrieving revision 1.25 diff -C2 -d -r1.24 -r1.25 *** BulletScanner.java 26 Oct 2003 19:46:19 -0000 1.24 --- BulletScanner.java 28 Oct 2003 03:04:18 -0000 1.25 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; import org.htmlparser.lexer.Page; --- 29,32 ---- *************** *** 50,68 **** { private static final String [] MATCH_STRING = {"LI"}; ! private final static String ENDERS [] = { "BODY", "HTML" }; private final static String END_TAG_ENDERS [] = { "UL" }; - private Stack ulli; - - public BulletScanner(Stack ulli) - { - this("",ulli); - } ! public BulletScanner(String filter, Stack ulli) { super(filter, MATCH_STRING, ENDERS, END_TAG_ENDERS, false); - this.ulli = ulli; } ! public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException { --- 49,60 ---- { private static final String [] MATCH_STRING = {"LI"}; ! private final static String ENDERS [] = { "LI", "BODY", "HTML" }; private final static String END_TAG_ENDERS [] = { "UL" }; ! public BulletScanner(String filter) { super(filter, MATCH_STRING, ENDERS, END_TAG_ENDERS, false); } ! 
public Tag createTag(Page page, int start, int end, Vector attributes, Tag startTag, Tag endTag, NodeList children) throws ParserException { *************** *** 85,110 **** return MATCH_STRING; } - - /** - * This is the logic that decides when a bullet tag can be allowed - */ - public boolean shouldCreateEndTagAndExit() - { - if (ulli.size()==0) - return false; - CompositeTagScanner parentScanner = (CompositeTagScanner)ulli.peek(); - if (parentScanner == this) - { - ulli.pop(); - return true; - } - else - return false; - } - - public void beforeScanningStarts() - { - ulli.push(this); - } - } --- 77,79 ---- Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.73 retrieving revision 1.74 diff -C2 -d -r1.73 -r1.74 *** CompositeTagScanner.java 26 Oct 2003 19:46:19 -0000 1.73 --- CompositeTagScanner.java 28 Oct 2003 03:04:18 -0000 1.74 *************** *** 188,196 **** /** * Collect the children. - * Performs an immediate call to {@link #shouldCreateEndTagAndExit} to - * allow subclasses to override the scan is a primitive way. If - * <code>true</code>, returns a virtual end tag and repositions the lexer - * to re-read that same tag.<p> - * Otherwise, calls {@link #beforeScanningStarts} and begins scanning. * An initial test is performed for an empty XML tag, in which case * the start tag and end tag of the returned tag are the same and it has --- 188,191 ---- *************** *** 202,206 **** * <code>false</code>. In all but the first case, a virtual end tag * is created. Each node found that is not the end tag is added to ! * the list of children and a call made to {@link #childNodeEncountered}.<p> * The scanner's {@link #createTag} method is called with details about * the start tag, end tag and children. The attributes from the start tag --- 197,201 ---- * <code>false</code>. 
In all but the first case, a virtual end tag * is created. Each node found that is not the end tag is added to ! * the list of children.<p> * The scanner's {@link #createTag} method is called with details about * the start tag, end tag and children. The attributes from the start tag *************** *** 211,217 **** * @param url The url for the page the tag is discovered on. * @param lexer The source of subsequent nodes. ! * @return The scanner specific tag from the call to {@link #createTag}., ! * or the virtual end tag if {@link #shouldCreateEndTagAndExit} returned ! * <code>true</code>. */ public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException --- 206,210 ---- * @param url The url for the page the tag is discovered on. * @param lexer The source of subsequent nodes. ! * @return The scanner specific tag from the call to {@link #createTag}. */ public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException *************** *** 220,284 **** NodeList nodeList; Tag endTag; CompositeTag composite; Tag ret; ! if (shouldCreateEndTagAndExit ()) ! { ! ret = createVirtualEndTag (tag, lexer.getPage (), tag.elementBegin ()); ! lexer.setPosition (tag.elementBegin ()); ! } else ! { ! beforeScanningStarts (); ! nodeList = new NodeList (); ! endTag = null; ! ! if (tag.isEmptyXmlTag ()) ! endTag = tag; ! else ! do { ! node = lexer.nextNode (balance_quotes); ! if (null != node) { ! if (node instanceof Tag) { ! Tag end = (Tag)node; ! // check for normal end tag ! if (end.isEndTag () && end.getTagName ().equals (tag.getTagName ())) ! { ! endTag = end; ! node = null; ! } ! else if (isTagToBeEndedFor (end) || // check DTD ! ( // check for child of same name not allowed ! !(end.isEndTag ()) && ! !isAllowSelfChildren () && ! end.getTagName ().equals (tag.getTagName ()) ! )) ! { ! endTag = createVirtualEndTag (tag, lexer.getPage (), end.elementBegin ()); ! lexer.setPosition (end.elementBegin ()); ! node = null; ! } } ! ! if (null != node) { ! 
nodeList.add (node); ! childNodeEncountered (node); } } } while (null != node); ! ! if (null == endTag) ! endTag = createVirtualEndTag (tag, lexer.getPage (), lexer.getCursor ().getPosition ()); ! ! composite = (CompositeTag)createTag (lexer.getPage (), tag.elementBegin (), endTag.elementEnd (), tag.getAttributesEx (), tag, endTag, nodeList); ! for (int i = 0; i < composite.getChildCount (); i++) ! composite.childAt (i).setParent (composite); ! ret = composite; ! } return (ret); --- 213,280 ---- NodeList nodeList; Tag endTag; + String name; + TagScanner scanner; CompositeTag composite; Tag ret; ! nodeList = new NodeList (); ! endTag = null; ! ! if (tag.isEmptyXmlTag ()) ! endTag = tag; else ! do ! { ! node = lexer.nextNode (balance_quotes); ! if (null != node) { ! if (node instanceof Tag) { ! Tag next = (Tag)node; ! name = next.getTagName (); ! // check for normal end tag ! if (next.isEndTag () && name.equals (tag.getTagName ())) { ! endTag = next; ! node = null; } ! else if (isTagToBeEndedFor (next) || // check DTD ! ( // check for child of same name not allowed ! !(next.isEndTag ()) && ! !isAllowSelfChildren () && ! name.equals (tag.getTagName ()) ! )) { ! // insert a virtual end tag and backup one node ! endTag = createVirtualEndTag (tag, lexer.getPage (), next.elementBegin ()); ! lexer.setPosition (next.elementBegin ()); ! node = null; ! } ! else if (!next.isEndTag ()) ! { ! // now recurse if there is a scanner for this type of tag ! // whoah! really cheat here to get the parser ! // maybe eventually the tag will know it's own scanner eh ! org.htmlparser.Parser parser = (org.htmlparser.Parser)lexer.getNodeFactory (); ! scanner = parser.getScanner (name); ! if ((null != scanner) && scanner.evaluate (next, this)) ! node = scanner.createScannedNode (next, lexer.getPage ().getUrl (), lexer); } } + + if (null != node) + nodeList.add (node); } + } while (null != node); ! ! if (null == endTag) ! 
endTag = createVirtualEndTag (tag, lexer.getPage (), lexer.getCursor ().getPosition ()); ! ! composite = (CompositeTag)createTag (lexer.getPage (), tag.elementBegin (), endTag.elementEnd (), tag.getAttributesEx (), tag, endTag, nodeList); ! for (int i = 0; i < composite.getChildCount (); i++) ! composite.childAt (i).setParent (composite); ! ret = composite; ! return (ret); *************** *** 312,332 **** /** - * Override this method if you wish to create any data structures or do anything - * before the start of the scan. This is just after a tag has triggered the scanner - * but before the scanner begins its processing. - */ - public void beforeScanningStarts() - { - } - - /** - * This method is called everytime a child to the composite is found. It is useful when we - * need to store special children seperately. Though, all children are collected anyway into a node list. - */ - public void childNodeEncountered(Node node) - { - } - - /** * For composite tags this shouldn't be used and hence throws an exception. */ --- 308,311 ---- *************** *** 368,383 **** } ! public final boolean isAllowSelfChildren() { return allowSelfChildren; - } - - /** - * Override this method to implement scanner logic that determines if the current scanner is - * to be allowed. This is useful when there are rules which dont allow recursive tags of the same - * type. @see BulletScanner - * @return boolean true/false - */ - public boolean shouldCreateEndTagAndExit() { - return false; } } --- 347,353 ---- } ! public final boolean isAllowSelfChildren() ! 
{ return allowSelfChildren; } } Index: FormScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/FormScanner.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** FormScanner.java 27 Oct 2003 02:18:04 -0000 1.48 --- FormScanner.java 28 Oct 2003 03:04:18 -0000 1.49 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; --- 29,32 ---- *************** *** 50,55 **** private static final String [] formTagEnders = {"HTML","BODY"}; - private Stack stack = new Stack(); - /** * Constructs a form scanner. --- 49,52 ---- *************** *** 69,75 **** super(filter,MATCH_ID,formTagEnders,false); parser.addScanner(new InputTagScanner("-i")); ! parser.addScanner(new TextareaTagScanner("-t",stack)); ! parser.addScanner(new SelectTagScanner("-select", stack)); ! parser.addScanner(new OptionTagScanner("-option",stack)); } --- 66,72 ---- super(filter,MATCH_ID,formTagEnders,false); parser.addScanner(new InputTagScanner("-i")); ! parser.addScanner(new TextareaTagScanner("-t")); ! parser.addScanner(new SelectTagScanner("-select")); ! parser.addScanner(new OptionTagScanner("-option")); } *************** *** 139,148 **** FormTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... 
- if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new FormTag (); ret.setPage (page); --- 136,139 ---- *************** *** 161,169 **** return (ret); - } - - public void beforeScanningStarts() - { - stack.push(this); } } --- 152,155 ---- Index: OptionTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/OptionTagScanner.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** OptionTagScanner.java 26 Oct 2003 19:46:21 -0000 1.35 --- OptionTagScanner.java 28 Oct 2003 03:04:18 -0000 1.36 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; import org.htmlparser.lexer.Page; --- 29,32 ---- *************** *** 43,55 **** private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = { "SELECT", "FORM", "BODY", "HTML" }; - private Stack stack; - - public OptionTagScanner(Stack stack) { - this("", stack); - } ! public OptionTagScanner(String filter, Stack stack) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); - this.stack = stack; } --- 42,48 ---- private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = { "SELECT", "FORM", "BODY", "HTML" }; ! public OptionTagScanner(String filter) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } *************** *** 62,71 **** OptionTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... 
- if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new OptionTag (); ret.setPage (page); --- 55,58 ---- *************** *** 76,110 **** ret.setEndTag (endTag); ret.setChildren (children); - - return (ret); - } - - public void beforeScanningStarts () - { - stack.push (this); - } - - /** - * This is the logic that decides when a option tag can be allowed - */ - public boolean shouldCreateEndTagAndExit () - { - boolean ret; - - ret = false; - - if (0 != stack.size ()) - { - TagScanner parentScanner = (TagScanner)stack.peek (); - if (parentScanner instanceof CompositeTagScanner) - { - CompositeTagScanner scanner = (CompositeTagScanner)parentScanner; - if (scanner.tagEnderSet.contains (MATCH_NAME[0])) // should loop over names - { - stack.pop (); - ret = true; - } - } - } return (ret); --- 63,66 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** ScriptScanner.java 26 Oct 2003 19:46:21 -0000 1.44 --- ScriptScanner.java 28 Oct 2003 03:04:18 -0000 1.45 *************** *** 135,139 **** else // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); } else if (node instanceof RemarkNode) --- 135,139 ---- else // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer.getPage (), node.elementBegin (), node.elementEnd ()); } else if (node instanceof RemarkNode) *************** *** 145,149 **** // TODO: need to remove this cast // last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); ! 
last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); } } --- 145,149 ---- // TODO: need to remove this cast // last = (StringNode)factory.createStringNode (lexer, node.elementBegin (), node.elementEnd ()); ! last = (StringNode)factory.createStringNode (lexer.getPage (), node.elementBegin (), node.elementEnd ()); } } *************** *** 163,167 **** if (null == last) // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer, position, position); // build new end tag if required if (null == end) --- 163,167 ---- if (null == last) // TODO: need to remove this cast ! last = (StringNode)factory.createStringNode (lexer.getPage (), position, position); // build new end tag if required if (null == end) Index: SelectTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/SelectTagScanner.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** SelectTagScanner.java 26 Oct 2003 19:46:21 -0000 1.33 --- SelectTagScanner.java 28 Oct 2003 03:04:18 -0000 1.34 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; --- 29,32 ---- *************** *** 46,60 **** private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; - private Stack stack; - - public SelectTagScanner(Stack stack) - { - this("", stack); - } ! public SelectTagScanner(String filter, Stack stack) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); - this.stack = stack; } --- 45,52 ---- private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; ! 
public SelectTagScanner(String filter) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } *************** *** 68,77 **** SelectTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... - if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new SelectTag (); ret.setPage (page); --- 60,63 ---- *************** *** 82,116 **** ret.setEndTag (endTag); ret.setChildren (children); - - return (ret); - } - - public void beforeScanningStarts () - { - stack.push (this); - } - - /** - * This is the logic that decides when a option tag can be allowed - */ - public boolean shouldCreateEndTagAndExit () - { - boolean ret; - - ret = false; - - if (0 != stack.size ()) - { - TagScanner parentScanner = (TagScanner)stack.peek (); - if (parentScanner instanceof CompositeTagScanner) - { - CompositeTagScanner scanner = (CompositeTagScanner)parentScanner; - if (scanner.tagEnderSet.contains (MATCH_NAME[0])) // should loop over names - { - stack.pop (); - ret = true; - } - } - } return (ret); --- 68,71 ---- Index: TextareaTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TextareaTagScanner.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** TextareaTagScanner.java 26 Oct 2003 19:46:21 -0000 1.30 --- TextareaTagScanner.java 28 Oct 2003 03:04:18 -0000 1.31 *************** *** 29,33 **** package org.htmlparser.scanners; - import java.util.Stack; import java.util.Vector; import org.htmlparser.lexer.Page; --- 29,32 ---- *************** *** 43,57 **** private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; - private Stack stack; - - public TextareaTagScanner(Stack stack) - { - this("", stack); - } ! 
public TextareaTagScanner(String filter, Stack stack) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); - this.stack = stack; } --- 42,49 ---- private static final String [] ENDERS = { "INPUT", "TEXTAREA", "SELECT", "OPTION" }; private static final String [] END_TAG_ENDERS = {"FORM", "BODY", "HTML" }; ! public TextareaTagScanner(String filter) { super(filter, MATCH_NAME, ENDERS, END_TAG_ENDERS, false); } *************** *** 64,73 **** TextareaTag ret; - // special step here... - // not sure why the recursion is tracked this way, - // rather than using the ENDERS and END_TAG_ENDERS arrays... - if (!stack.empty () && (this == stack.peek ())) - stack.pop (); - ret = new TextareaTag (); ret.setPage (page); --- 56,59 ---- *************** *** 78,112 **** ret.setEndTag (endTag); ret.setChildren (children); - - return (ret); - } - - public void beforeScanningStarts () - { - stack.push (this); - } - - /** - * This is the logic that decides when a option tag can be allowed - */ - public boolean shouldCreateEndTagAndExit () - { - boolean ret; - - ret = false; - - if (0 != stack.size ()) - { - TagScanner parentScanner = (TagScanner)stack.peek (); - if (parentScanner instanceof CompositeTagScanner) - { - CompositeTagScanner scanner = (CompositeTagScanner)parentScanner; - if (scanner.tagEnderSet.contains (MATCH_NAME[0])) // should loop over names - { - stack.pop (); - ret = true; - } - } - } return (ret); --- 64,67 ---- |