[Htmlparser-cvs] htmlparser/src/org/htmlparser/visitors HtmlPage.java,1.34,1.35 NodeVisitor.java,1.2
Brought to you by:
derrickoswald
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors In directory sc8-pr-cvs1:/tmp/cvs-serv30684/visitors Modified Files: HtmlPage.java NodeVisitor.java TagFindingVisitor.java TextExtractingVisitor.java UrlModifyingVisitor.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: HtmlPage.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/HtmlPage.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** HtmlPage.java 22 Sep 2003 02:40:16 -0000 1.34 --- HtmlPage.java 28 Sep 2003 15:34:00 -0000 1.35 *************** *** 34,38 **** import org.htmlparser.StringNode; import org.htmlparser.scanners.TableScanner; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.TableTag; import org.htmlparser.tags.Tag; --- 34,37 ---- *************** *** 63,75 **** } ! public void visitTag(Tag tag) { ! addTagToBodyIfApplicable(tag); ! ! if (isTable(tag)) { ! tables.add(tag); ! } ! else { if (isBodyTag(tag)) ! 
bodyTagBegin = true; } } --- 62,84 ---- } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! { if (isBodyTag(tag)) ! bodyTagBegin = false; ! addTagToBodyIfApplicable(tag); ! } ! else ! { ! addTagToBodyIfApplicable(tag); ! ! if (isTable(tag)) { ! tables.add(tag); ! } ! else { ! if (isBodyTag(tag)) ! bodyTagBegin = true; ! } } } *************** *** 82,91 **** if (bodyTagBegin) nodesInBody.add(node); - } - - public void visitEndTag(EndTag endTag) { - if (isBodyTag(endTag)) - bodyTagBegin = false; - addTagToBodyIfApplicable(endTag); } --- 91,94 ---- Index: NodeVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/NodeVisitor.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** NodeVisitor.java 22 Sep 2003 02:40:16 -0000 1.29 --- NodeVisitor.java 28 Sep 2003 15:34:00 -0000 1.30 *************** *** 31,38 **** import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; ! import org.htmlparser.tags.EndTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; - import org.htmlparser.tags.Tag; import org.htmlparser.tags.TitleTag; --- 31,37 ---- import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; ! 
import org.htmlparser.tags.Tag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.TitleTag; *************** *** 66,73 **** public void visitImageTag(ImageTag imageTag) { - } - - public void visitEndTag(EndTag endTag) { - } --- 65,68 ---- Index: TagFindingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TagFindingVisitor.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** TagFindingVisitor.java 22 Sep 2003 02:40:16 -0000 1.35 --- TagFindingVisitor.java 28 Sep 2003 15:34:00 -0000 1.36 *************** *** 32,36 **** import org.htmlparser.Node; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeList; --- 32,35 ---- *************** *** 68,72 **** } ! public void visitTag(Tag tag) { for (int i=0;i<tagsToBeFound.length;i++) if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { --- 67,82 ---- } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! { ! if (!endTagCheck) return; ! for (int i=0;i<tagsToBeFound.length;i++) ! if (tag.getTagName().substring (1).equalsIgnoreCase(tagsToBeFound[i])) ! { ! endTagCount[i]++; ! endTags[i].add(tag); ! } ! 
} for (int i=0;i<tagsToBeFound.length;i++) if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { *************** *** 78,90 **** public Node [] getTags(int index) { return tags[index].toNodeArray(); - } - - public void visitEndTag(EndTag endTag) { - if (!endTagCheck) return; - for (int i=0;i<tagsToBeFound.length;i++) - if (endTag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { - endTagCount[i]++; - endTags[i].add(endTag); - } } --- 88,91 ---- Index: TextExtractingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** TextExtractingVisitor.java 22 Sep 2003 02:40:16 -0000 1.33 --- TextExtractingVisitor.java 28 Sep 2003 15:34:00 -0000 1.34 *************** *** 30,34 **** import org.htmlparser.StringNode; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.TitleTag; --- 30,33 ---- *************** *** 76,87 **** } ! public void visitEndTag(EndTag endTag) { ! if (isPreTag(endTag)) ! preTagBeingProcessed = false; ! } ! ! public void visitTag(Tag tag) { ! if (isPreTag(tag)) ! preTagBeingProcessed = true; } --- 75,90 ---- } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! { ! if (isPreTag(tag)) ! preTagBeingProcessed = false; ! } ! else ! { ! if (isPreTag(tag)) ! preTagBeingProcessed = true; ! 
} } Index: UrlModifyingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/UrlModifyingVisitor.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** UrlModifyingVisitor.java 22 Sep 2003 02:40:16 -0000 1.32 --- UrlModifyingVisitor.java 28 Sep 2003 15:34:00 -0000 1.33 *************** *** 33,37 **** import org.htmlparser.StringNode; import org.htmlparser.scanners.LinkScanner; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; --- 33,36 ---- *************** *** 66,79 **** } - public void visitEndTag(EndTag endTag) { - modifiedResult.append(endTag.toHtml()); - } - public void visitStringNode(StringNode stringNode) { modifiedResult.append(stringNode.toHtml()); } ! public void visitTag(Tag tag) { ! modifiedResult.append(tag.toHtml()); } --- 65,78 ---- } public void visitStringNode(StringNode stringNode) { modifiedResult.append(stringNode.toHtml()); } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! modifiedResult.append(tag.toHtml()); ! else ! modifiedResult.append(tag.toHtml()); } |