htmlparser-cvs Mailing List for HTML Parser (Page 38)
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper In directory sc8-pr-cvs1:/tmp/cvs-serv30684/parserHelper Modified Files: CompositeTagScannerHelper.java ScriptScannerHelper.java Removed Files: AttributeParser.java StringParser.java TagParser.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as it's own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: CompositeTagScannerHelper.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/CompositeTagScannerHelper.java,v retrieving revision 1.47 retrieving revision 1.48 diff -C2 -d -r1.47 -r1.48 *** CompositeTagScannerHelper.java 22 Sep 2003 02:39:59 -0000 1.47 --- CompositeTagScannerHelper.java 28 Sep 2003 15:33:58 -0000 1.48 *************** *** 29,37 **** package org.htmlparser.parserHelper; import org.htmlparser.Node; ! 
import org.htmlparser.NodeReader; import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; --- 29,40 ---- package org.htmlparser.parserHelper; + import java.util.Vector; import org.htmlparser.Node; ! import org.htmlparser.lexer.Cursor; ! import org.htmlparser.lexer.Lexer; ! import org.htmlparser.lexer.nodes.Attribute; ! import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; *************** *** 43,49 **** private CompositeTagScanner scanner; private Tag tag; ! private String url; ! private NodeReader reader; ! private String currLine; private Tag endTag; private NodeList nodeList; --- 46,50 ---- private CompositeTagScanner scanner; private Tag tag; ! private Lexer mLexer; private Tag endTag; private NodeList nodeList; *************** *** 56,69 **** CompositeTagScanner scanner, Tag tag, ! String url, ! NodeReader reader, ! String currLine, boolean balance_quotes) { this.scanner = scanner; this.tag = tag; ! this.url = url; ! this.reader = reader; ! this.currLine = currLine; this.endTag = null; this.nodeList = new NodeList(); --- 57,66 ---- CompositeTagScanner scanner, Tag tag, ! Lexer lexer, boolean balance_quotes) { this.scanner = scanner; this.tag = tag; ! mLexer = lexer; this.endTag = null; this.nodeList = new NodeList(); *************** *** 73,77 **** public Tag scan() throws ParserException { ! this.startingLineNumber = reader.getLastLineNumber(); if (shouldCreateEndTagAndExit()) { return createEndTagAndRepositionReader(); --- 70,74 ---- public Tag scan() throws ParserException { ! 
startingLineNumber = mLexer.getCurrentLineNumber (); if (shouldCreateEndTagAndExit()) { return createEndTagAndRepositionReader(); *************** *** 83,89 **** if (!endTagFound) { do { ! currentNode = reader.readElement(balance_quotes); ! if (currentNode==null) continue; ! currLine = reader.getCurrentLine(); if (currentNode instanceof Tag) doForceCorrectionCheckOn((Tag)currentNode); --- 80,86 ---- if (!endTagFound) { do { ! currentNode = mLexer.nextNode (); // balance_quotes ? ! if (currentNode==null) ! continue; if (currentNode instanceof Tag) doForceCorrectionCheckOn((Tag)currentNode); *************** *** 95,103 **** while (currentNode!=null && !endTagFound); } ! if (endTag==null) { ! createCorrectionEndTagBefore(reader.getLastReadPosition()+1); ! } ! this.endingLineNumber = reader.getLastLineNumber(); return createTag(); } --- 92,99 ---- while (currentNode!=null && !endTagFound); } ! if (endTag==null) ! createCorrectionEndTagBefore (mLexer.getCursor ().getPosition ()); ! endingLineNumber = mLexer.getCurrentLineNumber (); return createTag(); } *************** *** 108,161 **** private Tag createEndTagAndRepositionReader() { ! createCorrectionEndTagBefore(tag.elementBegin()); ! reader.setPosInLine(tag.elementBegin()); ! reader.setDontReadNextLine(true); return endTag; } ! private void createCorrectionEndTagBefore(int pos) { ! String endTagName = tag.getTagName(); ! int endTagBegin = pos ; ! int endTagEnd = endTagBegin + endTagName.length() + 2; ! endTag = new EndTag( ! new TagData( ! endTagBegin, ! endTagEnd, endTagName, ! currLine ! ) ! ); } private void createCorrectionEndTagBefore(Tag possibleEndTagCauser) { ! String endTagName = tag.getTagName(); int endTagBegin = possibleEndTagCauser.elementBegin(); int endTagEnd = endTagBegin + endTagName.length() + 2; possibleEndTagCauser.setTagBegin(endTagEnd+1); ! reader.addNextParsedNode(possibleEndTagCauser); ! endTag = new EndTag( ! new TagData( ! endTagBegin, ! endTagEnd, endTagName, ! currLine ! ) ! ); } ! 
private Tag createTag() throws ParserException { ! CompositeTag newTag = ! (CompositeTag) ! scanner.createTag( ! new TagData( ! tag.elementBegin(), ! endTag.elementEnd(), ! startingLineNumber, ! endingLineNumber, ! tag.getText(), ! currLine, ! url, ! tag.isEmptyXmlTag() ! ), new CompositeTagData( tag,endTag,nodeList --- 104,156 ---- private Tag createEndTagAndRepositionReader() { ! createCorrectionEndTagBefore (tag.elementBegin ()); ! mLexer.setPosition (tag.elementBegin ()); return endTag; } ! private void createCorrectionEndTagBefore(int position) ! { ! String endTagName = "/" + tag.getTagName(); ! Vector attributes = new Vector (); ! attributes.addElement (new Attribute (endTagName, (String)null, (char)0)); ! TagData data = new TagData( endTagName, ! position, ! attributes, ! mLexer.getPage ().getUrl (), ! false); ! endTag = new Tag (data); } private void createCorrectionEndTagBefore(Tag possibleEndTagCauser) { ! String endTagName = "/" + tag.getTagName(); int endTagBegin = possibleEndTagCauser.elementBegin(); int endTagEnd = endTagBegin + endTagName.length() + 2; possibleEndTagCauser.setTagBegin(endTagEnd+1); ! Vector attributes = new Vector (); ! attributes.addElement (new Attribute (endTagName, (String)null, (char)0)); ! TagData data = new TagData( endTagName, ! endTagBegin, ! attributes, ! mLexer.getPage ().getUrl (), ! false); ! ! endTag = new Tag(data); } ! private Tag createTag() throws ParserException ! { ! TagData data; ! ! data = new TagData( ! mLexer.getPage (), ! tag.elementBegin(), ! endTag.elementEnd(), ! tag.getAttributesEx (), ! mLexer.getPage ().getUrl (), ! tag.isEmptyXmlTag ()); ! ! CompositeTag newTag = (CompositeTag)scanner.createTag (data, new CompositeTagData( tag,endTag,nodeList *************** *** 169,179 **** } ! private void doChildAndEndTagCheckOn(Node currentNode) { ! if (currentNode instanceof EndTag) { ! EndTag possibleEndTag = (EndTag)currentNode; ! if (isExpectedEndTag(possibleEndTag)) { ! endTagFound = true; ! 
endTag = possibleEndTag; ! return; } } --- 164,182 ---- } ! private void doChildAndEndTagCheckOn(Node currentNode) ! { ! Tag tag; ! ! if (currentNode instanceof Tag) ! { ! tag = (Tag)currentNode; ! if (tag.isEndTag ()) ! { ! if (isExpectedEndTag (tag)) ! { ! endTagFound = true; ! endTag =tag; ! return; ! } } } *************** *** 182,187 **** } ! private boolean isExpectedEndTag(EndTag possibleEndTag) { ! return possibleEndTag.getTagName().equals(tag.getTagName()); } --- 185,191 ---- } ! private boolean isExpectedEndTag (TagNode possibleEndTag) ! { ! return (possibleEndTag.getTagName().equals (tag.getTagName ())); } *************** *** 212,216 **** private boolean isSelfChildTagRecievedIncorrectly(Tag possibleEndTag) { return ( ! !(possibleEndTag instanceof EndTag) && !scanner.isAllowSelfChildren() && possibleEndTag.getTagName().equals(tag.getTagName()) --- 216,220 ---- private boolean isSelfChildTagRecievedIncorrectly(Tag possibleEndTag) { return ( ! !(possibleEndTag.isEndTag ()) && !scanner.isAllowSelfChildren() && possibleEndTag.getTagName().equals(tag.getTagName()) Index: ScriptScannerHelper.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserHelper/ScriptScannerHelper.java,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** ScriptScannerHelper.java 22 Sep 2003 02:39:59 -0000 1.12 --- ScriptScannerHelper.java 28 Sep 2003 15:33:58 -0000 1.13 *************** *** 29,32 **** --- 29,33 ---- import org.htmlparser.*; + import org.htmlparser.lexer.Lexer; import org.htmlparser.scanners.*; import org.htmlparser.tags.*; *************** *** 38,41 **** --- 39,43 ---- public class ScriptScannerHelper { + private Lexer mLexer; private int endTagLoc; private Tag endTag; *************** *** 44,48 **** private boolean sameLine; private boolean endTagFound; - private NodeReader reader; private StringBuffer scriptContents; private ScriptScanner scriptScanner; --- 
46,49 ---- *************** *** 51,56 **** private String currLine; ! public ScriptScannerHelper(Tag tag, String url, NodeReader nodeReader, String currLine, ScriptScanner scriptScanner) { ! this.reader = nodeReader; this.scriptScanner = scriptScanner; this.tag = tag; --- 52,58 ---- private String currLine; ! public ScriptScannerHelper(Tag tag, Lexer lexer, ScriptScanner scriptScanner) ! { ! mLexer = lexer; this.scriptScanner = scriptScanner; this.tag = tag; *************** *** 60,64 **** public Tag scan() throws ParserException { ! int startLine = reader.getLastLineNumber(); startTag = tag; extractScriptTagFrom(currLine); --- 62,66 ---- public Tag scan() throws ParserException { ! int startLine = mLexer.getCurrentLineNumber (); startTag = tag; extractScriptTagFrom(currLine); *************** *** 69,84 **** } ! private Tag createScriptTagUsing(String url, String currLine, int startLine) { return scriptScanner.createTag( ! new TagData( ! startTag.elementBegin(), ! endTag.elementEnd(), ! startLine, ! reader.getLastLineNumber(), ! startTag.getText(), ! currLine, ! url, ! false ! ), new CompositeTagData( startTag,endTag,createChildrenNodeList() ) --- 71,89 ---- } ! private Tag createScriptTagUsing(String url, String currLine, int startLine) ! { ! TagData data; ! ! data = new TagData( ! mLexer.getPage (), ! startTag.elementBegin(), ! endTag.elementEnd(), ! startTag.getAttributesEx (), ! mLexer.getPage ().getUrl (), ! startTag.isEmptyXmlTag ()); ! return scriptScanner.createTag( ! data, ! new CompositeTagData( startTag,endTag,createChildrenNodeList() ) *************** *** 100,114 **** private void createScriptEndTag(Tag tag, String currLine) { // If end tag doesn't exist, create one ! String endTagName = tag.getTagName(); ! int endTagBegin = reader.getLastReadPosition()+1 ; ! int endTagEnd = endTagBegin + endTagName.length() + 2; ! endTag = new EndTag( new TagData( - endTagBegin, - endTagEnd, endTagName, ! currLine ! ) ! 
); } --- 105,118 ---- private void createScriptEndTag(Tag tag, String currLine) { // If end tag doesn't exist, create one ! String endTagName = "/" + tag.getTagName(); ! int endTagBegin = mLexer.getPosition (); ! endTag = new Tag( new TagData( endTagName, ! endTagBegin, ! null, ! mLexer.getPage ().getUrl (), ! false) ! ); } *************** *** 118,139 **** private void extractScriptTagFrom(String currLine) throws ParserException { ! String line = null; ! scriptContents = new StringBuffer(); ! endTagFound = false; ! ! endTag = null; ! line = currLine; ! sameLine = true; ! startingPos = startTag.elementEnd(); ! do { ! doExtractionOfScriptContentsFrom(line); ! if (!endTagFound) { ! line = reader.getNextLine(); ! startingPos = 0; ! } ! if (sameLine) ! sameLine = false; ! } ! while (line!=null && !endTagFound); } --- 122,144 ---- private void extractScriptTagFrom(String currLine) throws ParserException { ! throw new IllegalStateException ("not implemented"); ! // String line = null; ! // scriptContents = new StringBuffer(); ! // endTagFound = false; ! // ! // endTag = null; ! // line = currLine; ! // sameLine = true; ! // startingPos = startTag.elementEnd(); ! // do { ! // doExtractionOfScriptContentsFrom(line); ! // if (!endTagFound) { ! // line = reader.getNextLine(); ! // startingPos = 0; ! // } ! // if (sameLine) ! // sameLine = false; ! // } ! // while (line!=null && !endTagFound); } *************** *** 163,181 **** private void extractEndTagFrom(String line) throws ParserException { ! endTagFound = true; ! endTag = (EndTag)EndTag.find(line,endTagLoc); ! if (sameLine) ! scriptContents.append( ! getCodeBetweenStartAndEndTags( ! line, ! startTag, ! endTagLoc) ! ); ! else { ! scriptContents.append(Parser.getLineSeparator()); ! scriptContents.append(line.substring(0,endTagLoc)); ! } ! ! reader.setPosInLine(endTag.elementEnd()); } --- 168,187 ---- private void extractEndTagFrom(String line) throws ParserException { ! 
throw new IllegalStateException ("not implemented"); ! // endTagFound = true; ! // endTag = (EndTag)EndTag.find(line,endTagLoc); ! // if (sameLine) ! // scriptContents.append( ! // getCodeBetweenStartAndEndTags( ! // line, ! // startTag, ! // endTagLoc) ! // ); ! // else { ! // scriptContents.append(Parser.getLineSeparator()); ! // scriptContents.append(line.substring(0,endTagLoc)); ! // } ! // ! // mLexer.setPosition (endTag.elementEnd ()); } --- AttributeParser.java DELETED --- --- StringParser.java DELETED --- --- TagParser.java DELETED --- |
From: <der...@us...> - 2003-09-29 21:45:41
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv30684/util Modified Files: Generate.java IteratorImpl.java ParserUtils.java Log Message: Lexer Integration. Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a Hashtable. 
Index: Generate.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/Generate.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** Generate.java 22 Sep 2003 02:40:15 -0000 1.42 --- Generate.java 28 Sep 2003 15:33:59 -0000 1.43 *************** *** 37,41 **** import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.Tag; --- 37,40 ---- *************** *** 186,195 **** { String contents = ((Tag)node).getText (); - if (contents.equals ("BR") || contents.equals ("P")) - buffer.append (nl); - } - else if (node instanceof EndTag) - { - String contents = ((EndTag)node).getText (); if (contents.equals ("BR") || contents.equals ("P")) buffer.append (nl); --- 185,188 ---- Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** IteratorImpl.java 22 Sep 2003 02:40:15 -0000 1.28 --- IteratorImpl.java 28 Sep 2003 15:33:59 -0000 1.29 *************** *** 32,49 **** import org.htmlparser.Node; ! import org.htmlparser.NodeReader; public class IteratorImpl implements PeekingIterator { ! NodeReader reader; Vector preRead; - String resourceLocn; ParserFeedback feedback; ! public IteratorImpl (NodeReader rd, String resource, ParserFeedback fb) { ! reader = rd; preRead = new Vector (25); - resourceLocn = resource; feedback = fb; } --- 32,47 ---- import org.htmlparser.Node; ! import org.htmlparser.lexer.Lexer; public class IteratorImpl implements PeekingIterator { ! Lexer mLexer; Vector preRead; ParserFeedback feedback; ! public IteratorImpl (Lexer lexer, ParserFeedback fb) { ! 
mLexer = lexer; preRead = new Vector (25); feedback = fb; } *************** *** 53,62 **** Node ret; ! if (null == reader) ret = null; else try { ! ret = reader.readElement(); if (null != ret) preRead.addElement (ret); --- 51,60 ---- Node ret; ! if (null == mLexer) ret = null; else try { ! ret = mLexer.nextNode (); if (null != ret) preRead.addElement (ret); *************** *** 65,71 **** StringBuffer msgBuffer = new StringBuffer(); msgBuffer.append("Unexpected Exception occurred while reading "); ! msgBuffer.append(resourceLocn); msgBuffer.append(", in nextHTMLNode"); ! reader.appendLineDetails(msgBuffer); ParserException ex = new ParserException(msgBuffer.toString(),e); feedback.error(msgBuffer.toString(),ex); --- 63,69 ---- StringBuffer msgBuffer = new StringBuffer(); msgBuffer.append("Unexpected Exception occurred while reading "); ! msgBuffer.append(mLexer.getPage ().getUrl ()); msgBuffer.append(", in nextHTMLNode"); ! // reader.appendLineDetails(msgBuffer); ParserException ex = new ParserException(msgBuffer.toString(),e); feedback.error(msgBuffer.toString(),ex); *************** *** 83,87 **** boolean ret; ! if (null == reader) ret = false; else if (0 != preRead.size ()) --- 81,85 ---- boolean ret; ! if (null == mLexer) ret = false; else if (0 != preRead.size ()) Index: ParserUtils.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** ParserUtils.java 22 Sep 2003 02:40:15 -0000 1.30 --- ParserUtils.java 28 Sep 2003 15:33:59 -0000 1.31 *************** *** 34,38 **** import org.htmlparser.Node; ! import org.htmlparser.NodeReader; import org.htmlparser.tags.Tag; --- 34,38 ---- import org.htmlparser.Node; ! import org.htmlparser.Parser; import org.htmlparser.tags.Tag; *************** *** 57,71 **** } ! 
public static Map adjustScanners(NodeReader reader) { Map tempScanners = new Hashtable(); ! tempScanners = reader.getParser().getScanners(); // Remove all existing scanners ! reader.getParser().flushScanners(); return tempScanners; } ! public static void restoreScanners(NodeReader reader, Map tempScanners) { // Flush the scanners ! reader.getParser().setScanners(tempScanners); } --- 57,71 ---- } ! public static Map adjustScanners(Parser parser) { Map tempScanners = new Hashtable(); ! tempScanners = parser.getScanners(); // Remove all existing scanners ! parser.flushScanners(); return tempScanners; } ! public static void restoreScanners(Parser parser, Map tempScanners) { // Flush the scanners ! parser.setScanners(tempScanners); } |
From: <der...@us...> - 2003-09-29 21:45:40
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests/utilTests Modified Files: HTMLTagParserTest.java Log Message: Lexer Integration. Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a Hashtable. Index: HTMLTagParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/HTMLTagParserTest.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** HTMLTagParserTest.java 22 Sep 2003 02:40:14 -0000 1.45 --- HTMLTagParserTest.java 28 Sep 2003 15:33:59 -0000 1.46 *************** *** 29,33 **** package org.htmlparser.tests.utilTests; - import org.htmlparser.parserHelper.TagParser; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; --- 29,32 ---- *************** *** 36,40 **** public class HTMLTagParserTest extends ParserTestCase { ! 
private TagParser tagParser; public HTMLTagParserTest(String name) { --- 35,39 ---- public class HTMLTagParserTest extends ParserTestCase { ! // private TagParser tagParser; public HTMLTagParserTest(String name) { *************** *** 43,64 **** public void testCorrectTag() { ! Tag tag = new Tag(new TagData(0,20,"font face=\"Arial,\"helvetica,\" sans-serif=\"sans-serif\" size=\"2\" color=\"#FFFFFF\"","<font face=\"Arial,\"helvetica,\" sans-serif=\"sans-serif\" size=\"2\" color=\"#FFFFFF\">")); ! tagParser.correctTag(tag); ! assertStringEquals("Corrected Tag","font face=\"Arial,helvetica,\" sans-serif=\"sans-serif\" size=\"2\" color=\"#FFFFFF\"",tag.getText()); } public void testInsertInvertedCommasCorrectly() { ! StringBuffer test = new StringBuffer("a b=c d e = f"); ! StringBuffer result = tagParser.insertInvertedCommasCorrectly(test); ! assertStringEquals("Expected Correction","a b=\"c d\" e=\"f\"",result.toString()); } public void testPruneSpaces() { ! String test = " fdfdf dfdf "; ! assertEquals("Expected Pruned string","fdfdf dfdf",TagParser.pruneSpaces(test)); } protected void setUp() { ! tagParser = new TagParser(new DefaultParserFeedback()); } } --- 42,67 ---- public void testCorrectTag() { ! fail ("not implemented"); ! // Tag tag = new Tag(new TagData(0,20,"font face=\"Arial,\"helvetica,\" sans-serif=\"sans-serif\" size=\"2\" color=\"#FFFFFF\"","<font face=\"Arial,\"helvetica,\" sans-serif=\"sans-serif\" size=\"2\" color=\"#FFFFFF\">")); ! // tagParser.correctTag(tag); ! // assertStringEquals("Corrected Tag","font face=\"Arial,helvetica,\" sans-serif=\"sans-serif\" size=\"2\" color=\"#FFFFFF\"",tag.getText()); } public void testInsertInvertedCommasCorrectly() { ! fail ("not implemented"); ! // StringBuffer test = new StringBuffer("a b=c d e = f"); ! // StringBuffer result = tagParser.insertInvertedCommasCorrectly(test); ! // assertStringEquals("Expected Correction","a b=\"c d\" e=\"f\"",result.toString()); } public void testPruneSpaces() { ! 
fail ("not implemented"); ! // String test = " fdfdf dfdf "; ! // assertEquals("Expected Pruned string","fdfdf dfdf",TagParser.pruneSpaces(test)); } protected void setUp() { ! fail ("not implemented"); ! // tagParser = new TagParser(new DefaultParserFeedback()); } } |
From: <der...@us...> - 2003-09-29 21:38:48
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests Modified Files: ParserTest.java ParserTestCase.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as it's own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: ParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** ParserTest.java 22 Sep 2003 02:40:04 -0000 1.42 --- ParserTest.java 28 Sep 2003 15:33:58 -0000 1.43 *************** *** 41,44 **** --- 41,46 ---- import org.htmlparser.Parser; import org.htmlparser.StringNode; + import org.htmlparser.lexer.Lexer; + import org.htmlparser.lexer.Page; import org.htmlparser.scanners.FormScanner; import org.htmlparser.scanners.TagScanner; *************** *** 89,93 **** throw new ParserException("You must be offline! 
This test needs you to be connected to the internet.",e); } - parser.getReader().mark(5000); Node [] node = new AbstractNode[500]; --- 91,94 ---- *************** *** 98,102 **** } int cnt = i; ! parser.getReader().reset(); // Now try getting the elements again i = 0; --- 99,103 ---- } int cnt = i; ! parser.reset (); // Now try getting the elements again i = 0; *************** *** 349,353 **** i++; } ! assertEquals("Expected nodes",12,i); } catch (Exception e) --- 350,354 ---- i++; } ! assertEquals("Expected nodes",20,i); } catch (Exception e) *************** *** 421,425 **** for (NodeIterator e = parser.elements(); e.hasMoreNodes();) nodes[i++] = e.nextNode(); ! assertEquals ("Expected nodes", 14, i); } --- 422,426 ---- for (NodeIterator e = parser.elements(); e.hasMoreNodes();) nodes[i++] = e.nextNode(); ! assertEquals ("Expected nodes", 23, i); } *************** *** 476,479 **** --- 477,481 ---- URL url; URLConnection connection; + Page page; Parser parser; String idiots = "http://users.aol.com/geinster/rej.htm"; *************** *** 485,492 **** // this little subclass just gets around normal JDK 1.4 processing // that filters out bogus character sets ! parser = new Parser () { ! protected String getCharset(String content) { int index; String ret; --- 487,495 ---- // this little subclass just gets around normal JDK 1.4 processing // that filters out bogus character sets ! page = new Page ("") { ! public String getCharset(String content) { + final String CHARSET_STRING = "charset"; int index; String ret; *************** *** 495,509 **** if (null != content) { ! index = content.indexOf(CHARSET_STRING); if (index != -1) { ! content = content.substring(index + CHARSET_STRING.length()).trim(); ! if (content.startsWith("=")) { ! content = content.substring(1).trim(); ! index = content.indexOf(";"); if (index != -1) ! content = content.substring(0, index); //remove any double quotes from around charset string --- 498,512 ---- if (null != content) { ! 
index = content.indexOf (CHARSET_STRING); if (index != -1) { ! content = content.substring (index + CHARSET_STRING.length ()).trim (); ! if (content.startsWith ("=")) { ! content = content.substring (1).trim (); ! index = content.indexOf (";"); if (index != -1) ! content = content.substring (0, index); //remove any double quotes from around charset string *************** *** 523,527 **** } }; ! parser.setConnection (connection); // must be the default assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1")); --- 526,531 ---- } }; ! page.setConnection (connection); ! parser = new Parser (new Lexer (page)); // must be the default assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1")); *************** *** 538,546 **** public void testNullUrl() { Parser parser; ! try { ! parser = new Parser("http://someoneexisting.com", Parser.noFeedback); assertTrue("Should have thrown an exception!",false); } ! catch (ParserException e) { } --- 542,552 ---- public void testNullUrl() { Parser parser; ! try ! { ! parser = new Parser("http://none.existant.url.org", Parser.noFeedback); assertTrue("Should have thrown an exception!",false); } ! catch (ParserException e) ! { } *************** *** 559,564 **** } ! assertEquals("Expected nodes",12,i); } public void testLinkCollection() throws ParserException { createParser( --- 565,571 ---- } ! assertEquals("Expected nodes",20,i); } + public void testLinkCollection() throws ParserException { createParser( *************** *** 619,623 **** node.collectInto(collectionList,LinkTag.class); } ! assertEquals("Size of collection vector should be 11",11,collectionList.size()); // All items in collection vector should be links for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) { --- 626,631 ---- node.collectInto(collectionList,LinkTag.class); } ! // NOTE: the link within the script is also found... this may be debatable ! 
assertEquals("Size of collection vector should be 12",12,collectionList.size()); // All items in collection vector should be links for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) { *************** *** 732,736 **** { Node node = e.nextNode(); ! if (7 == i) { assertTrue ("not a tag", node instanceof Tag); --- 740,744 ---- { Node node = e.nextNode(); ! if (10 == i) { assertTrue ("not a tag", node instanceof Tag); *************** *** 739,743 **** i++; } ! assertEquals("Expected nodes",16,i); } } --- 747,751 ---- i++; } ! assertEquals("Expected nodes",21,i); } } Index: ParserTestCase.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTestCase.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** ParserTestCase.java 22 Sep 2003 02:40:04 -0000 1.28 --- ParserTestCase.java 28 Sep 2003 15:33:58 -0000 1.29 *************** *** 37,43 **** import org.htmlparser.AbstractNode; import org.htmlparser.Node; - import org.htmlparser.NodeReader; import org.htmlparser.Parser; import org.htmlparser.StringNode; import org.htmlparser.tags.FormTag; import org.htmlparser.tags.InputTag; --- 37,44 ---- import org.htmlparser.AbstractNode; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.StringNode; + import org.htmlparser.lexer.Lexer; + import org.htmlparser.lexer.Page; import org.htmlparser.tags.FormTag; import org.htmlparser.tags.InputTag; *************** *** 54,58 **** protected Node node []; protected int nodeCount; ! protected NodeReader reader; public ParserTestCase(String name) { --- 55,59 ---- protected Node node []; protected int nodeCount; ! protected Lexer mLexer; public ParserTestCase(String name) { *************** *** 67,98 **** protected void createParser(String inputHTML) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! 
reader = new NodeReader(new BufferedReader(sr),5000); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[40]; } ! protected void createParser(String inputHTML,int numNodes) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! reader = new NodeReader(new BufferedReader(sr),5000); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } protected void createParser(String inputHTML, String url) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! reader = new NodeReader(new BufferedReader(sr),url); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[40]; } protected void createParser(String inputHTML, String url,int numNodes) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! reader = new NodeReader(new BufferedReader(sr),url); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } --- 68,94 ---- protected void createParser(String inputHTML) { ! mLexer = new Lexer (new Page (inputHTML)); ! parser = new Parser(mLexer, new DefaultParserFeedback()); node = new AbstractNode[40]; } ! protected void createParser(String inputHTML,int numNodes) ! { ! Lexer lexer = new Lexer (inputHTML); ! parser = new Parser (lexer, new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } protected void createParser(String inputHTML, String url) { ! Lexer lexer = new Lexer (inputHTML); ! lexer.getPage ().setUrl (url); ! parser = new Parser (lexer, new DefaultParserFeedback()); node = new AbstractNode[40]; } protected void createParser(String inputHTML, String url,int numNodes) { ! Lexer lexer = new Lexer (inputHTML); ! lexer.getPage ().setUrl (url); ! 
parser = new Parser (lexer, new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } *************** *** 295,306 **** Tag tag = (Tag)node; if (tag.isEmptyXmlTag()) { // Add end tag ! String currLine = parser.getReader().getCurrentLine(); ! int pos = parser.getReader().getLastReadPosition(); ! currLine = ! currLine.substring(0,pos+1)+ ! "</"+tag.getTagName()+">"+ ! currLine.substring(pos+1,currLine.length()); ! parser.getReader().changeLine(currLine); } } --- 291,303 ---- Tag tag = (Tag)node; if (tag.isEmptyXmlTag()) { + // oh crap... // Add end tag ! // String currLine = parser.getReader().getCurrentLine(); ! // int pos = parser.getReader().getLastReadPosition(); ! // currLine = ! // currLine.substring(0,pos+1)+ ! // "</"+tag.getTagName()+">"+ ! // currLine.substring(pos+1,currLine.length()); ! // parser.getReader().changeLine(currLine); } } |
From: <der...@us...> - 2003-09-29 21:38:47
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/data In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tags/data Modified Files: TagData.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as it's own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: TagData.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/data/TagData.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** TagData.java 22 Sep 2003 02:40:02 -0000 1.31 --- TagData.java 28 Sep 2003 15:33:58 -0000 1.32 *************** *** 29,67 **** package org.htmlparser.tags.data; public class TagData { private int tagBegin; private int tagEnd; ! private int startLine; ! private int endLine; ! private String tagContents; ! private String tagLine; private String urlBeingParsed; private boolean isXmlEndTag; ! public TagData(int tagBegin, int tagEnd, String tagContents,String tagLine) { ! this(tagBegin, tagEnd, 0, 0, tagContents, tagLine, "", false); } ! 
public TagData(int tagBegin, int tagEnd, String tagContents,String tagLine, String urlBeingParsed) { ! this(tagBegin, tagEnd, 0, 0, tagContents, tagLine, urlBeingParsed, false); } ! public TagData(int tagBegin, int tagEnd, int startLine, int endLine, String tagContents,String tagLine, String urlBeingParsed, boolean isXmlEndTag) { this.tagBegin = tagBegin; this.tagEnd = tagEnd; ! this.startLine = startLine; ! this.endLine = endLine; ! this.tagContents = tagContents; ! this.tagLine = tagLine; this.urlBeingParsed = urlBeingParsed; this.isXmlEndTag = isXmlEndTag; } public int getTagBegin() { return tagBegin; } ! public String getTagContents() { ! return tagContents; } --- 29,82 ---- package org.htmlparser.tags.data; + import java.util.Vector; + import org.htmlparser.lexer.Page; + public class TagData { + private Page mPage; private int tagBegin; private int tagEnd; ! private Vector mAttributes; private String urlBeingParsed; private boolean isXmlEndTag; ! public TagData(Page page, int tagBegin, int tagEnd, Vector attributes) { ! this(page, tagBegin, tagEnd, attributes, "", false); } ! public TagData(Page page, int tagBegin, int tagEnd, Vector attributes, String urlBeingParsed) { ! this(page, tagBegin, tagEnd, attributes, urlBeingParsed, false); } ! public TagData(Page page, int tagBegin, int tagEnd, Vector attributes, String urlBeingParsed, boolean isXmlEndTag) { ! mPage = page; this.tagBegin = tagBegin; this.tagEnd = tagEnd; ! mAttributes = attributes; this.urlBeingParsed = urlBeingParsed; this.isXmlEndTag = isXmlEndTag; } + /** + * Create a virtual tag. + * Not on the page but virtually injected at the given position. + */ + public TagData(String name, int tagBegin, Vector attributes, String urlBeingParsed, boolean isXmlEndTag) + { + this ( + null, + tagBegin, + tagBegin + name.length () + 2 + (isXmlEndTag ? 1 : 0), + attributes, + urlBeingParsed, + isXmlEndTag); + // todo: add attribute sizes + } + public int getTagBegin() { return tagBegin; } ! 
public void setTagBegin(int begin) { ! tagBegin = begin; } *************** *** 70,79 **** } ! public String getTagLine() { ! return tagLine; } ! public void setTagContents(String tagContents) { ! this.tagContents = tagContents; } --- 85,127 ---- } ! public void setTagEnd(int end) { ! tagEnd = end; } ! public String getTagContents() ! { ! String ret; ! ! if (null != mPage) ! ret = mPage.getText (getTagBegin(), getTagEnd()); ! else ! ret = ""; ! ! return (ret); ! } ! ! /** ! * @deprecated A tag may span more than one line. ! */ ! public String getTagLine() ! { ! String ret; ! ! if (null != mPage) ! ret = mPage.getLine (getTagBegin ()); ! else ! ret = ""; ! ! return (ret); ! } ! ! public void setTagContents (String tagContents, Vector attributes, String url, boolean xml_end_tag) ! { ! mPage = new Page (tagContents); ! tagBegin = 0; ! tagEnd = tagContents.length (); ! mAttributes = attributes; ! urlBeingParsed = url; ! isXmlEndTag = xml_end_tag; } *************** *** 95,100 **** * <code>CompositeTagScanner</code> or a subclass of it. */ ! public int getStartLine() { ! return startLine; } --- 143,156 ---- * <code>CompositeTagScanner</code> or a subclass of it. */ ! public int getStartLine () ! { ! int ret; ! ! if (null != mPage) ! ret = mPage.row (getTagBegin ()); ! else ! ret = -1; ! ! return (ret); } *************** *** 104,110 **** * <code>CompositeTagScanner</code> or a subclass of it. */ ! public int getEndLine() { ! return endLine; } } --- 160,183 ---- * <code>CompositeTagScanner</code> or a subclass of it. */ ! public int getEndLine() ! { ! int ret; ! ! if (null != mPage) ! ret = mPage.row (getTagEnd ()); ! else ! ret = -1; ! ! return (ret); ! } ! ! public Page getPage () ! { ! return (mPage); } + public Vector getAttributes () + { + return (mAttributes); + } } |
From: <der...@us...> - 2003-09-29 21:38:46
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/parserHelperTests In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests/parserHelperTests Modified Files: CompositeTagScannerHelperTest.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as it's own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: CompositeTagScannerHelperTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/parserHelperTests/CompositeTagScannerHelperTest.java,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** CompositeTagScannerHelperTest.java 22 Sep 2003 02:40:07 -0000 1.21 --- CompositeTagScannerHelperTest.java 28 Sep 2003 15:33:59 -0000 1.22 *************** *** 47,70 **** protected void setUp() { ! helper = ! new CompositeTagScannerHelper(null,null,null,null,null,false); } public void testIsXmlEndTagForRealXml() { ! Tag tag = new Tag( ! new TagData( ! 0,0,"something/","" ! ) ! ); ! 
assertTrue("should be an xml end tag",helper.isXmlEndTag(tag)); } public void testIsXmlEndTagForFalseMatches() { ! Tag tag = new Tag( ! new TagData( ! 0,0,"a href=http://someurl.com/","" ! ) ! ); ! assertFalse("should not be an xml end tag",helper.isXmlEndTag(tag)); } } --- 47,72 ---- protected void setUp() { ! // helper = ! // new CompositeTagScannerHelper(null,null,null,null,null,false); } public void testIsXmlEndTagForRealXml() { ! fail ("not implemented"); ! // Tag tag = new Tag( ! // new TagData( ! // 0,0,"something/","" ! // ) ! // ); ! // assertTrue("should be an xml end tag",helper.isXmlEndTag(tag)); } public void testIsXmlEndTagForFalseMatches() { ! fail ("not implemented"); ! // Tag tag = new Tag( ! // new TagData( ! // 0,0,"a href=http://someurl.com/","" ! // ) ! // ); ! // assertFalse("should not be an xml end tag",helper.isXmlEndTag(tag)); } } |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv30684/scanners Modified Files: CompositeTagScanner.java DoctypeScanner.java ImageScanner.java JspScanner.java ScriptScanner.java TagScanner.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as it's own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.67 retrieving revision 1.68 diff -C2 -d -r1.67 -r1.68 *** CompositeTagScanner.java 22 Sep 2003 02:40:00 -0000 1.67 --- CompositeTagScanner.java 28 Sep 2003 15:33:58 -0000 1.68 *************** *** 33,39 **** import org.htmlparser.Node; ! import org.htmlparser.NodeReader; import org.htmlparser.parserHelper.CompositeTagScannerHelper; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; --- 33,38 ---- import org.htmlparser.Node; ! 
import org.htmlparser.lexer.Lexer; import org.htmlparser.parserHelper.CompositeTagScannerHelper; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; *************** *** 177,183 **** } ! public Tag scan(Tag tag, String url, NodeReader reader,String currLine) throws ParserException { CompositeTagScannerHelper helper = ! new CompositeTagScannerHelper(this,tag,url,reader,currLine,balance_quotes); return helper.scan(); } --- 176,182 ---- } ! public Tag scan (Tag tag, String url, Lexer lexer) throws ParserException { CompositeTagScannerHelper helper = ! new CompositeTagScannerHelper(this, tag, lexer, balance_quotes); return helper.scan(); } *************** *** 199,202 **** --- 198,214 ---- /** + * For composite tags this shouldn't be used and hence throws an exception. + * @param tagData + * @param tag + * @param url + * @return Tag + * @throws ParserException + */ + protected Tag createTag(TagData tagData, Tag tag, String url) throws ParserException + { + throw new IllegalStateException ("composite tags shouldn't be using this"); + } + + /** * You must override this method to create the tag of your choice upon successful parsing. Data required * for construction of your tag can be found within tagData and compositeTagData *************** *** 205,210 **** public final boolean isTagToBeEndedFor(Tag tag) { ! boolean isEndTag = tag instanceof EndTag; String tagName = tag.getTagName(); if ( ( isEndTag && endTagEnderSet.contains(tagName)) || --- 217,224 ---- public final boolean isTagToBeEndedFor(Tag tag) { ! 
boolean isEndTag = tag.isEndTag (); String tagName = tag.getTagName(); + if (isEndTag) + tagName = tagName.substring (1); if ( ( isEndTag && endTagEnderSet.contains(tagName)) || Index: DoctypeScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/DoctypeScanner.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** DoctypeScanner.java 22 Sep 2003 02:40:00 -0000 1.27 --- DoctypeScanner.java 28 Sep 2003 15:33:58 -0000 1.28 *************** *** 59,63 **** String tagContents = tag.getText(); tagContents=tagContents.substring(9,tagContents.length()); ! tagData.setTagContents(tagContents); return new DoctypeTag(tagData); } --- 59,63 ---- String tagContents = tag.getText(); tagContents=tagContents.substring(9,tagContents.length()); ! tagData.setTagContents (tagContents, tag.getAttributesEx (), "" /*url*/, false /*xml_end_tag*/); return new DoctypeTag(tagData); } Index: ImageScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ImageScanner.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** ImageScanner.java 22 Sep 2003 02:40:00 -0000 1.28 --- ImageScanner.java 28 Sep 2003 15:33:58 -0000 1.29 *************** *** 49,53 **** { public static final String IMAGE_SCANNER_ID = "IMG"; - private Hashtable table; private LinkProcessor processor; /** --- 49,52 ---- *************** *** 73,104 **** * @param url URL of web page being parsed. */ ! public String extractImageLocn(Tag tag,String url) throws ParserException { ! String relativeLink=null; ! try { ! table = tag.getAttributes(); ! relativeLink = (String)table.get("SRC"); ! if (relativeLink!=null) { ! relativeLink = ParserUtils.removeChars(relativeLink,'\n'); ! relativeLink = ParserUtils.removeChars(relativeLink,'\r'); ! } ! 
if (relativeLink==null || relativeLink.length()==0) { ! // try fix ! String tagText = tag.getText().toUpperCase(); ! int indexSrc = tagText.indexOf("SRC"); ! if (indexSrc != -1) { ! // There is a missing equals. ! tag.setText(tag.getText().substring(0,indexSrc+3)+"="+tag.getText().substring(indexSrc+3,tag.getText().length())); ! table = tag.redoParseAttributes(); ! relativeLink = (String) table.get("SRC"); ! } } ! if (relativeLink==null) return ""; else ! return processor.extract(relativeLink,url); } ! catch (Exception e) { ! throw new ParserException("HTMLImageScanner.extractImageLocn() : Error in extracting image location, relativeLink = "+relativeLink+", url = "+url,e); } } --- 72,106 ---- * @param url URL of web page being parsed. */ ! public String extractImageLocn (Tag tag,String url) throws ParserException { ! String ret; ! Hashtable table; ! ret = ""; ! try ! { ! table = tag.getAttributes (); ! ret = (String)table.get ("SRC"); ! if (null != ret) ! { ! ret = ParserUtils.removeChars (ret, '\n'); ! ret = ParserUtils.removeChars (ret, '\r'); ! ret = processor.extract (ret, url); } ! else ! ret = ""; } ! catch (Exception e) ! { ! throw new ParserException ( ! "ImageScanner.extractImageLocn() : " ! + "Error in extracting image location, relativeLink = " ! + ret ! + ", url = " ! + url, ! e); } + + return (ret); } Index: JspScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/JspScanner.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** JspScanner.java 22 Sep 2003 02:40:00 -0000 1.27 --- JspScanner.java 28 Sep 2003 15:33:58 -0000 1.28 *************** *** 57,63 **** protected Tag createTag(TagData tagData, Tag tag, String url) ! throws ParserException { ! String tagContents = tagData.getTagContents(); ! 
tagData.setTagContents(tagContents.substring(1,tagContents.length()-1)); return new JspTag(tagData); } --- 57,64 ---- protected Tag createTag(TagData tagData, Tag tag, String url) ! throws ParserException ! { ! tagData.setTagBegin (tagData.getTagBegin () + 1); ! tagData.setTagEnd (tagData.getTagEnd () - 1); return new JspTag(tagData); } Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** ScriptScanner.java 22 Sep 2003 02:40:00 -0000 1.39 --- ScriptScanner.java 28 Sep 2003 15:33:58 -0000 1.40 *************** *** 32,35 **** --- 32,36 ---- ///////////////////////// import org.htmlparser.*; + import org.htmlparser.lexer.Lexer; import org.htmlparser.parserHelper.*; import org.htmlparser.tags.*; *************** *** 67,75 **** } ! public Tag scan(Tag tag, String url, NodeReader nodeReader, String currLine) throws ParserException { try { ScriptScannerHelper helper = ! new ScriptScannerHelper(tag,url,nodeReader,currLine, this); return helper.scan(); --- 68,76 ---- } ! public Tag scan (Tag tag, Lexer lexer) throws ParserException { try { ScriptScannerHelper helper = ! new ScriptScannerHelper(tag, lexer, this); return helper.scan(); Index: TagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/TagScanner.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** TagScanner.java 22 Sep 2003 02:40:00 -0000 1.40 --- TagScanner.java 28 Sep 2003 15:33:58 -0000 1.41 *************** *** 34,43 **** import java.util.Hashtable; import java.util.Map; import org.htmlparser.AbstractNode; import org.htmlparser.Node; ! import org.htmlparser.NodeReader; import org.htmlparser.StringNode; ! 
import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; --- 34,44 ---- import java.util.Hashtable; import java.util.Map; + import java.util.Vector; import org.htmlparser.AbstractNode; import org.htmlparser.Node; ! import org.htmlparser.Parser; import org.htmlparser.StringNode; ! import org.htmlparser.lexer.Lexer; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; *************** *** 141,186 **** return true; } ! public static String extractXMLData(Node node, String tagName, NodeReader reader) throws ParserException{ ! try { ! String xmlData = ""; ! ! boolean xmlTagFound = isXMLTagFound(node, tagName); ! if (xmlTagFound) { ! try{ ! do { ! node = reader.readElement(); ! if (node!=null) { ! if (node instanceof StringNode) { ! StringNode stringNode = (StringNode)node; ! if (xmlData.length()>0) xmlData+=" "; ! xmlData += stringNode.getText(); ! } else if (!(node instanceof org.htmlparser.tags.EndTag)) ! xmlTagFound = false; ! } ! } ! while (node instanceof StringNode); ! ! } ! ! catch (Exception e) { ! throw new ParserException("HTMLTagScanner.extractXMLData() : error while trying to find xml tag",e); ! } ! } ! if (xmlTagFound) { ! if (node!=null) { ! if (node instanceof org.htmlparser.tags.EndTag) { ! org.htmlparser.tags.EndTag endTag = (org.htmlparser.tags.EndTag)node; ! if (!endTag.getText().equals(tagName)) xmlTagFound = false; ! } ! ! } ! ! } ! if (xmlTagFound) return xmlData; else return null; ! } ! catch (Exception e) { ! throw new ParserException("HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! } ! } public String getFilter() { --- 142,188 ---- return true; } ! ! // public static String extractXMLData(Node node, String tagName, NodeReader reader) throws ParserException{ ! // try { ! // String xmlData = ""; ! // ! // boolean xmlTagFound = isXMLTagFound(node, tagName); ! // if (xmlTagFound) { ! // try{ ! // do { ! // node = reader.readElement(); ! 
// if (node!=null) { ! // if (node instanceof StringNode) { ! // StringNode stringNode = (StringNode)node; ! // if (xmlData.length()>0) xmlData+=" "; ! // xmlData += stringNode.getText(); ! // } else if (!(node instanceof org.htmlparser.tags.EndTag)) ! // xmlTagFound = false; ! // } ! // } ! // while (node instanceof StringNode); ! // ! // } ! // ! // catch (Exception e) { ! // throw new ParserException("HTMLTagScanner.extractXMLData() : error while trying to find xml tag",e); ! // } ! // } ! // if (xmlTagFound) { ! // if (node!=null) { ! // if (node instanceof org.htmlparser.tags.EndTag) { ! // org.htmlparser.tags.EndTag endTag = (org.htmlparser.tags.EndTag)node; ! // if (!endTag.getText().equals(tagName)) xmlTagFound = false; ! // } ! // ! // } ! // ! // } ! // if (xmlTagFound) return xmlData; else return null; ! // } ! // catch (Exception e) { ! // throw new ParserException("HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag",e); ! // } ! // } public String getFilter() { *************** *** 199,204 **** } ! public final Tag createScannedNode(Tag tag,String url,NodeReader reader,String currLine) throws ParserException { ! Tag thisTag = scan(tag,url,reader,currLine); thisTag.setThisScanner(this); thisTag.setAttributesEx(tag.getAttributesEx()); --- 201,206 ---- } ! public final Tag createScannedNode(Tag tag,String url,Lexer lexer) throws ParserException { ! Tag thisTag = scan(tag,url,lexer); thisTag.setThisScanner(this); thisTag.setAttributesEx(tag.getAttributesEx()); *************** *** 216,226 **** * @param reader The reader object responsible for reading the html page */ ! public Tag scan(Tag tag,String url,NodeReader reader,String currLine) throws ParserException { ! return createTag(new TagData( tag.elementBegin(), tag.elementEnd(), ! tag.getText(), ! currLine ! ), tag, url); } --- 218,231 ---- * @param reader The reader object responsible for reading the html page */ ! 
public Tag scan(Tag tag,String url,Lexer lexer) throws ParserException ! { ! TagData data; ! ! data = new TagData( ! lexer.getPage (), tag.elementBegin(), tag.elementEnd(), ! new Vector ()); ! return (createTag(data, tag, url)); } *************** *** 246,262 **** } ! public static Map adjustScanners(NodeReader reader) { ! Map tempScanners= new Hashtable(); ! tempScanners = reader.getParser().getScanners(); // Remove all existing scanners ! reader.getParser().flushScanners(); ! return tempScanners; } ! public static void restoreScanners(NodeReader pReader, Hashtable tempScanners) { // Flush the scanners ! pReader.getParser().setScanners(tempScanners); } --- 251,269 ---- } ! public static Map adjustScanners(Parser parser) { ! Map ret; ! ! ret = parser.getScanners(); // Remove all existing scanners ! parser.flushScanners(); ! ! return (ret); } ! public static void restoreScanners(Parser parser, Hashtable tempScanners) { // Flush the scanners ! parser.setScanners(tempScanners); } *************** *** 279,300 **** * @throws ParserException */ ! protected Tag createTag(TagData tagData, Tag tag, String url) throws ParserException { ! return null; ! } ! ! protected Tag getReplacedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // Replace tag - it was a <A> tag - replace with </a> ! String newLine = replaceFaultyTagWithEndTag(tag, currentLine); ! reader.changeLine(newLine); ! return new EndTag( ! new TagData( ! tag.elementBegin(), ! tag.elementBegin()+3, ! tag.getTagName(), ! currentLine ! ) ! ); ! } public String replaceFaultyTagWithEndTag(Tag tag, String currentLine) { --- 286,304 ---- * @throws ParserException */ ! protected abstract Tag createTag(TagData tagData, Tag tag, String url) throws ParserException; ! // protected Tag getReplacedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // // Replace tag - it was a <A> tag - replace with </a> ! // String newLine = replaceFaultyTagWithEndTag(tag, currentLine); ! // reader.changeLine(newLine); ! 
// return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } public String replaceFaultyTagWithEndTag(Tag tag, String currentLine) { *************** *** 306,322 **** } ! protected Tag getInsertedEndTag(Tag tag, NodeReader reader, String currentLine) { ! // Insert end tag ! String newLine = insertEndTagBeforeNode(tag, currentLine); ! reader.changeLine(newLine); ! return new EndTag( ! new TagData( ! tag.elementBegin(), ! tag.elementBegin()+3, ! tag.getTagName(), ! currentLine ! ) ! ); ! } --- 310,326 ---- } ! // protected Tag getInsertedEndTag(Tag tag, String currentLine) { ! // // Insert end tag ! // String newLine = insertEndTagBeforeNode(tag, currentLine); ! // reader.changeLine(newLine); ! // return new EndTag( ! // new TagData( ! // tag.elementBegin(), ! // tag.elementBegin()+3, ! // tag.getTagName(), ! // currentLine ! // ) ! // ); ! // } |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tags Modified Files: AppletTag.java CompositeTag.java DoctypeTag.java ImageTag.java JspTag.java StyleTag.java Tag.java Removed Files: EndTag.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as it's own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: AppletTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/AppletTag.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** AppletTag.java 23 Sep 2003 03:41:33 -0000 1.29 --- AppletTag.java 28 Sep 2003 15:33:58 -0000 1.30 *************** *** 190,195 **** paramValue = (String)newAppletParams.get (paramName); s = "PARAM VALUE=\"" + paramValue + "\" NAME=\"" + paramName + "\""; ! tagData = new TagData (0, 0, 0, 0, s, s, "", false); // what, no URL? ! 
kids.add (new Tag (tagData)); } --- 190,196 ---- paramValue = (String)newAppletParams.get (paramName); s = "PARAM VALUE=\"" + paramValue + "\" NAME=\"" + paramName + "\""; ! throw new IllegalStateException ("not implemented"); ! // tagData = new TagData (0, 0, 0, 0, s, s, "", false); // what, no URL? ! // kids.add (new Tag (tagData)); } Index: CompositeTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v retrieving revision 1.55 retrieving revision 1.56 diff -C2 -d -r1.55 -r1.56 *** CompositeTag.java 22 Sep 2003 02:40:01 -0000 1.55 --- CompositeTag.java 28 Sep 2003 15:33:58 -0000 1.56 *************** *** 31,34 **** --- 31,35 ---- import org.htmlparser.*; import org.htmlparser.AbstractNode; + import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.tags.data.CompositeTagData; import org.htmlparser.tags.data.TagData; *************** *** 38,47 **** public abstract class CompositeTag extends Tag { ! protected Tag startTag, endTag; public CompositeTag(TagData tagData, CompositeTagData compositeTagData) { super(tagData); ! this.startTag = compositeTagData.getStartTag(); ! this.endTag = compositeTagData.getEndTag(); setChildren (compositeTagData.getChildren()); } --- 39,60 ---- public abstract class CompositeTag extends Tag { ! protected TagNode startTag; ! protected TagNode endTag; public CompositeTag(TagData tagData, CompositeTagData compositeTagData) { super(tagData); ! ! // from Tag(TagData) ! // super(tagData.getTagBegin(),tagData.getTagEnd()); ! // this.startLine = tagData.getStartLine(); ! // this.tagContents = new StringBuffer(); ! // this.tagContents.append(tagData.getTagContents()); ! // this.tagLine = tagData.getTagLine(); ! // this.tagLines = new String[] {tagData.getTagLine()}; ! // this.emptyXmlTag = tagData.isEmptyXmlTag(); ! ! ! startTag = compositeTagData.getStartTag(); ! 
endTag = compositeTagData.getEndTag(); setChildren (compositeTagData.getChildren()); } *************** *** 149,156 **** for (SimpleNodeIterator e = children();e.hasMoreNodes() && !found;) { node = (Node)e.nextNode(); ! if (node instanceof Tag) { tag = (Tag)node; String nameAttribute = tag.getAttribute("NAME"); ! if (nameAttribute!=null && nameAttribute.equals(name)) found=true; } } --- 162,170 ---- for (SimpleNodeIterator e = children();e.hasMoreNodes() && !found;) { node = (Node)e.nextNode(); ! if (node instanceof TagNode) { tag = (Tag)node; String nameAttribute = tag.getAttribute("NAME"); ! if (nameAttribute!=null && nameAttribute.equals(name)) ! found=true; } } *************** *** 271,287 **** } ! public void collectInto(NodeList collectionList, String filter) { ! super.collectInto(collectionList, filter); Node node; ! for (SimpleNodeIterator e = children();e.hasMoreNodes();) { ! node = e.nextNode(); ! node.collectInto(collectionList,filter); } } ! public void collectInto(NodeList collectionList, Class nodeType) { ! super.collectInto(collectionList,nodeType); ! for (SimpleNodeIterator e = children();e.hasMoreNodes();) { ! e.nextNode().collectInto(collectionList,nodeType); } } --- 285,309 ---- } ! public void collectInto (NodeList collectionList, String filter) ! { Node node; ! ! super.collectInto (collectionList, filter); ! for (SimpleNodeIterator e = children(); e.hasMoreNodes ();) ! { ! node = e.nextNode (); ! node.collectInto (collectionList, filter); } } ! public void collectInto (NodeList collectionList, Class nodeType) ! { ! Node node; ! ! super.collectInto (collectionList,nodeType); ! for (SimpleNodeIterator e = children(); e.hasMoreNodes (); ) ! { ! node = e.nextNode (); ! node.collectInto (collectionList, nodeType); } } *************** *** 296,299 **** --- 318,331 ---- } + /** + * Handle a visitor. 
+ * <em>NOTE: This currently defers to accept(NodeVisitor), but eventually + * subclasses of Node should be overriding accept(Object) directly.</em> + * @param visitor The <code>NodeVisitor</code> object. + */ + public void accept(Object visitor) { + accept ((NodeVisitor)visitor); + } + public void accept(NodeVisitor visitor) { if (visitor.shouldRecurseChildren()) { *************** *** 314,322 **** } ! public Tag getStartTag() { return startTag; } ! public Tag getEndTag() { return endTag; } --- 346,354 ---- } ! public TagNode getStartTag() { return startTag; } ! public TagNode getEndTag() { return endTag; } Index: DoctypeTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/DoctypeTag.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** DoctypeTag.java 22 Sep 2003 02:40:01 -0000 1.28 --- DoctypeTag.java 28 Sep 2003 15:33:58 -0000 1.29 *************** *** 51,58 **** public String toString() { ! return "Doctype Tag : "+tagContents+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); } public String toHtml() { ! return "<!DOCTYPE "+tagContents+">"; } } --- 51,58 ---- public String toString() { ! return "Doctype Tag : "+getTagContents()+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); } public String toHtml() { ! 
return "<!DOCTYPE "+getTagContents()+">"; } } Index: ImageTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ImageTag.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** ImageTag.java 22 Sep 2003 02:40:01 -0000 1.27 --- ImageTag.java 28 Sep 2003 15:33:58 -0000 1.28 *************** *** 29,32 **** --- 29,33 ---- package org.htmlparser.tags; + import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.tags.data.TagData; import org.htmlparser.visitors.NodeVisitor; Index: JspTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/JspTag.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** JspTag.java 22 Sep 2003 02:40:01 -0000 1.29 --- JspTag.java 28 Sep 2003 15:33:58 -0000 1.30 *************** *** 47,51 **** public String toHtml() { ! return "<%"+tagContents+"%>"; } --- 47,51 ---- public String toHtml() { ! return "<%"+getTagContents()+"%>"; } *************** *** 55,59 **** public String toString() { ! return "JSP/ASP Tag : "+tagContents+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); } } --- 55,59 ---- public String toString() { ! return "JSP/ASP Tag : "+getTagContents()+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); } } Index: StyleTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/StyleTag.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** StyleTag.java 22 Sep 2003 02:40:01 -0000 1.27 --- StyleTag.java 28 Sep 2003 15:33:58 -0000 1.28 *************** *** 62,66 **** sb.append("Code\n"); sb.append("****\n"); ! sb.append(tagContents+"\n"); return sb.toString(); } --- 62,66 ---- sb.append("Code\n"); sb.append("****\n"); ! 
sb.append(getTagContents ()+"\n"); return sb.toString(); } Index: Tag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/Tag.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** Tag.java 22 Sep 2003 02:40:02 -0000 1.48 --- Tag.java 28 Sep 2003 15:33:58 -0000 1.49 *************** *** 33,42 **** import java.util.Hashtable; import java.util.Map; import org.htmlparser.AbstractNode; ! import org.htmlparser.NodeReader; ! import org.htmlparser.parserHelper.AttributeParser; import org.htmlparser.parserHelper.SpecialHashtable; - import org.htmlparser.parserHelper.TagParser; import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.data.TagData; --- 33,42 ---- import java.util.Hashtable; import java.util.Map; + import java.util.Vector; import org.htmlparser.AbstractNode; ! import org.htmlparser.lexer.Page; ! import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.parserHelper.SpecialHashtable; import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.data.TagData; *************** *** 44,47 **** --- 44,48 ---- import org.htmlparser.util.ParserException; import org.htmlparser.visitors.NodeVisitor; + /** * Tag represents a generic tag. This class allows users to register specific *************** *** 49,282 **** * scanners to run over the text, and identify. It can be used to dynamically * configure a parser. - * @author Kaarle Kaila 23.10.2001 */ ! public class Tag extends AbstractNode { ! public static final String TYPE = "TAG"; ! /** ! * Constant used as value for the value of the tag name ! * in parseParameters (Kaarle Kaila 3.8.2001) ! */ ! public final static String TAGNAME = "$<TAGNAME>$"; ! public final static String EMPTYTAG = "$<EMPTYTAG>$"; ! public final static String NULLVALUE = "$<NULL>$"; ! public final static String NOTHING = "$<NOTHING>$"; ! private final static String EMPTY_STRING=""; ! ! 
private static TagParser tagParser; ! /** ! * Tag contents will have the contents of the comment tag. ! */ ! protected StringBuffer tagContents; ! private boolean emptyXmlTag = false; ! /** ! * tag parameters parsed into this hashtable ! * not implemented yet ! * added by Kaarle Kaila 23.10.2001 ! */ ! protected SpecialHashtable _attributes=null; ! ! /** ! * Scanner associated with this tag (useful for extraction of filtering data from a ! * HTML node) ! */ ! protected TagScanner thisScanner=null; ! private java.lang.String tagLine; ! ! /** ! * The combined text of all the lines spanned by this tag ! */ ! private String[] tagLines; ! ! /** ! * The line number on which this tag starts ! */ ! private int startLine; ! ! /** ! * Set of tags that breaks the flow. ! */ ! protected static HashSet breakTags; ! static ! { ! breakTags = new HashSet (30); ! breakTags.add ("BLOCKQUOTE"); ! breakTags.add ("BODY"); ! breakTags.add ("BR"); ! breakTags.add ("CENTER"); ! breakTags.add ("DD"); ! breakTags.add ("DIR"); ! breakTags.add ("DIV"); ! breakTags.add ("DL"); ! breakTags.add ("DT"); ! breakTags.add ("FORM"); ! breakTags.add ("H1"); ! breakTags.add ("H2"); ! breakTags.add ("H3"); ! breakTags.add ("H4"); ! breakTags.add ("H5"); ! breakTags.add ("H6"); ! breakTags.add ("HEAD"); ! breakTags.add ("HR"); ! breakTags.add ("HTML"); ! breakTags.add ("ISINDEX"); ! breakTags.add ("LI"); ! breakTags.add ("MENU"); ! breakTags.add ("NOFRAMES"); ! breakTags.add ("OL"); ! breakTags.add ("P"); ! breakTags.add ("PRE"); ! breakTags.add ("TD"); ! breakTags.add ("TH"); ! breakTags.add ("TITLE"); ! breakTags.add ("UL"); ! } ! /** ! * Set the Tag with the beginning posn, ending posn and tag contents (in ! * a tagData object. ! * @param tagData The data for this tag ! */ ! public Tag(TagData tagData) { ! super(tagData.getTagBegin(),tagData.getTagEnd()); ! this.startLine = tagData.getStartLine(); ! this.tagContents = new StringBuffer(); ! this.tagContents.append(tagData.getTagContents()); ! 
this.tagLine = tagData.getTagLine(); ! this.tagLines = new String[] {tagData.getTagLine()}; ! this.emptyXmlTag = tagData.isEmptyXmlTag(); ! } ! ! public void append(char ch) { ! tagContents.append(ch); ! } ! ! public void append(String ch) { ! tagContents.append(ch); ! } ! ! /** ! * Locate the tag withing the input string, by parsing from the given position ! * @param reader HTML reader to be provided so as to allow reading of next line ! * @param input Input String ! * @param position Position to start parsing from ! */ ! public static Tag find(NodeReader reader,String input,int position) { ! return tagParser.find(reader,input,position); ! } ! ! /** ! * This method is not to be called by any scanner or tag. It is ! * an expensive method, hence it has been made private. However, ! * there might be some circumstances when a scanner wishes to force ! * parsing of attributes over and above what has already been parsed. ! * To make the choice clear - we have a method - redoParseAttributes(), ! * which can be used. ! * @return Hashtable ! */ ! private SpecialHashtable parseAttributes(){ ! return (SpecialHashtable)(new AttributeParser()).parseAttributes(getText ()); } ! /** ! * In case the tag is parsed at the scan method this will return value of a ! * parameter not implemented yet ! * @param name of parameter ! */ ! public String getAttribute(String name) { ! SpecialHashtable ht; ! Object ret; ! ! ht = getAttributesEx(); ! ret = ht.getRaw(name.toUpperCase()); ! if (null != ret) ! { ! ret = ((String[])ret)[1]; ! if (Tag.NULLVALUE == ret) ! ret = null; ! else if (Tag.NOTHING == ret) ! ret = ""; ! } ! ! return ((String)ret); ! } ! ! /** ! * Set attribute with given key, value pair. ! * @param key ! * @param value ! */ ! public void setAttribute(String key, String value) { ! _attributes.put(key.toUpperCase (), new String[] {key, value}); ! } ! ! /** ! * In case the tag is parsed at the scan method this will return value of a ! * parameter not implemented yet ! 
* @param name of parameter ! * @deprecated use getAttribute instead ! */ ! public String getParameter(String name){ ! return ((String[])getAttributesEx().get(name.toUpperCase()))[1]; ! } ! ! /** ! * Gets the attributes in the tag. ! * NOTE: Values of the extended hashtable are two element arrays of String, ! * with the first element being the original name (not uppercased), ! * and the second element being the value. ! * @return Returns a special hashtable of attributes in two element String arrays. ! */ ! public SpecialHashtable getAttributesEx() { ! if (_attributes == null) ! _attributes = parseAttributes(); ! return _attributes; } ! /** ! * Gets the attributes in the tag. ! * @return Returns a Hashtable of attributes ! */ ! public Hashtable getAttributes() { ! Hashtable ret; ! ! ret = new SpecialHashtable (); ! for (Enumeration e = getAttributesEx ().keys(); e.hasMoreElements(); ) ! { ! String key = (String)e.nextElement (); ! ret.put (key, ((String[])getAttributesEx().getRaw(key))[1]); ! } ! ! return (ret); ! } ! ! public String getTagName(){ ! return getParameter(TAGNAME); ! } ! ! /** ! * Returns the line where the tag was found ! * @return java.lang.String ! */ ! public String getTagLine() { ! return tagLine; ! } ! ! /** ! * Returns the combined text of all the lines spanned by this tag ! * @return java.lang.String ! */ ! public String[] getTagLines() { ! return tagLines; ! } ! ! /** ! * Return the text contained in this tag ! */ ! public String getText() { ! return tagContents.toString(); } --- 50,76 ---- * scanners to run over the text, and identify. It can be used to dynamically * configure a parser. */ ! public class Tag extends TagNode { ! TagScanner mScanner; ! TagData mData; ! public Tag (TagNode node, TagScanner scanner) { ! super (node.getPage (), node.getTagBegin (), node.getTagEnd (), node.getAttributesEx ()); ! mScanner = scanner; } ! public Tag (Page page, int start, int end, Vector attributes) { ! super (page, start, end, attributes); ! 
mScanner = null; } ! public Tag (TagData data) { ! super (data.getPage (), data.getTagBegin (), data.getTagEnd (), data.getAttributes ()); ! mData = data; ! mScanner = null; } *************** *** 286,543 **** public TagScanner getThisScanner() { ! return thisScanner; ! } ! ! /** ! * Extract the first word from the given string. ! * Words are delimited by whitespace or equals signs. ! * @param s The string to get the word from. ! * @return The first word. ! */ ! public static String extractWord (String s) ! { ! int length; ! boolean parse; ! char ch; ! StringBuffer ret; ! ! length = s.length (); ! ret = new StringBuffer (length); ! parse = true; ! for (int i = 0; i < length && parse; i++) ! { ! ch = s.charAt (i); ! if (Character.isWhitespace (ch) || ch == '=') ! parse = false; ! else ! ret.append (Character.toUpperCase (ch)); ! } ! ! return (ret.toString ()); ! } ! ! /** ! * Scan the tag to see using the registered scanners, and attempt identification. ! * @param url URL at which HTML page is located ! * @param reader The NodeReader that is to be used for reading the url ! */ ! public AbstractNode scan(Map scanners,String url,NodeReader reader) throws ParserException ! { ! if (tagContents.length()==0) return this; ! try { ! boolean found=false; ! AbstractNode retVal=null; ! // Find the first word in the scanners ! String firstWord = extractWord(tagContents.toString()); ! // Now, get the scanner associated with this. ! TagScanner scanner = (TagScanner)scanners.get(firstWord); ! ! // Now do a deep check ! if (scanner != null && ! scanner.evaluate( ! tagContents.toString(), ! reader.getPreviousOpenScanner() ! ) ! ) ! { ! found=true; ! TagScanner save; ! save = reader.getPreviousOpenScanner (); ! reader.setPreviousOpenScanner(scanner); ! retVal=scanner.createScannedNode(this,url,reader,tagLine); ! reader.setPreviousOpenScanner(save); ! } ! ! if (!found) return this; ! else { ! return retVal; ! } ! } ! catch (Exception e) { ! String errorMsg; ! 
if (tagContents!=null) errorMsg = tagContents.toString(); else errorMsg="null"; ! throw new ParserException("Tag.scan() : Error while scanning tag, tag contents = "+errorMsg+", tagLine = "+tagLine,e); ! } ! } ! ! /** ! * Sets the attributes. ! * @param attributes The attribute collection to set. ! */ ! public void setAttributes(Hashtable attributes) ! { ! SpecialHashtable att = new SpecialHashtable (); ! for (Enumeration e = attributes.keys (); e.hasMoreElements (); ) ! { ! String key = (String)e.nextElement (); ! att.put (key, new String[] { key, (String)attributes.get (key)}); ! } ! this._attributes = att; ! } ! ! /** ! * Sets the attributes. ! * NOTE: Values of the extended hashtable are two element arrays of String, ! * with the first element being the original name (not uppercased), ! * and the second element being the value. ! * @param attributes The attribute collection to set. ! */ ! public void setAttributesEx (SpecialHashtable attributes) ! { ! _attributes = attributes; ! } ! ! /** ! * Sets the tagBegin. ! * @param tagBegin The starting position of the tag. ! */ ! public void setTagBegin(int tagBegin) { ! this.nodeBegin = tagBegin; ! } ! ! /** ! * Gets the tagBegin. ! * @return The nstarting position of the tag. ! */ ! public int getTagBegin() { ! return (nodeBegin); ! } ! ! /** ! * Sets the tagEnd. ! * @param tagEnd The ending position of the tag. ! */ ! public void setTagEnd(int tagEnd) { ! this.nodeEnd = tagEnd; ! } ! ! /** ! * Gets the tagEnd. ! * @return The ending position of the tag. ! */ ! public int getTagEnd() { ! return (nodeEnd); ! } ! ! /** ! * Gets the line number on which this tag starts. ! * @return the start line number ! */ ! public int getTagStartLine() { ! return startLine; ! } ! ! /** ! * Gets the line number on which this tag ends. ! * @return the end line number ! */ ! public int getTagEndLine() { ! return startLine + tagLines.length - 1; ! } ! ! public void setTagLine(java.lang.String newTagLine) { ! tagLine = newTagLine; ! ! 
// Note: Incur the overhead of resizing each time (versus ! // preallocating a larger array), since the average tag ! // generally doesn't span multiple lines ! String[] newTagLines = new String[tagLines.length + 1]; ! for (int i = 0; i < tagLines.length; i++) ! newTagLines[i] = tagLines[i]; ! newTagLines[tagLines.length] = newTagLine; ! tagLines = newTagLines; ! } ! ! public void setText(String text) { ! tagContents = new StringBuffer(text); } public void setThisScanner(TagScanner scanner) { ! thisScanner = scanner; ! } ! ! public String toPlainTextString() { ! return EMPTY_STRING; ! } ! ! /** ! * A call to a tag's toHTML() method will render it in HTML ! * Most tags that do not have children and inherit from Tag, ! * do not need to override toHTML(). ! * @see org.htmlparser.Node#toHtml() ! */ ! public String toHtml() ! { ! StringBuffer ret; ! String key; ! String value[]; ! String empty; ! ! ret = new StringBuffer (); ! value = (String[])(getAttributesEx().getRaw (TAGNAME)); ! ret.append ("<"); ! ret.append (value[1]); ! empty = null; ! for (Enumeration e = getAttributesEx ().keys(); e.hasMoreElements(); ) ! { ! key = (String)e.nextElement (); ! if (!key.equals (TAGNAME)) ! { ! if (key.equals (EMPTYTAG)) ! empty="/"; ! else ! { ! ret.append (" "); ! value = (String[])(getAttributesEx().getRaw (key.toUpperCase ())); ! ret.append (value[0]); ! if (Tag.NULLVALUE != value[1]) ! { ! ret.append ("="); ! if (!(Tag.NOTHING == value[1])) ! { ! ret.append ("\""); ! ret.append (value[1]); ! ret.append ("\""); ! } ! else ! ret.append (""); ! } ! } ! } ! } ! if (null != empty) ! ret.append (empty); ! if (isEmptyXmlTag ()) ! ret.append ("/"); ! ret.append (">"); ! ! return (ret.toString ()); ! } ! ! /** ! * Print the contents of the tag ! */ ! public String toString() ! { ! return "Begin Tag : "+tagContents+"; begins at : "+elementBegin()+"; ends at : "+elementEnd(); ! } ! ! /** ! * Sets the tagParser. ! * @param tagParser The tagParser to set ! */ ! 
public static void setTagParser(TagParser tagParser) { ! Tag.tagParser = tagParser; ! } ! ! /** ! * Determines if the given tag breaks the flow of text. ! * @return <code>true</code> if following text would start on a new line, ! * <code>false</code> otherwise. ! */ ! public boolean breaksFlow () ! { ! return (breakTags.contains (getText ().toUpperCase ())); } --- 80,89 ---- public TagScanner getThisScanner() { ! return (mScanner); } public void setThisScanner(TagScanner scanner) { ! mScanner = scanner; } *************** *** 549,612 **** * @see org.htmlparser.Node#collectInto(NodeList, String) */ ! public void collectInto(NodeList collectionList, String filter) { ! if (thisScanner!=null && thisScanner.getFilter()==filter) ! collectionList.add(this); ! } ! ! /** ! * Returns table of attributes in the tag ! * @return Hashtable ! * @deprecated This method is deprecated. Use getAttributes() instead. ! */ ! public Hashtable getParsed() { ! return getAttributes (); ! } ! ! /** ! * Sometimes, a scanner may need to request a re-evaluation of the ! * attributes in a tag. This may happen when there is some correction ! * activity. An example of its usage can be found in ImageTag. ! * <br> ! * <B>Note:<B> This is an intensive task, hence call only when ! * really necessary ! * @return Hashtable ! */ ! public Hashtable redoParseAttributes() { ! _attributes = null; ! getAttributesEx (); ! return (getAttributes ()); } /** ! * Handle a visitor. ! * <em>NOTE: This currently defers to accept(NodeVisitor), but eventually ! * subclasses of Node should be overriding accept(Object) directly.</em> ! * @param visitor The <code>NodeVisitor</code> object. */ ! public void accept(Object visitor) { ! accept ((NodeVisitor)visitor); ! } ! ! public void accept(NodeVisitor visitor) { ! visitor.visitTag(this); ! } ! ! public String getType() { ! return TYPE; ! } ! /** ! * Is this an empty xml tag of the form<br> ! * <tag/> ! * @return boolean ! */ ! public boolean isEmptyXmlTag() { ! 
return emptyXmlTag; } ! public void setEmptyXmlTag(boolean emptyXmlTag) { ! this.emptyXmlTag = emptyXmlTag; } - } --- 95,122 ---- * @see org.htmlparser.Node#collectInto(NodeList, String) */ ! public void collectInto(NodeList collectionList, String filter) { ! if (null != getThisScanner () && getThisScanner ().getFilter () == filter) ! collectionList.add (this); } /** ! * Jeez I hope this goes away. */ ! public String getTagContents () ! { ! String ret; ! if (null != mData) ! ret = mData.getTagContents(); ! else ! ret = ""; ! ! return (ret); } ! public void accept (Object visitor) ! { ! ((NodeVisitor)visitor).visitTag (this); } } --- EndTag.java DELETED --- |
From: <der...@us...> - 2003-09-29 21:30:08
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv32344 Modified Files: Parser.java Log Message: Fix broken serializability. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.63 retrieving revision 1.64 diff -C2 -d -r1.63 -r1.64 *** Parser.java 28 Sep 2003 19:30:03 -0000 1.63 --- Parser.java 29 Sep 2003 00:00:38 -0000 1.64 *************** *** 189,193 **** * The html lexer associated with this parser. */ ! protected transient Lexer mLexer; /** --- 189,193 ---- * The html lexer associated with this parser. */ ! protected Lexer mLexer; /** *************** *** 369,400 **** { this (connection, stdout); - } - - // - // Serialization support - // - - private void writeObject (ObjectOutputStream out) - throws - IOException - { - out.defaultWriteObject (); - } - - private void readObject (ObjectInputStream in) - throws - IOException, - ClassNotFoundException - { - in.defaultReadObject (); - try - { - // reopen the connection and create a lexer which are transient fields - setURL (getURL ()); - } - catch (ParserException hpe) - { - throw new IOException (hpe.toString ()); - } } --- 369,372 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors In directory sc8-pr-cvs1:/tmp/cvs-serv30684/visitors Modified Files: HtmlPage.java NodeVisitor.java TagFindingVisitor.java TextExtractingVisitor.java UrlModifyingVisitor.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: HtmlPage.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/HtmlPage.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** HtmlPage.java 22 Sep 2003 02:40:16 -0000 1.34 --- HtmlPage.java 28 Sep 2003 15:34:00 -0000 1.35 *************** *** 34,38 **** import org.htmlparser.StringNode; import org.htmlparser.scanners.TableScanner; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.TableTag; import org.htmlparser.tags.Tag; --- 34,37 ---- *************** *** 63,75 **** } ! public void visitTag(Tag tag) { ! addTagToBodyIfApplicable(tag); ! ! if (isTable(tag)) { ! tables.add(tag); ! } ! else { if (isBodyTag(tag)) ! 
bodyTagBegin = true; } } --- 62,84 ---- } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! { if (isBodyTag(tag)) ! bodyTagBegin = false; ! addTagToBodyIfApplicable(tag); ! } ! else ! { ! addTagToBodyIfApplicable(tag); ! ! if (isTable(tag)) { ! tables.add(tag); ! } ! else { ! if (isBodyTag(tag)) ! bodyTagBegin = true; ! } } } *************** *** 82,91 **** if (bodyTagBegin) nodesInBody.add(node); - } - - public void visitEndTag(EndTag endTag) { - if (isBodyTag(endTag)) - bodyTagBegin = false; - addTagToBodyIfApplicable(endTag); } --- 91,94 ---- Index: NodeVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/NodeVisitor.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** NodeVisitor.java 22 Sep 2003 02:40:16 -0000 1.29 --- NodeVisitor.java 28 Sep 2003 15:34:00 -0000 1.30 *************** *** 31,38 **** import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; ! import org.htmlparser.tags.EndTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; - import org.htmlparser.tags.Tag; import org.htmlparser.tags.TitleTag; --- 31,37 ---- import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; ! 
import org.htmlparser.tags.Tag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.TitleTag; *************** *** 66,73 **** public void visitImageTag(ImageTag imageTag) { - } - - public void visitEndTag(EndTag endTag) { - } --- 65,68 ---- Index: TagFindingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TagFindingVisitor.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** TagFindingVisitor.java 22 Sep 2003 02:40:16 -0000 1.35 --- TagFindingVisitor.java 28 Sep 2003 15:34:00 -0000 1.36 *************** *** 32,36 **** import org.htmlparser.Node; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeList; --- 32,35 ---- *************** *** 68,72 **** } ! public void visitTag(Tag tag) { for (int i=0;i<tagsToBeFound.length;i++) if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { --- 67,82 ---- } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! { ! if (!endTagCheck) return; ! for (int i=0;i<tagsToBeFound.length;i++) ! if (tag.getTagName().substring (1).equalsIgnoreCase(tagsToBeFound[i])) ! { ! endTagCount[i]++; ! endTags[i].add(tag); ! } ! 
} for (int i=0;i<tagsToBeFound.length;i++) if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { *************** *** 78,90 **** public Node [] getTags(int index) { return tags[index].toNodeArray(); - } - - public void visitEndTag(EndTag endTag) { - if (!endTagCheck) return; - for (int i=0;i<tagsToBeFound.length;i++) - if (endTag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { - endTagCount[i]++; - endTags[i].add(endTag); - } } --- 88,91 ---- Index: TextExtractingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** TextExtractingVisitor.java 22 Sep 2003 02:40:16 -0000 1.33 --- TextExtractingVisitor.java 28 Sep 2003 15:34:00 -0000 1.34 *************** *** 30,34 **** import org.htmlparser.StringNode; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.TitleTag; --- 30,33 ---- *************** *** 76,87 **** } ! public void visitEndTag(EndTag endTag) { ! if (isPreTag(endTag)) ! preTagBeingProcessed = false; ! } ! ! public void visitTag(Tag tag) { ! if (isPreTag(tag)) ! preTagBeingProcessed = true; } --- 75,90 ---- } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! { ! if (isPreTag(tag)) ! preTagBeingProcessed = false; ! } ! else ! { ! if (isPreTag(tag)) ! preTagBeingProcessed = true; ! 
} } Index: UrlModifyingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/UrlModifyingVisitor.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** UrlModifyingVisitor.java 22 Sep 2003 02:40:16 -0000 1.32 --- UrlModifyingVisitor.java 28 Sep 2003 15:34:00 -0000 1.33 *************** *** 33,37 **** import org.htmlparser.StringNode; import org.htmlparser.scanners.LinkScanner; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; --- 33,36 ---- *************** *** 66,79 **** } - public void visitEndTag(EndTag endTag) { - modifiedResult.append(endTag.toHtml()); - } - public void visitStringNode(StringNode stringNode) { modifiedResult.append(stringNode.toHtml()); } ! public void visitTag(Tag tag) { ! modifiedResult.append(tag.toHtml()); } --- 65,78 ---- } public void visitStringNode(StringNode stringNode) { modifiedResult.append(stringNode.toHtml()); } ! public void visitTag(Tag tag) ! { ! if (tag.isEndTag ()) ! modifiedResult.append(tag.toHtml()); ! else ! modifiedResult.append(tag.toHtml()); } |
From: <der...@us...> - 2003-09-28 19:37:37
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests/lexerTests Modified Files: LexerTests.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: LexerTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** LexerTests.java 22 Sep 2003 02:40:05 -0000 1.7 --- LexerTests.java 28 Sep 2003 15:33:58 -0000 1.8 *************** *** 42,46 **** import org.htmlparser.Node; - import org.htmlparser.NodeReader; import org.htmlparser.Parser; import org.htmlparser.lexer.Lexer; --- 42,45 ---- *************** *** 303,586 **** } ! /** ! * Test the relative speed reading from a string parsing tags too. ! */ ! public void testSpeedStringWithoutTags () throws ParserException, IOException ! { ! final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! URL url; ! 
URLConnection connection; ! Source source; ! StringBuffer buffer; ! int i; ! String html; ! ! long old_total; ! long new_total; ! long begin; ! long end; ! StringReader reader; ! NodeReader nodes; ! Parser parser; ! int nodecount; ! Node node; ! int charcount; ! ! url = new URL (link); ! connection = url.openConnection (); ! connection.connect (); ! source = new Source (new Stream (connection.getInputStream ())); ! buffer = new StringBuffer (350000); ! while (-1 != (i = source.read ())) ! buffer.append ((char)i); ! source.close (); ! html = buffer.toString (); ! old_total = 0; ! new_total = 0; ! for (i = 0; i < 5; i++) ! { ! System.gc (); ! begin = System.currentTimeMillis (); ! Lexer lexer = new Lexer (html); ! nodecount = 0; ! while (null != (node = lexer.nextNode ())) ! nodecount++; ! end = System.currentTimeMillis (); ! System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! new_total += (end - begin); ! ! System.gc (); ! begin = System.currentTimeMillis (); ! reader = new StringReader (html); ! nodes = new NodeReader (new BufferedReader (reader), 350000); ! parser = new Parser (nodes, null); ! nodecount = 0; ! while (null != (node = nodes.readElement ())) ! nodecount++; ! end = System.currentTimeMillis (); ! System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! old_total += (end - begin); ! } ! assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! } ! ! /** ! * Test the relative speed reading from a string parsing tags too. ! */ ! public void testSpeedStringWithTags () throws ParserException, IOException ! { ! final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! URL url; ! 
URLConnection connection; ! Source source; ! StringBuffer buffer; ! int i; ! String html; ! ! long old_total; ! long new_total; ! long begin; ! long end; ! StringReader reader; ! NodeReader nodes; ! Parser parser; ! int nodecount; ! Node node; ! int charcount; ! ! url = new URL (link); ! connection = url.openConnection (); ! connection.connect (); ! source = new Source (new Stream (connection.getInputStream ())); ! buffer = new StringBuffer (350000); ! while (-1 != (i = source.read ())) ! buffer.append ((char)i); ! source.close (); ! html = buffer.toString (); ! old_total = 0; ! new_total = 0; ! for (i = 0; i < 5; i++) ! { ! System.gc (); ! begin = System.currentTimeMillis (); ! Lexer lexer = new Lexer (html); ! nodecount = 0; ! while (null != (node = lexer.nextNode ())) ! { ! nodecount++; ! if (node instanceof TagNode) ! ((TagNode)node).getAttributes (); ! } ! end = System.currentTimeMillis (); ! System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! new_total += (end - begin); ! ! System.gc (); ! begin = System.currentTimeMillis (); ! reader = new StringReader (html); ! nodes = new NodeReader (new BufferedReader (reader), 350000); ! parser = new Parser (nodes, null); ! nodecount = 0; ! while (null != (node = nodes.readElement ())) ! { ! nodecount++; ! if (node instanceof Tag) ! ((Tag)node).getAttributes (); ! } ! end = System.currentTimeMillis (); ! System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! old_total += (end - begin); ! } ! assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! } ! ! public void testSpeedStreamWithoutTags () throws ParserException, IOException ! { ! 
final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! URL url; ! URLConnection connection; ! Source source; ! StringBuffer buffer; ! int i; ! String html; ! InputStream stream; ! ! long old_total; ! long new_total; ! long begin; ! long end; ! InputStreamReader reader; ! NodeReader nodes; ! Parser parser; ! int nodecount; ! Node node; ! int charcount; ! ! url = new URL (link); ! connection = url.openConnection (); ! connection.connect (); ! source = new Source (new Stream (connection.getInputStream ())); ! buffer = new StringBuffer (350000); ! while (-1 != (i = source.read ())) ! buffer.append ((char)i); ! source.close (); ! html = buffer.toString (); ! old_total = 0; ! new_total = 0; ! ! for (i = 0; i < 5; i++) ! { ! ! System.gc (); ! begin = System.currentTimeMillis (); ! stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET)); ! nodecount = 0; ! while (null != (node = lexer.nextNode ())) ! nodecount++; ! end = System.currentTimeMillis (); ! System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! new_total += (end - begin); ! ! System.gc (); ! begin = System.currentTimeMillis (); ! stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! reader = new InputStreamReader (stream); ! nodes = new NodeReader (reader, 350000); ! parser = new Parser (nodes, null); ! nodecount = 0; ! while (null != (node = nodes.readElement ())) ! nodecount++; ! end = System.currentTimeMillis (); ! System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! old_total += (end - begin); ! ! } ! assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! 
System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! } ! ! public void testSpeedStreamWithTags () throws ParserException, IOException ! { ! final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! URL url; ! URLConnection connection; ! Source source; ! StringBuffer buffer; ! int i; ! String html; ! InputStream stream; ! ! long old_total; ! long new_total; ! long begin; ! long end; ! InputStreamReader reader; ! NodeReader nodes; ! Parser parser; ! int nodecount; ! Node node; ! int charcount; ! ! url = new URL (link); ! connection = url.openConnection (); ! connection.connect (); ! source = new Source (new Stream (connection.getInputStream ())); ! buffer = new StringBuffer (350000); ! while (-1 != (i = source.read ())) ! buffer.append ((char)i); ! source.close (); ! html = buffer.toString (); ! old_total = 0; ! new_total = 0; ! ! for (i = 0; i < 5; i++) ! { ! ! System.gc (); ! begin = System.currentTimeMillis (); ! stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET)); ! nodecount = 0; ! while (null != (node = lexer.nextNode ())) ! { ! nodecount++; ! if (node instanceof TagNode) ! ((TagNode)node).getAttributes (); ! } ! end = System.currentTimeMillis (); ! System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! new_total += (end - begin); ! ! System.gc (); ! begin = System.currentTimeMillis (); ! stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! reader = new InputStreamReader (stream); ! nodes = new NodeReader (reader, 350000); ! parser = new Parser (nodes, null); ! nodecount = 0; ! while (null != (node = nodes.readElement ())) ! { ! nodecount++; ! if (node instanceof Tag) ! ((Tag)node).getAttributes (); ! } ! end = System.currentTimeMillis (); ! 
System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! if (0 != i) // the first timing is way different ! old_total += (end - begin); ! } ! assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! } // public static void main (String[] args) throws ParserException, IOException --- 302,585 ---- } ! // /** ! // * Test the relative speed reading from a string parsing tags too. ! // */ ! // public void testSpeedStringWithoutTags () throws ParserException, IOException ! // { ! // final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! // URL url; ! // URLConnection connection; ! // Source source; ! // StringBuffer buffer; ! // int i; ! // String html; ! // ! // long old_total; ! // long new_total; ! // long begin; ! // long end; ! // StringReader reader; ! // NodeReader nodes; ! // Parser parser; ! // int nodecount; ! // Node node; ! // int charcount; ! // ! // url = new URL (link); ! // connection = url.openConnection (); ! // connection.connect (); ! // source = new Source (new Stream (connection.getInputStream ())); ! // buffer = new StringBuffer (350000); ! // while (-1 != (i = source.read ())) ! // buffer.append ((char)i); ! // source.close (); ! // html = buffer.toString (); ! // old_total = 0; ! // new_total = 0; ! // for (i = 0; i < 5; i++) ! // { ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // Lexer lexer = new Lexer (html); ! // nodecount = 0; ! // while (null != (node = lexer.nextNode ())) ! // nodecount++; ! // end = System.currentTimeMillis (); ! // System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // new_total += (end - begin); ! // ! // System.gc (); ! // begin = System.currentTimeMillis (); ! 
// reader = new StringReader (html); ! // nodes = new NodeReader (new BufferedReader (reader), 350000); ! // parser = new Parser (nodes, null); ! // nodecount = 0; ! // while (null != (node = nodes.readElement ())) ! // nodecount++; ! // end = System.currentTimeMillis (); ! // System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // old_total += (end - begin); ! // } ! // assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! // System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! // } ! // ! // /** ! // * Test the relative speed reading from a string parsing tags too. ! // */ ! // public void testSpeedStringWithTags () throws ParserException, IOException ! // { ! // final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! // URL url; ! // URLConnection connection; ! // Source source; ! // StringBuffer buffer; ! // int i; ! // String html; ! // ! // long old_total; ! // long new_total; ! // long begin; ! // long end; ! // StringReader reader; ! // NodeReader nodes; ! // Parser parser; ! // int nodecount; ! // Node node; ! // int charcount; ! // ! // url = new URL (link); ! // connection = url.openConnection (); ! // connection.connect (); ! // source = new Source (new Stream (connection.getInputStream ())); ! // buffer = new StringBuffer (350000); ! // while (-1 != (i = source.read ())) ! // buffer.append ((char)i); ! // source.close (); ! // html = buffer.toString (); ! // old_total = 0; ! // new_total = 0; ! // for (i = 0; i < 5; i++) ! // { ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // Lexer lexer = new Lexer (html); ! // nodecount = 0; ! // while (null != (node = lexer.nextNode ())) ! // { ! // nodecount++; ! // if (node instanceof TagNode) ! // ((TagNode)node).getAttributes (); ! // } ! 
// end = System.currentTimeMillis (); ! // System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // new_total += (end - begin); ! // ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // reader = new StringReader (html); ! // nodes = new NodeReader (new BufferedReader (reader), 350000); ! // parser = new Parser (nodes, null); ! // nodecount = 0; ! // while (null != (node = nodes.readElement ())) ! // { ! // nodecount++; ! // if (node instanceof Tag) ! // ((Tag)node).getAttributes (); ! // } ! // end = System.currentTimeMillis (); ! // System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // old_total += (end - begin); ! // } ! // assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! // System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! // } ! // ! // public void testSpeedStreamWithoutTags () throws ParserException, IOException ! // { ! // final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! // URL url; ! // URLConnection connection; ! // Source source; ! // StringBuffer buffer; ! // int i; ! // String html; ! // InputStream stream; ! // ! // long old_total; ! // long new_total; ! // long begin; ! // long end; ! // InputStreamReader reader; ! // NodeReader nodes; ! // Parser parser; ! // int nodecount; ! // Node node; ! // int charcount; ! // ! // url = new URL (link); ! // connection = url.openConnection (); ! // connection.connect (); ! // source = new Source (new Stream (connection.getInputStream ())); ! // buffer = new StringBuffer (350000); ! // while (-1 != (i = source.read ())) ! // buffer.append ((char)i); ! // source.close (); ! // html = buffer.toString (); ! // old_total = 0; ! // new_total = 0; ! // ! 
// for (i = 0; i < 5; i++) ! // { ! // ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! // Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET)); ! // nodecount = 0; ! // while (null != (node = lexer.nextNode ())) ! // nodecount++; ! // end = System.currentTimeMillis (); ! // System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // new_total += (end - begin); ! // ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! // reader = new InputStreamReader (stream); ! // nodes = new NodeReader (reader, 350000); ! // parser = new Parser (nodes, null); ! // nodecount = 0; ! // while (null != (node = nodes.readElement ())) ! // nodecount++; ! // end = System.currentTimeMillis (); ! // System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // old_total += (end - begin); ! // ! // } ! // assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! // System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! // } ! // ! // public void testSpeedStreamWithTags () throws ParserException, IOException ! // { ! // final String link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html"; ! // URL url; ! // URLConnection connection; ! // Source source; ! // StringBuffer buffer; ! // int i; ! // String html; ! // InputStream stream; ! // ! // long old_total; ! // long new_total; ! // long begin; ! // long end; ! // InputStreamReader reader; ! // NodeReader nodes; ! // Parser parser; ! // int nodecount; ! // Node node; ! // int charcount; ! // ! // url = new URL (link); ! 
// connection = url.openConnection (); ! // connection.connect (); ! // source = new Source (new Stream (connection.getInputStream ())); ! // buffer = new StringBuffer (350000); ! // while (-1 != (i = source.read ())) ! // buffer.append ((char)i); ! // source.close (); ! // html = buffer.toString (); ! // old_total = 0; ! // new_total = 0; ! // ! // for (i = 0; i < 5; i++) ! // { ! // ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! // Lexer lexer = new Lexer (new Page (stream, Page.DEFAULT_CHARSET)); ! // nodecount = 0; ! // while (null != (node = lexer.nextNode ())) ! // { ! // nodecount++; ! // if (node instanceof TagNode) ! // ((TagNode)node).getAttributes (); ! // } ! // end = System.currentTimeMillis (); ! // System.out.println (" lexer: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // new_total += (end - begin); ! // ! // System.gc (); ! // begin = System.currentTimeMillis (); ! // stream = new ByteArrayInputStream (html.getBytes (Page.DEFAULT_CHARSET)); ! // reader = new InputStreamReader (stream); ! // nodes = new NodeReader (reader, 350000); ! // parser = new Parser (nodes, null); ! // nodecount = 0; ! // while (null != (node = nodes.readElement ())) ! // { ! // nodecount++; ! // if (node instanceof Tag) ! // ((Tag)node).getAttributes (); ! // } ! // end = System.currentTimeMillis (); ! // System.out.println ("old reader: " + (end - begin) + " msec, " + nodecount + " nodes"); ! // if (0 != i) // the first timing is way different ! // old_total += (end - begin); ! // } ! // assertTrue ("old parser is" + ((double)(new_total - old_total)/(double)old_total*100.0) + "% faster", new_total < old_total); ! // System.out.println ("lexer is " + ((double)(old_total - new_total)/(double)old_total*100.0) + "% faster"); ! // } // public static void main (String[] args) throws ParserException, IOException |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests/scannersTests Modified Files: CompositeTagScannerTest.java ImageScannerTest.java LinkScannerTest.java MetaTagScannerTest.java TagScannerTest.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. 
Index: CompositeTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** CompositeTagScannerTest.java 22 Sep 2003 02:40:09 -0000 1.39 --- CompositeTagScannerTest.java 28 Sep 2003 15:33:59 -0000 1.40 *************** *** 33,37 **** import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.CompositeTagData; --- 33,36 ---- *************** *** 449,453 **** customTag.toHtml() ); ! EndTag endTag = (EndTag)node[2]; assertStringEquals( "first custom tag html", --- 448,452 ---- customTag.toHtml() ); ! Tag endTag = (Tag)node[2]; assertStringEquals( "first custom tag html", *************** *** 496,500 **** assertSame("parent and custom tag should be the same",customTag,parent); ! EndTag endTag = (EndTag)node[2]; assertStringEquals( "first custom tag html", --- 495,499 ---- assertSame("parent and custom tag should be the same",customTag,parent); ! Tag endTag = (Tag)node[2]; assertStringEquals( "first custom tag html", Index: ImageScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ImageScannerTest.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** ImageScannerTest.java 22 Sep 2003 02:40:10 -0000 1.31 --- ImageScannerTest.java 28 Sep 2003 15:33:59 -0000 1.32 *************** *** 73,80 **** public void testExtractImageLocnInvertedCommasBug() throws ParserException { ! Tag tag = new Tag(new TagData(0,0,"img width=638 height=53 border=0 usemap=\"#m\" src=http://us.a1.yimg.com/us.yimg.com/i/ww/m5v5.gif alt=Yahoo","")); ! String url = "c:\\cvs\\html\\binaries\\yahoo.htm"; ! 
ImageScanner scanner = new ImageScanner("-i",new LinkProcessor()); ! assertEquals("Extracted Image Locn","http://us.a1.yimg.com/us.yimg.com/i/ww/m5v5.gif",scanner.extractImageLocn(tag,url)); } --- 73,81 ---- public void testExtractImageLocnInvertedCommasBug() throws ParserException { ! fail ("not implemented"); ! // Tag tag = new Tag(new TagData(0,0,"img width=638 height=53 border=0 usemap=\"#m\" src=http://us.a1.yimg.com/us.yimg.com/i/ww/m5v5.gif alt=Yahoo","")); ! // String url = "c:\\cvs\\html\\binaries\\yahoo.htm"; ! // ImageScanner scanner = new ImageScanner("-i",new LinkProcessor()); ! // assertEquals("Extracted Image Locn","http://us.a1.yimg.com/us.yimg.com/i/ww/m5v5.gif",scanner.extractImageLocn(tag,url)); } Index: LinkScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/LinkScannerTest.java,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** LinkScannerTest.java 22 Sep 2003 02:40:11 -0000 1.37 --- LinkScannerTest.java 28 Sep 2003 15:33:59 -0000 1.38 *************** *** 35,39 **** import org.htmlparser.StringNode; import org.htmlparser.scanners.LinkScanner; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; --- 35,38 ---- *************** *** 158,166 **** public void testExtractLinkInvertedCommasBug() throws ParserException { ! String tagContents = "a href=r/anorth/top.html"; ! Tag tag = new Tag(new TagData(0,0,tagContents,"")); ! String url = "c:\\cvs\\html\\binaries\\yahoo.htm"; ! LinkScanner scanner = new LinkScanner("-l"); ! assertEquals("Extracted Link","r/anorth/top.html",scanner.extractLink(tag,url)); } --- 157,166 ---- public void testExtractLinkInvertedCommasBug() throws ParserException { ! fail ("not implemented"); ! // String tagContents = "a href=r/anorth/top.html"; ! // Tag tag = new Tag(new TagData(0,0,tagContents,"")); ! 
// String url = "c:\\cvs\\html\\binaries\\yahoo.htm"; ! // LinkScanner scanner = new LinkScanner("-l"); ! // assertEquals("Extracted Link","r/anorth/top.html",scanner.extractLink(tag,url)); } *************** *** 281,297 **** public void testReplaceFaultyTagWithEndTag() throws ParserException { ! String currentLine = "<p>Site Comments?<br><a href=\"mailto:sa...@ne...?subject=Site Comments\">Mail Us<a></p>"; ! Tag tag = new Tag(new TagData(85,87,"a",currentLine)); ! LinkScanner linkScanner = new LinkScanner(); ! String newLine = linkScanner.replaceFaultyTagWithEndTag(tag,currentLine); ! assertEquals("Expected replacement","<p>Site Comments?<br><a href=\"mailto:sa...@ne...?subject=Site Comments\">Mail Us</A></p>",newLine); } public void testInsertEndTagBeforeTag() throws ParserException { ! String currentLine = "<a href=s/7509><b>Yahoo! Movies</b></a>"; ! Tag tag = new Tag(new TagData(0,14,"a href=s/7509",currentLine)); ! LinkScanner linkScanner = new LinkScanner(); ! String newLine = linkScanner.insertEndTagBeforeNode(tag,currentLine); ! assertEquals("Expected insertion","</A><a href=s/7509><b>Yahoo! Movies</b></a>",newLine); } --- 281,299 ---- public void testReplaceFaultyTagWithEndTag() throws ParserException { ! fail ("not implemented"); ! // String currentLine = "<p>Site Comments?<br><a href=\"mailto:sa...@ne...?subject=Site Comments\">Mail Us<a></p>"; ! // Tag tag = new Tag(new TagData(85,87,"a",currentLine)); ! // LinkScanner linkScanner = new LinkScanner(); ! // String newLine = linkScanner.replaceFaultyTagWithEndTag(tag,currentLine); ! // assertEquals("Expected replacement","<p>Site Comments?<br><a href=\"mailto:sa...@ne...?subject=Site Comments\">Mail Us</A></p>",newLine); } public void testInsertEndTagBeforeTag() throws ParserException { ! fail ("not implemented"); ! // String currentLine = "<a href=s/7509><b>Yahoo! Movies</b></a>"; ! // Tag tag = new Tag(new TagData(0,14,"a href=s/7509",currentLine)); ! // LinkScanner linkScanner = new LinkScanner(); ! 
// String newLine = linkScanner.insertEndTagBeforeNode(tag,currentLine); ! // assertEquals("Expected insertion","</A><a href=s/7509><b>Yahoo! Movies</b></a>",newLine); } *************** *** 313,319 **** StringNode stringNode = (StringNode)node[1]; assertEquals("StringNode Contents","Revision",stringNode.getText()); ! assertTrue("Node 2 should be a string node",node[2] instanceof EndTag); ! EndTag endTag = (EndTag)node[2]; ! assertEquals("End Tag Contents","a",endTag.getText()); } --- 315,322 ---- StringNode stringNode = (StringNode)node[1]; assertEquals("StringNode Contents","Revision",stringNode.getText()); ! assertTrue("Node 2 should be an end tag",node[2] instanceof Tag); ! tag = (Tag)node[2]; ! assertTrue("Node 2 should be an end tag",tag.isEndTag ()); ! assertEquals("End Tag Contents","a",tag.getText()); } *************** *** 374,383 **** Tag tag2 = (Tag)containedNodes[2]; assertEquals("Tag Contents","b",tag2.getText()); ! assertTrue("Fourth contained node should be HTMLEndTag",containedNodes[3] instanceof EndTag); ! EndTag endTag1 = (EndTag)containedNodes[3]; ! assertEquals("Fourth Tag contents","b",endTag1.getText()); ! assertTrue("Fifth contained node should be HTMLEndTag",containedNodes[4] instanceof EndTag); ! EndTag endTag2 = (EndTag)containedNodes[4]; ! assertEquals("Fifth Tag contents","font",endTag2.getText()); } --- 377,388 ---- Tag tag2 = (Tag)containedNodes[2]; assertEquals("Tag Contents","b",tag2.getText()); ! assertTrue("Fourth contained node should be a Tag",containedNodes[3] instanceof Tag); ! Tag tag = (Tag)containedNodes[3]; ! assertTrue("Fourth contained node should be an EndTag",tag.isEndTag ()); ! assertEquals("Fourth Tag contents","b",tag.getText()); ! assertTrue("Fifth contained node should be a Tag",containedNodes[4] instanceof Tag); ! tag = (Tag)containedNodes[4]; ! assertTrue("Fifth contained node should be an EndTag",tag.isEndTag ()); ! 
assertEquals("Fifth Tag contents","font",tag.getText()); } Index: MetaTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/MetaTagScannerTest.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** MetaTagScannerTest.java 22 Sep 2003 02:40:11 -0000 1.28 --- MetaTagScannerTest.java 28 Sep 2003 15:33:59 -0000 1.29 *************** *** 30,35 **** import org.htmlparser.scanners.MetaTagScanner; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.MetaTag; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.ParserException; --- 30,35 ---- import org.htmlparser.scanners.MetaTagScanner; import org.htmlparser.tags.MetaTag; + import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.ParserException; *************** *** 56,60 **** parseAndAssertNodeCount(11); ! assertTrue("Node 5 should be End Tag",node[5] instanceof EndTag); assertTrue("Node 6 should be META Tag",node[6] instanceof MetaTag); MetaTag metaTag; --- 56,60 ---- parseAndAssertNodeCount(11); ! 
assertTrue("Node 5 should be End Tag",node[5] instanceof Tag && ((Tag)node[5]).isEndTag ()); assertTrue("Node 6 should be META Tag",node[6] instanceof MetaTag); MetaTag metaTag; Index: TagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/TagScannerTest.java,v retrieving revision 1.29 retrieving revision 1.30 diff -C2 -d -r1.29 -r1.30 *** TagScannerTest.java 22 Sep 2003 02:40:11 -0000 1.29 --- TagScannerTest.java 28 Sep 2003 15:33:59 -0000 1.30 *************** *** 30,34 **** package org.htmlparser.tests.scannersTests; import org.htmlparser.Node; - import org.htmlparser.NodeReader; import org.htmlparser.Parser; import org.htmlparser.scanners.TagScanner; --- 30,33 ---- *************** *** 54,96 **** public void testExtractXMLData() throws ParserException { ! createParser( ! "<MESSAGE>\n"+ ! "Abhi\n"+ ! "Sri\n"+ ! "</MESSAGE>"); ! Parser.setLineSeparator("\r\n"); ! NodeIterator e = parser.elements(); ! ! Node node = e.nextNode(); ! try { ! String result = TagScanner.extractXMLData(node,"MESSAGE",parser.getReader()); ! assertEquals("Result","Abhi\r\nSri\r\n",result); ! } ! catch (ParserException ex) { ! assertTrue(e.toString(),false); ! } } public void testExtractXMLDataSingle() throws ParserException { ! createParser( ! "<MESSAGE>Test</MESSAGE>"); ! NodeIterator e = parser.elements(); ! ! Node node = (Node)e.nextNode(); ! try { ! String result = TagScanner.extractXMLData(node,"MESSAGE",parser.getReader()); ! assertEquals("Result","Test",result); ! } ! catch (ParserException ex) { ! assertTrue(e.toString(),false); ! } } public void testTagExtraction() { ! String testHTML = "<AREA \n coords=0,0,52,52 href=\"http://www.yahoo.com/r/c1\" shape=RECT>"; ! createParser(testHTML); ! Tag tag = Tag.find(parser.getReader(),testHTML,0); ! assertNotNull(tag); } --- 53,98 ---- public void testExtractXMLData() throws ParserException { ! fail ("not implemented"); ! 
// createParser( ! // "<MESSAGE>\n"+ ! // "Abhi\n"+ ! // "Sri\n"+ ! // "</MESSAGE>"); ! // Parser.setLineSeparator("\r\n"); ! // NodeIterator e = parser.elements(); ! // ! // Node node = e.nextNode(); ! // try { ! // String result = TagScanner.extractXMLData(node,"MESSAGE",parser.getReader()); ! // assertEquals("Result","Abhi\r\nSri\r\n",result); ! // } ! // catch (ParserException ex) { ! // assertTrue(e.toString(),false); ! // } } public void testExtractXMLDataSingle() throws ParserException { ! fail ("not implemented"); ! // createParser( ! // "<MESSAGE>Test</MESSAGE>"); ! // NodeIterator e = parser.elements(); ! // ! // Node node = (Node)e.nextNode(); ! // try { ! // String result = TagScanner.extractXMLData(node,"MESSAGE",parser.getReader()); ! // assertEquals("Result","Test",result); ! // } ! // catch (ParserException ex) { ! // assertTrue(e.toString(),false); ! // } } public void testTagExtraction() { ! fail ("not implemented"); ! // String testHTML = "<AREA \n coords=0,0,52,52 href=\"http://www.yahoo.com/r/c1\" shape=RECT>"; ! // createParser(testHTML); ! // Tag tag = Tag.find(parser.getReader(),testHTML,0); ! // assertNotNull(tag); } *************** *** 115,129 **** public void testRemoveChars2() { ! String test = "hello\r\nworld\r\n\tqsdsds"; ! TagScanner scanner = new TagScanner() { ! public Tag scan(Tag tag,String url,NodeReader reader,String currLine) { return null;} ! public boolean evaluate(String s,TagScanner previousOpenScanner) { return false; } ! public String [] getID() { ! return null; ! } ! ! }; ! String result = scanner.removeChars(test,"\r\n"); ! assertEquals("Removing Chars","helloworld\tqsdsds",result); } --- 117,132 ---- public void testRemoveChars2() { ! fail ("not implemented"); ! // String test = "hello\r\nworld\r\n\tqsdsds"; ! // TagScanner scanner = new TagScanner() { ! // public Tag scan(Tag tag,String url,NodeReader reader,String currLine) { return null;} ! 
// public boolean evaluate(String s,TagScanner previousOpenScanner) { return false; } ! // public String [] getID() { ! // return null; ! // } ! // ! // }; ! // String result = scanner.removeChars(test,"\r\n"); ! // assertEquals("Removing Chars","helloworld\tqsdsds",result); } |
From: <der...@us...> - 2003-09-28 19:37:34
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/temporaryFailures In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests/temporaryFailures Modified Files: AttributeParserTest.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. 
Index: AttributeParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/temporaryFailures/AttributeParserTest.java,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** AttributeParserTest.java 22 Sep 2003 02:40:13 -0000 1.14 --- AttributeParserTest.java 28 Sep 2003 15:33:59 -0000 1.15 *************** *** 38,42 **** import org.htmlparser.Parser; - import org.htmlparser.parserHelper.AttributeParser; import org.htmlparser.tags.Tag; import org.htmlparser.tags.data.TagData; --- 38,41 ---- *************** *** 44,48 **** public class AttributeParserTest extends ParserTestCase { - private AttributeParser parser; private Tag tag; private Hashtable table; --- 43,46 ---- *************** *** 53,63 **** protected void setUp() { ! parser = new AttributeParser(); } public void getParameterTableFor(String tagContents) { ! tag = new Tag(new TagData(0,0,tagContents,"")); ! table = parser.parseAttributes(tag.getText ()); ! } --- 51,62 ---- protected void setUp() { ! //parser = new AttributeParser(); ! fail ("needs to be reworked"); } public void getParameterTableFor(String tagContents) { ! fail ("needs to be reworked"); ! // tag = new Tag(new TagData(0,0,tagContents,"")); ! // table = parser.parseAttributes(tag.getText ()); } |
From: <der...@us...> - 2003-09-28 19:30:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv11047/util Modified Files: LinkProcessor.java Log Message: Fixed up the broken visitor logic. Added some docos on NodeVisitor. Index: LinkProcessor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/LinkProcessor.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** LinkProcessor.java 22 Sep 2003 02:40:15 -0000 1.27 --- LinkProcessor.java 28 Sep 2003 19:30:04 -0000 1.28 *************** *** 70,73 **** --- 70,75 ---- if (null == link) link = ""; + else + link = stripQuotes (link); if (null != getBaseUrl ()) base = getBaseUrl (); *************** *** 88,91 **** --- 90,109 ---- } + /** + * Remove double or single quotes from the string. + */ + public String stripQuotes (String string) + { + //remove any double quotes from around charset string + if (string.startsWith ("\"") && string.endsWith ("\"") && (1 < string.length ())) + string = string.substring (1, string.length () - 1); + + //remove any single quote from around charset string + if (string.startsWith ("'") && string.endsWith ("'") && (1 < string.length ())) + string = string.substring (1, string.length () - 1); + + return (string); + } + public URL constructUrl(String link, String base) throws MalformedURLException { |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors In directory sc8-pr-cvs1:/tmp/cvs-serv11047/visitors Modified Files: HtmlPage.java NodeVisitor.java TagFindingVisitor.java TextExtractingVisitor.java UrlModifyingVisitor.java Log Message: Fixed up the broken visitor logic. Added some docos on NodeVisitor. Index: HtmlPage.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/HtmlPage.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** HtmlPage.java 28 Sep 2003 15:34:00 -0000 1.35 --- HtmlPage.java 28 Sep 2003 19:30:04 -0000 1.36 *************** *** 64,85 **** public void visitTag(Tag tag) { ! if (tag.isEndTag ()) ! { if (isBodyTag(tag)) ! bodyTagBegin = false; ! addTagToBodyIfApplicable(tag); } ! else ! { ! addTagToBodyIfApplicable(tag); ! if (isTable(tag)) { ! tables.add(tag); ! } ! else { ! if (isBodyTag(tag)) ! bodyTagBegin = true; ! } ! } } --- 64,83 ---- public void visitTag(Tag tag) { ! addTagToBodyIfApplicable(tag); ! ! if (isTable(tag)) { ! tables.add(tag); ! } ! else { if (isBodyTag(tag)) ! bodyTagBegin = true; } ! } ! public void visitEndTag(Tag tag) ! { ! if (isBodyTag(tag)) ! bodyTagBegin = false; ! addTagToBodyIfApplicable(tag); } Index: NodeVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/NodeVisitor.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** NodeVisitor.java 28 Sep 2003 15:34:00 -0000 1.30 --- NodeVisitor.java 28 Sep 2003 19:30:04 -0000 1.31 *************** *** 36,90 **** import org.htmlparser.tags.TitleTag; ! public abstract class NodeVisitor { ! private boolean recurseChildren; ! private boolean recurseSelf; ! ! public NodeVisitor() { ! this(true); } ! ! public NodeVisitor(boolean recurseChildren) { ! this.recurseChildren = recurseChildren; ! 
this.recurseSelf = true; } ! ! public NodeVisitor(boolean recurseChildren,boolean recurseSelf) { ! this.recurseChildren = recurseChildren; ! this.recurseSelf = recurseSelf; } ! public void visitTag(Tag tag) { ! } ! public void visitStringNode(StringNode stringNode) { } ! ! public void visitLinkTag(LinkTag linkTag) { } ! ! public void visitImageTag(ImageTag imageTag) { } ! ! public void visitTitleTag(TitleTag titleTag) { ! } ! public void visitRemarkNode(RemarkNode remarkNode) { ! } ! public boolean shouldRecurseChildren() { ! return recurseChildren; } ! ! public boolean shouldRecurseSelf() { ! return recurseSelf; } ! /** ! * Override this method if you wish to do special ! * processing upon completion of parsing ! */ ! public void finishedParsing() { } } --- 36,154 ---- import org.htmlparser.tags.TitleTag; ! /** ! * The base class for the 'Visitor' pattern. ! * Classes that wish to use <code>visitAllNodesWith()</code> will subclass ! * this class and provide implementations for methods they are interested in ! * processing.<p> ! * The operation of <code>visitAllNodesWith()</code> is to call ! * <code>beginParsing()</code>, then <code>visitXXX()</code> according to the ! * types of nodes encountered in depth-first order and finally ! * <code>finishedParsing()</code>.<p> ! * There are currently three specialized <code>visitXXX()</code> calls for ! * titles, images and links. Thes call their specialized visit, and then ! * perform the generic processing. ! * Typical code to print all the link tags: ! * <pre> ! * import org.htmlparser.Parser; ! * import org.htmlparser.tags.LinkTag; ! * import org.htmlparser.util.ParserException; ! * import org.htmlparser.visitors.NodeVisitor; ! * ! * public class Visitor extends NodeVisitor ! * { ! * public Visitor () ! * { ! * } ! * public void visitLinkTag (LinkTag linkTag) ! * { ! * System.out.println (linkTag); ! * } ! * public static void main (String[] args) throws ParserException ! * { ! 
* Parser parser = new Parser ("http://cbc.ca"); ! * parser.registerScanners (); ! * Visitor visitor = new Visitor (); ! * parser.visitAllNodesWith (visitor); ! * } ! * } ! * </pre> ! */ ! public abstract class NodeVisitor ! { ! private boolean mRecurseChildren; ! private boolean mRecurseSelf; ! ! public NodeVisitor () ! { ! this (true); } ! ! public NodeVisitor (boolean recurseChildren) ! { ! this (recurseChildren, true); } ! ! public NodeVisitor (boolean recurseChildren, boolean recurseSelf) ! { ! mRecurseChildren = recurseChildren; ! mRecurseSelf = recurseSelf; } ! /** ! * Override this method if you wish to do special ! * processing prior to the start of parsing. ! */ ! public void beginParsing () ! { } ! public void visitTag (Tag tag) ! { ! } ! ! public void visitEndTag (Tag tag) ! { ! } ! ! public void visitStringNode (StringNode stringNode) ! { } ! ! public void visitRemarkNode (RemarkNode remarkNode) ! { ! } ! ! /** ! * Override this method if you wish to do special ! * processing upon completion of parsing. ! */ ! public void finishedParsing () ! { } ! public void visitLinkTag (LinkTag linkTag) ! { } ! ! public void visitImageTag (ImageTag imageTag) ! { ! } ! ! public void visitTitleTag (TitleTag titleTag) ! { ! } ! public boolean shouldRecurseChildren () ! { ! return (mRecurseChildren); ! } ! ! public boolean shouldRecurseSelf () ! { ! 
return (mRecurseSelf); } } Index: TagFindingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TagFindingVisitor.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** TagFindingVisitor.java 28 Sep 2003 15:34:00 -0000 1.36 --- TagFindingVisitor.java 28 Sep 2003 19:30:04 -0000 1.37 *************** *** 69,86 **** public void visitTag(Tag tag) { - if (tag.isEndTag ()) - { - if (!endTagCheck) return; - for (int i=0;i<tagsToBeFound.length;i++) - if (tag.getTagName().substring (1).equalsIgnoreCase(tagsToBeFound[i])) - { - endTagCount[i]++; - endTags[i].add(tag); - } - } for (int i=0;i<tagsToBeFound.length;i++) if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { count[i]++; tags[i].add(tag); } } --- 69,87 ---- public void visitTag(Tag tag) { for (int i=0;i<tagsToBeFound.length;i++) if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) { count[i]++; tags[i].add(tag); + } + } + + public void visitEndTag(Tag tag) + { + if (!endTagCheck) return; + for (int i=0;i<tagsToBeFound.length;i++) + if (tag.getTagName().equalsIgnoreCase(tagsToBeFound[i])) + { + endTagCount[i]++; + endTags[i].add(tag); } } Index: TextExtractingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** TextExtractingVisitor.java 28 Sep 2003 15:34:00 -0000 1.34 --- TextExtractingVisitor.java 28 Sep 2003 19:30:04 -0000 1.35 *************** *** 67,74 **** } - public void visitTitleTag(TitleTag titleTag) { - textAccumulator.append(titleTag.getTitle ()); - } - private String replaceNonBreakingSpaceWithOrdinarySpace(String text) { return text.replace('\u00a0',' '); --- 67,70 ---- *************** *** 77,90 **** public void visitTag(Tag tag) { ! if (tag.isEndTag ()) ! { ! 
if (isPreTag(tag)) ! preTagBeingProcessed = false; ! } ! else ! { ! if (isPreTag(tag)) ! preTagBeingProcessed = true; ! } } --- 73,84 ---- public void visitTag(Tag tag) { ! if (isPreTag(tag)) ! preTagBeingProcessed = true; ! } ! ! public void visitEndTag(Tag tag) ! { ! if (isPreTag(tag)) ! preTagBeingProcessed = false; } Index: UrlModifyingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/UrlModifyingVisitor.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** UrlModifyingVisitor.java 28 Sep 2003 15:34:00 -0000 1.33 --- UrlModifyingVisitor.java 28 Sep 2003 19:30:05 -0000 1.34 *************** *** 62,66 **** public void visitImageTag(ImageTag imageTag) { imageTag.setImageURL(linkPrefix + imageTag.getImageURL()); - modifiedResult.append(imageTag.toHtml()); } --- 62,65 ---- *************** *** 71,78 **** public void visitTag(Tag tag) { ! if (tag.isEndTag ()) ! modifiedResult.append(tag.toHtml()); ! else ! modifiedResult.append(tag.toHtml()); } --- 70,79 ---- public void visitTag(Tag tag) { ! modifiedResult.append(tag.toHtml()); ! } ! ! public void visitEndTag(Tag tag) ! { ! modifiedResult.append(tag.toHtml()); } |
From: <der...@us...> - 2003-09-28 19:30:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv11047 Modified Files: Parser.java RemarkNode.java StringNode.java Log Message: Fixed up the broken visitor logic. Added some docos on NodeVisitor. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.62 retrieving revision 1.63 diff -C2 -d -r1.62 -r1.63 *** Parser.java 28 Sep 2003 15:33:57 -0000 1.62 --- Parser.java 28 Sep 2003 19:30:03 -0000 1.63 *************** *** 862,865 **** --- 862,866 ---- public void visitAllNodesWith(NodeVisitor visitor) throws ParserException { Node node; + visitor.beginParsing(); for (NodeIterator e = elements();e.hasMoreNodes();) { node = e.nextNode(); Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.32 retrieving revision 1.33 diff -C2 -d -r1.32 -r1.33 *** RemarkNode.java 28 Sep 2003 15:33:57 -0000 1.32 --- RemarkNode.java 28 Sep 2003 19:30:03 -0000 1.33 *************** *** 84,88 **** } ! public void accept(Object visitor) { ((NodeVisitor)visitor).visitRemarkNode (this); --- 84,93 ---- } ! /** ! * Remark visiting code. ! * @param visitor The <code>NodeVisitor</code> object to invoke ! * <code>visitRemarkNode()</code> on. ! */ ! public void accept (Object visitor) { ((NodeVisitor)visitor).visitRemarkNode (this); Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** StringNode.java 28 Sep 2003 15:33:57 -0000 1.40 --- StringNode.java 28 Sep 2003 19:30:03 -0000 1.41 *************** *** 78,82 **** } ! 
public void accept(Object visitor) { ((NodeVisitor)visitor).visitStringNode (this); --- 78,87 ---- } ! /** ! * String visiting code. ! * @param visitor The <code>NodeVisitor</code> object to invoke ! * <code>visitStringNode()</code> on. ! */ ! public void accept (Object visitor) { ((NodeVisitor)visitor).visitStringNode (this); |
From: <der...@us...> - 2003-09-28 19:30:55
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv11047/tags Modified Files: CompositeTag.java ImageTag.java LinkTag.java Tag.java TitleTag.java Log Message: Fixed up the broken visitor logic. Added some docos on NodeVisitor. Index: CompositeTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v retrieving revision 1.56 retrieving revision 1.57 diff -C2 -d -r1.56 -r1.57 *** CompositeTag.java 28 Sep 2003 15:33:58 -0000 1.56 --- CompositeTag.java 28 Sep 2003 19:30:04 -0000 1.57 *************** *** 319,343 **** /** ! * Handle a visitor. ! * <em>NOTE: This currently defers to accept(NodeVisitor), but eventually ! * subclasses of Node should be overriding accept(Object) directly.</em> ! * @param visitor The <code>NodeVisitor</code> object. */ ! public void accept(Object visitor) { ! accept ((NodeVisitor)visitor); ! } ! public void accept(NodeVisitor visitor) { ! if (visitor.shouldRecurseChildren()) { ! startTag.accept(visitor); ! SimpleNodeIterator children = children(); ! while (children.hasMoreNodes()) { ! Node child = (Node)children.nextNode(); ! child.accept(visitor); } ! endTag.accept(visitor); } ! if (visitor.shouldRecurseSelf()) ! visitor.visitTag(this); } --- 319,349 ---- /** ! * Tag visiting code. ! * Invokes <code>accept()</code> on the start tag and then ! * walks the child list invoking <code>accept()</code> on each ! * of the children, finishing up with an <code>accept()</code> ! * call on the end tag. If <code>shouldRecurseSelf()</code> ! * returns true it then asks the visitor to visit itself. ! * @param visitor The <code>NodeVisitor</code> object to be signalled ! * for each child and possibly this tag. */ ! public void accept (NodeVisitor visitor) ! { ! SimpleNodeIterator children; ! Node child; ! if (visitor.shouldRecurseChildren ()) ! { ! startTag.accept (visitor); ! children = children (); ! 
while (children.hasMoreNodes ()) ! { ! child = (Node)children.nextNode (); ! child.accept (visitor); } ! endTag.accept (visitor); } ! if (visitor.shouldRecurseSelf ()) ! visitor.visitTag (this); } Index: ImageTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ImageTag.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** ImageTag.java 28 Sep 2003 15:33:58 -0000 1.28 --- ImageTag.java 28 Sep 2003 19:30:04 -0000 1.29 *************** *** 72,77 **** } ! public void accept(NodeVisitor visitor) { ! visitor.visitImageTag(this); } --- 72,86 ---- } ! /** ! * Image visiting code. ! * Invokes <code>visitImageTag()</code> on the visitor and then ! * invokes the normal tag processing. ! * @param visitor The <code>NodeVisitor</code> object to invoke ! * <code>visitImageTag()</code> on. ! */ ! public void accept (NodeVisitor visitor) ! { ! visitor.visitImageTag (this); ! super.accept (visitor); } Index: LinkTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/LinkTag.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** LinkTag.java 22 Sep 2003 02:40:01 -0000 1.35 --- LinkTag.java 28 Sep 2003 19:30:04 -0000 1.36 *************** *** 251,257 **** } ! public void accept(NodeVisitor visitor) { ! visitor.visitLinkTag(this); ! super.accept(visitor); } } --- 251,265 ---- } ! /** ! * Link visiting code. ! * Invokes <code>visitLinkTag()</code> on the visitor and then ! * invokes the normal tag processing. ! * @param visitor The <code>NodeVisitor</code> object to invoke ! * <code>visitLinkTag()</code> on. ! */ ! public void accept (NodeVisitor visitor) ! { ! visitor.visitLinkTag (this); ! 
super.accept (visitor); } } Index: Tag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/Tag.java,v retrieving revision 1.49 retrieving revision 1.50 diff -C2 -d -r1.49 -r1.50 *** Tag.java 28 Sep 2003 15:33:58 -0000 1.49 --- Tag.java 28 Sep 2003 19:30:04 -0000 1.50 *************** *** 116,122 **** } public void accept (Object visitor) { ! ((NodeVisitor)visitor).visitTag (this); } } --- 116,144 ---- } + /** + * Handle a visitor. + * <em>NOTE: This currently defers to accept(NodeVisitor). If + * subclasses of Node override accept(Object) directly, they must + * handle the delegation to <code>visitTag()</code> and + * <code>visitEndTag()</code>.</em> + * @param visitor The <code>NodeVisitor</code> object + * (a cast is performed without checking). + */ public void accept (Object visitor) { ! accept ((NodeVisitor)visitor); ! } ! ! /** ! * Default tag visiting code. ! * Based on <code>isEndTag()</code>, calls either <code>visitTag()</code> or ! * <code>visitEndTag()</code>. ! */ ! public void accept (NodeVisitor visitor) ! { ! if (isEndTag ()) ! ((NodeVisitor)visitor).visitEndTag (this); ! else ! ((NodeVisitor)visitor).visitTag (this); } } Index: TitleTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/TitleTag.java,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** TitleTag.java 22 Sep 2003 02:40:02 -0000 1.26 --- TitleTag.java 28 Sep 2003 19:30:04 -0000 1.27 *************** *** 50,56 **** } ! public void accept(NodeVisitor visitor) { ! visitor.visitTitleTag(this); } - } --- 50,64 ---- } ! /** ! * Title visiting code. ! * Invokes <code>visitTitleTag()</code> on the visitor and then ! * invokes the normal tag processing. ! * @param visitor The <code>NodeVisitor</code> object to invoke ! * <code>visitTitleTag()</code> on. ! */ ! 
public void accept (NodeVisitor visitor) ! { ! visitor.visitTitleTag (this); ! super.accept (visitor); } } |
From: <der...@us...> - 2003-09-28 19:30:54
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans In directory sc8-pr-cvs1:/tmp/cvs-serv11047/beans Modified Files: StringBean.java Log Message: Fixed up the broken visitor logic. Added some docos on NodeVisitor. Index: StringBean.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** StringBean.java 28 Sep 2003 15:33:57 -0000 1.28 --- StringBean.java 28 Sep 2003 19:30:03 -0000 1.29 *************** *** 601,606 **** * Appends a newline to the output if the tag breaks flow, and * possibly sets the state of the PRE and SCRIPT flags. - * Possibly resets the state of the PRE and SCRIPT flags if it's - * an end tag. */ public void visitTag (Tag tag) --- 601,604 ---- *************** *** 609,628 **** name = tag.getTagName (); ! if (tag.isEndTag ()) ! { ! if (name.equalsIgnoreCase ("/PRE")) ! mIsPre = false; ! else if (name.equalsIgnoreCase ("/SCRIPT")) ! mIsScript = false; ! } ! else ! { ! if (name.equalsIgnoreCase ("PRE")) ! mIsPre = true; ! else if (name.equalsIgnoreCase ("SCRIPT")) ! mIsScript = true; ! if (tag.breaksFlow ()) ! carriage_return (); ! } } --- 607,631 ---- name = tag.getTagName (); ! if (name.equalsIgnoreCase ("PRE")) ! mIsPre = true; ! else if (name.equalsIgnoreCase ("SCRIPT")) ! mIsScript = true; ! if (tag.breaksFlow ()) ! carriage_return (); ! } ! ! /** ! * Resets the state of the PRE and SCRIPT flags. ! * @param tag The end tag to process. ! */ ! public void visitEndTag (Tag tag) ! { ! String name; ! ! name = tag.getTagName (); ! if (name.equalsIgnoreCase ("/PRE")) ! mIsPre = false; ! else if (name.equalsIgnoreCase ("/SCRIPT")) ! mIsScript = false; } |
From: <der...@us...> - 2003-09-28 19:30:54
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/visitorsTests In directory sc8-pr-cvs1:/tmp/cvs-serv11047/tests/visitorsTests Modified Files: UrlModifyingVisitorTest.java Log Message: Fixed up the broken visitor logic. Added some docos on NodeVisitor. Index: UrlModifyingVisitorTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/visitorsTests/UrlModifyingVisitorTest.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** UrlModifyingVisitorTest.java 22 Sep 2003 02:40:15 -0000 1.9 --- UrlModifyingVisitorTest.java 28 Sep 2003 19:30:04 -0000 1.10 *************** *** 37,48 **** "<HTML><BODY>" + "<A HREF=\"mylink.html\"><IMG SRC=\"mypic.jpg\">" + ! "</A><IMG SRC=\"mysecondimage.gif\">" + "</BODY></HTML>"; private static final String MODIFIED_HTML = "<HTML><BODY>" + ! "<A HREF=\"localhost://mylink.html\">" + ! "<IMG SRC=\"localhost://mypic.jpg\"></A>" + ! "<IMG SRC=\"localhost://mysecondimage.gif\">" + "</BODY></HTML>"; --- 37,49 ---- "<HTML><BODY>" + "<A HREF=\"mylink.html\"><IMG SRC=\"mypic.jpg\">" + ! "</A><IMG SRC=\"my second image.gif\">" + "</BODY></HTML>"; + // Note: links are only quoted if needed private static final String MODIFIED_HTML = "<HTML><BODY>" + ! "<A HREF=localhost://mylink.html>" + ! "<IMG SRC=localhost://mypic.jpg></A>" + ! "<IMG SRC=\"localhost://my second image.gif\">" + "</BODY></HTML>"; *************** *** 56,62 **** new UrlModifyingVisitor(parser, "localhost://"); parser.visitAllNodesWith(visitor); assertStringEquals("Expected HTML", MODIFIED_HTML, ! visitor.getModifiedResult()); } } --- 57,64 ---- new UrlModifyingVisitor(parser, "localhost://"); parser.visitAllNodesWith(visitor); + String result = visitor.getModifiedResult(); assertStringEquals("Expected HTML", MODIFIED_HTML, ! result); } } |
From: <der...@us...> - 2003-09-28 15:34:59
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer In directory sc8-pr-cvs1:/tmp/cvs-serv30684/lexer Modified Files: Cursor.java Lexer.java Page.java Source.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: Cursor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Cursor.java,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** Cursor.java 22 Sep 2003 02:39:59 -0000 1.8 --- Cursor.java 28 Sep 2003 15:33:57 -0000 1.9 *************** *** 81,84 **** --- 81,93 ---- /** + * Set the position of this cursor. + * @param The new cursor position. + */ + public void setPosition (int position) + { + mPosition = position; + } + + /** * Move the cursor position ahead one character. 
*/ Index: Lexer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Lexer.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** Lexer.java 22 Sep 2003 02:39:59 -0000 1.9 --- Lexer.java 28 Sep 2003 15:33:57 -0000 1.10 *************** *** 33,45 **** package org.htmlparser.lexer; ! import java.io.*; ! import java.net.*; ! import java.util.*; ! import org.htmlparser.*; ! import org.htmlparser.lexer.nodes.*; import org.htmlparser.lexer.nodes.RemarkNode; import org.htmlparser.lexer.nodes.StringNode; ! import org.htmlparser.util.*; /** --- 33,50 ---- package org.htmlparser.lexer; ! import java.io.IOException; ! import java.net.MalformedURLException; ! import java.net.URL; ! import java.net.URLConnection; ! import java.util.Vector; ! import org.htmlparser.Node; ! import org.htmlparser.lexer.nodes.AbstractNode; ! import org.htmlparser.lexer.nodes.Attribute; ! import org.htmlparser.lexer.nodes.NodeFactory; import org.htmlparser.lexer.nodes.RemarkNode; import org.htmlparser.lexer.nodes.StringNode; ! import org.htmlparser.lexer.nodes.TagNode; ! import org.htmlparser.util.ParserException; /** *************** *** 53,56 **** --- 58,63 ---- */ public class Lexer + implements + NodeFactory { /** *************** *** 65,68 **** --- 72,80 ---- /** + * The factory for new nodes. + */ + protected NodeFactory mFactory; + + /** * Creates a new instance of a Lexer. * @param page The page with HTML text. *************** *** 70,75 **** public Lexer (Page page) { ! mPage = page; ! mCursor = new Cursor (page, 0); } --- 82,88 ---- public Lexer (Page page) { ! setPage (page); ! setCursor (new Cursor (page, 0)); ! setNodeFactory (this); } *************** *** 78,82 **** * @param text The text to parse. */ ! public Lexer (String text) throws ParserException { this (new Page (text)); --- 91,95 ---- * @param text The text to parse. */ ! 
public Lexer (String text) { this (new Page (text)); *************** *** 93,96 **** --- 106,120 ---- /** + * Reset the lexer to start parsing from the beginning again. + * The underlying components are reset such that the next call to + * <code>nextNode()</code> will return the first lexeme on the page. + */ + public void reset () + { + getPage ().reset (); + setCursor (new Cursor (getPage (), 0)); + } + + /** * Get the page this lexer is working on. * @return The page that nodes are being read from. *************** *** 102,105 **** --- 126,211 ---- /** + * Set the page this lexer is working on. + * @return The page that nodes will be read from. + */ + public void setPage (Page page) + { + if (null == page) + throw new IllegalArgumentException ("page cannot be null"); + // todo: sanity checks + mPage = page; + } + + /** + * Get the current scanning position. + * @return The lexer's cursor position. + */ + public Cursor getCursor () + { + return (mCursor); + } + + /** + * Set the current scanning position. + * @param cursor The lexer's new cursor position. + */ + public void setCursor (Cursor cursor) + { + if (null == cursor) + throw new IllegalArgumentException ("cursor cannot be null"); + // todo: sanity checks + mCursor = cursor; + } + + /** + * Get the current node factory. + * @return The lexer's node factory. + */ + public NodeFactory getNodeFactory () + { + return (mFactory); + } + + /** + * Get the current node factory. + * @return The lexer's cursor position. + */ + public void setNodeFactory (NodeFactory factory) + { + if (null == factory) + throw new IllegalArgumentException ("node factory cannot be null"); + mFactory = factory; + } + + public int getPosition () + { + return (getCursor ().getPosition ()); + } + + public void setPosition (int position) + { + // todo: sanity checks + getCursor ().setPosition (position); + } + + /** + * Get the current line number. + * @return The line number the lexer's working on. 
+ */ + public int getCurrentLineNumber () + { + return (getPage ().row (getCursor ())); + } + + /** + * Get the current line. + * @return The string the lexer's working on. + */ + public String getCurrentLine () + { + return (getPage ().getLine (getCursor ())); + } + + /** * Get the next node from the source. * @return A RemarkNode, StringNode or TagNode, or <code>null</code> if no *************** *** 125,144 **** ch = mPage.getCharacter (probe); if (0 == ch) ! ret = parseString (); else if ('/' == ch || '%' == ch || Character.isLetter (ch)) ! ret = parseTag (); else if ('!' == ch) { ch = mPage.getCharacter (probe); ! if ('-' == ch) ! ret = parseRemark (); else ! ret = parseTag (); } else ! ret = parseString (); break; default: ! ret = parseString (); break; } --- 231,262 ---- ch = mPage.getCharacter (probe); if (0 == ch) ! ret = makeString (probe); else if ('/' == ch || '%' == ch || Character.isLetter (ch)) ! { ! probe.retreat (); ! ret = parseTag (probe); ! } else if ('!' == ch) { ch = mPage.getCharacter (probe); ! if (0 == ch) ! ret = makeString (probe); else ! { ! probe.retreat (); // remark and tag need this character ! if ('-' == ch) ! ret = parseRemark (probe); ! else ! { ! probe.retreat (); // tag needs the previous one too ! ret = parseTag (probe); ! } ! } } else ! ret = parseString (probe); break; default: ! ret = parseString (probe); break; } *************** *** 153,161 **** * case <code>null</code> is returned. */ ! protected Node parseString () throws ParserException { - Cursor cursor; boolean done; char ch; --- 271,278 ---- * case <code>null</code> is returned. */ ! protected Node parseString (Cursor cursor) throws ParserException { boolean done; char ch; *************** *** 163,169 **** int begin; int end; ! StringNode ret; - cursor = mCursor.dup (); done = false; while (!done) --- 280,285 ---- int begin; int end; ! 
Node ret; done = false; while (!done) *************** *** 191,194 **** --- 307,326 ---- } } + + return (makeString (cursor)); + } + + /** + * Create a string node based on the current cursor and the one provided. + */ + protected Node makeString (Cursor cursor) + throws + ParserException + { + int length; + int begin; + int end; + Node ret; + begin = mCursor.getPosition (); end = cursor.getPosition (); *************** *** 196,205 **** if (0 != length) { // got some characters - ret = new StringNode (mPage, begin, end); mCursor = cursor; } else ret = null; ! return (ret); } --- 328,337 ---- if (0 != length) { // got some characters mCursor = cursor; + ret = getNodeFactory ().createStringNode (this, begin, end); } else ret = null; ! return (ret); } *************** *** 300,308 **** * The first slot is for attribute name (kind of like a standalone attribute). */ ! protected Node parseTag () throws ParserException { - Cursor cursor; boolean done; char ch; --- 432,439 ---- * The first slot is for attribute name (kind of like a standalone attribute). */ ! protected Node parseTag (Cursor cursor) throws ParserException { boolean done; char ch; *************** *** 310,321 **** int[] bookmarks; Vector attributes; - int length; - TagNode ret; - cursor = mCursor.dup (); - // sanity check - ch = mPage.getCharacter (cursor); - if ('<' != ch) - return (parseString ()); done = false; attributes = new Vector (); --- 441,445 ---- *************** *** 418,429 **** } } ! length = cursor.getPosition () - mCursor.getPosition (); if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (parseString ()); ! ret = new TagNode (mPage, mCursor.getPosition (), cursor.getPosition (), attributes); mCursor = cursor; } else --- 542,571 ---- } } ! ! return (makeTag (cursor, attributes)); ! } ! ! /** ! * Create a tag node based on the current cursor and the one provided. ! */ ! 
protected Node makeTag (Cursor cursor, Vector attributes) ! throws ! ParserException ! { ! int length; ! int begin; ! int end; ! Node ret; ! ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (cursor)); mCursor = cursor; + ret = getNodeFactory ().createTagNode (this, begin, end, attributes); } else *************** *** 471,493 **** * We allow terminators like --!> even though this isn't part of the spec. */ ! protected Node parseRemark () throws ParserException { - Cursor cursor; boolean done; char ch; int state; - int length; - RemarkNode ret; - cursor = mCursor.dup (); - // sanity check - ch = mPage.getCharacter (cursor); - if ('<' != ch) - return (parseString ()); - ch = mPage.getCharacter (cursor); - if ('!' != ch) - return (parseString ()); done = false; state = 0; --- 613,624 ---- * We allow terminators like --!> even though this isn't part of the spec. */ ! protected Node parseRemark (Cursor cursor) throws ParserException { boolean done; char ch; int state; done = false; state = 0; *************** *** 501,505 **** state = 1; else ! return (parseString ()); break; case 1: // prior to the second open delimiter --- 632,636 ---- state = 1; else ! return (parseString (cursor)); break; case 1: // prior to the second open delimiter *************** *** 507,515 **** state = 2; else ! return (parseString ()); break; case 2: // prior to the first closing delimiter if ('-' == ch) state = 3; break; case 3: // prior to the second closing delimiter --- 638,648 ---- state = 2; else ! return (parseString (cursor)); break; case 2: // prior to the first closing delimiter if ('-' == ch) state = 3; + else if (0 == ch) + return (parseString (cursor)); // no terminator break; case 3: // prior to the second closing delimiter *************** *** 533,555 **** } } ! 
length = cursor.getPosition () - mCursor.getPosition (); if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (parseString ()); ! ret = new RemarkNode (mPage, mCursor.getPosition (), cursor.getPosition ()); mCursor = cursor; } else ret = null; ! return (ret); } /** * Mainline for command line operation */ ! public static void main (String[] args) throws IOException, ParserException { URL url; --- 666,748 ---- } } ! ! return (makeRemark (cursor)); ! } ! ! /** ! * Create a remark node based on the current cursor and the one provided. ! */ ! protected Node makeRemark (Cursor cursor) ! throws ! ParserException ! { ! int length; ! int begin; ! int end; ! Node ret; ! ! begin = mCursor.getPosition (); ! end = cursor.getPosition (); ! length = end - begin; if (0 != length) { // return tag based on second character, '/', '%', Letter (ch), '!' if (2 > length) // this is an error ! return (makeString (cursor)); mCursor = cursor; + ret = getNodeFactory ().createRemarkNode (this, begin, end); } else ret = null; ! return (ret); } + // + // NodeFactory interface + // + + /** + * Create a new string node. + * @param lexer The lexer parsing this string. + * @param start The beginning position of the string. + * @param end The ending positiong of the string. + */ + public Node createStringNode (Lexer lexer, int start, int end) + { + return (new StringNode (lexer.getPage (), start, end)); + } + + /** + * Create a new remark node. + * @param lexer The lexer parsing this remark. + * @param start The beginning position of the remark. + * @param end The ending positiong of the remark. + */ + public Node createRemarkNode (Lexer lexer, int start, int end) + { + return (new RemarkNode (lexer.getPage (), start, end)); + } + + /** + * Create a new tag node. + * @param lexer The lexer parsing this tag. + * @param start The beginning position of the tag. + * @param end The ending positiong of the tag. 
+ * @param attributes The attributes contained in this tag. + */ + public Node createTagNode (Lexer lexer, int start, int end, Vector attributes) + { + return (new TagNode (lexer.getPage (), start, end, attributes)); + } + /** * Mainline for command line operation */ ! public static void main (String[] args) ! throws ! MalformedURLException, ! IOException, ! ParserException { URL url; Index: Page.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Page.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** Page.java 22 Sep 2003 02:39:59 -0000 1.15 --- Page.java 28 Sep 2003 15:33:57 -0000 1.16 *************** *** 34,37 **** --- 34,38 ---- import java.io.*; + import java.io.IOException; import java.lang.reflect.*; import java.net.*; *************** *** 55,58 **** --- 56,66 ---- /** + * The URL this page is coming from. + * Cached value of <code>getConnection().toExternalForm()</code> or + * <code>setUrl()</code>. + */ + protected String mUrl; + + /** * The source of characters. */ *************** *** 63,71 **** */ protected PageIndex mIndex; /** * Messages for page not there (404). */ ! static private String[] mFourOhFour = { "The web site you seek cannot be located, but countless more exist", --- 71,84 ---- */ protected PageIndex mIndex; + + /** + * The connection this page is coming from or <code>null</code>. + */ + protected URLConnection mConnection; /** * Messages for page not there (404). */ ! static private final String[] mFourOhFour = { "The web site you seek cannot be located, but countless more exist", *************** *** 96,121 **** if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); ! try ! { ! connection.connect (); ! } ! catch (UnknownHostException uhe) ! { ! int message = (int)(Math.random () * mFourOhFour.length); ! throw new ParserException (mFourOhFour[message], uhe); ! } ! 
catch (IOException ioe) ! { ! throw new ParserException (ioe.getMessage (), ioe); ! } ! try ! { ! mSource = new Source (new Stream (connection.getInputStream ()), getCharacterSet (connection)); ! } ! catch (IOException ioe) ! { ! throw new ParserException (ioe.getMessage (), ioe); ! } ! mIndex = new PageIndex (this); } --- 109,113 ---- if (null == connection) throw new IllegalArgumentException ("connection cannot be null"); ! setConnection (connection); } *************** *** 137,143 **** mSource = new Source (stream, charset); mIndex = new PageIndex (this); } ! public Page (String text) throws ParserException { InputStream stream; --- 129,137 ---- mSource = new Source (stream, charset); mIndex = new PageIndex (this); + mConnection = null; + mUrl = null; } ! public Page (String text) { InputStream stream; *************** *** 153,158 **** catch (UnsupportedEncodingException uee) { ! throw new ParserException ("problem making a page", uee); } } --- 147,264 ---- catch (UnsupportedEncodingException uee) { ! // this is unlikely, so we cover it up with a runtime exception ! throw new IllegalStateException (uee.getMessage ()); } + mConnection = null; + mUrl = null; + } + + /** + * Reset the page by resetting the source of characters. + */ + public void reset () + { + getSource ().reset (); + mIndex = new PageIndex (this); // todo: is this really necessary? + } + + /** + * Get the connection, if any. + * @return The connection object for this page, or null if this page + * is built from a stream or a string. + */ + public URLConnection getConnection () + { + return (mConnection); + } + + /** + * Set the URLConnection to be used by this page. + * @param connection The connection to use. + * It will be connected by this method. + * @exception ParserException If the <code>connect()</code> method fails, + * or an I/O error occurs opening the input stream or the character set + * designated in the HTTP header is unsupported. 
+ */ + public void setConnection (URLConnection connection) + throws + ParserException + { + Stream stream; + String charset; + + + mUrl = null; + mConnection = connection; + try + { + getConnection ().connect (); + } + catch (UnknownHostException uhe) + { + int message = (int)(Math.random () * mFourOhFour.length); + throw new ParserException (mFourOhFour[message], uhe); + } + catch (IOException ioe) + { + throw new ParserException (ioe.getMessage (), ioe); + } + charset = getCharacterSet (); + try + { + stream = new Stream (getConnection ().getInputStream ()); + try + { + mSource = new Source (stream, charset); + } + catch (UnsupportedEncodingException uee) + { + StringBuffer msg; + String message; + + msg = new StringBuffer (1024); + msg.append (getConnection ().getURL ().toExternalForm ()); + msg.append (" has an encoding ("); + msg.append (charset); + msg.append (") which is not supported, using "); + msg.append (DEFAULT_CHARSET); + System.out.println (msg.toString ()); + charset = DEFAULT_CHARSET; + mSource = new Source (stream, charset); + } + } + catch (IOException ioe) + { + throw new ParserException (ioe.getMessage (), ioe); + } + mIndex = new PageIndex (this); + } + + /** + * Get the URL for this page. + * @return The url for the connection, or <code>null</code> if there is none. + */ + public String getUrl () + { + URLConnection connection; + if (null == mUrl) + { + connection = getConnection (); + if (null != connection) + mUrl = connection.getURL ().toExternalForm (); + } + + return (mUrl); + } + + /** + * Set the URL for this page. + * This doesn't affect the contents of the page, just the interpretation + * of relative links from this point forward. + * @param url The new URL. + */ + public void setUrl (String url) + { + mUrl = url; } *************** *** 260,277 **** /** * Try and extract the character set from the HTTP header. - * @param connection The connection with the charset info. * @return The character set name to use for this HTML page. */ ! 
protected String getCharacterSet (URLConnection connection) { final String CONTENT_TYPE_STRING = "Content-Type"; ! String string; String ret; ret = DEFAULT_CHARSET; ! string = connection.getHeaderField (CONTENT_TYPE_STRING); ! if (null != string) ! ret = getCharset (string); return (ret); --- 366,386 ---- /** * Try and extract the character set from the HTTP header. * @return The character set name to use for this HTML page. */ ! public String getCharacterSet () { final String CONTENT_TYPE_STRING = "Content-Type"; ! URLConnection connection; String string; String ret; ret = DEFAULT_CHARSET; ! connection = getConnection (); ! if (null != connection) ! { ! string = connection.getHeaderField (CONTENT_TYPE_STRING); ! if (null != string) ! ret = getCharset (string); ! } return (ret); *************** *** 302,306 **** * @see #DEFAULT_CHARSET */ ! protected String getCharset (String content) { final String CHARSET_STRING = "charset"; --- 411,415 ---- * @see #DEFAULT_CHARSET */ ! public String getCharset (String content) { final String CHARSET_STRING = "charset"; *************** *** 408,411 **** --- 517,598 ---- /** + * Get the current encoding being used. + * @return The encoding used to convert characters. + */ + public String getEncoding () + { + return (mSource.getEncoding ()); + } + + /** + * Try and extract the character set from the HTTP header. + * @param connection The connection with the charset info. + * @return The character set name to use for this HTML page. 
+ */ + public void setEncoding (String character_set) + throws + ParserException + { + InputStream stream; + + stream = getSource ().getStream (); + try + { + stream.reset (); + mIndex = new PageIndex (this); + mSource = new Source (stream, character_set); + } + catch (IOException ioe) + { + throw new ParserException (ioe.getMessage (), ioe); + } + + // code from Parser: + + // /* If there is no connection (getConnection() returns null) it simply sets + // * the character set name stored in the parser (Note: the lexer object + // * which must have been set in the constructor or by <code>setLexer()</code>, + // * may or may not be using this character set). + //// * Otherwise (getConnection() doesn't return null) it does this by reopening the + //// * input stream of the connection and creating a reader that uses this + //// * character set. In this case, this method sets two of the fields in the + //// * parser object; <code>character_set</code> and <code>reader</code>. + //// * It does not adjust <code>resourceLocn</code>, <code>url_conn</code>, + //// * <code>scanners</code> or <code>feedback</code>. The two fields are set + //// * atomicly by this method, either they are both set or none of them is set. + //// * Trying to set the encoding to null or an empty string is a noop. 
+ //// * @exception ParserException If the opening of the reader + // */ + // String chs; + // BufferedInputStream in; + // + // if ((null != encoding) && !"".equals (encoding)) + // if (null == getConnection ()) + // character_set = encoding; + // else + // { + // chs = getEncoding (); + // in = input; + // try + // { + // character_set = encoding; + // if (null != getLexer ()) + // getLexer ().getPage ().setCharset (encoding); + // } + // catch (IOException ioe) + // { + // String msg = "setEncoding() : Error in opening a connection to " + getConnection ().getURL ().toExternalForm (); + // ParserException ex = new ParserException (msg, ioe); + // feedback.error (msg, ex); + // character_set = chs; + // input = in; + // throw ex; + // } + // } + // } + // + } + + /** * Get the line number for a cursor. * @param cursor The character offset into the page. *************** *** 418,421 **** --- 605,618 ---- /** + * Get the line number for a cursor. + * @param position The character offset into the page. + * @return The line number the character is in. + */ + public int row (int position) + { + return (mIndex.row (position)); + } + + /** * Get the column number for a cursor. * @param cursor The character offset into the page. *************** *** 428,431 **** --- 625,638 ---- /** + * Get the column number for a cursor. + * @param position The character offset into the page. + * @return The character offset into the line this cursor is on. + */ + public int column (int position) + { + return (mIndex.column (position)); + } + + /** * Get the text identified by the given limits. * @param start The starting position, zero based. *************** *** 494,496 **** --- 701,858 ---- getText (buffer, 0, mSource.mOffset); } + + /** + * Get the text line the position of the cursor lies on. + * @param cursor The position to calculate for. + * @return The contents of the URL or file corresponding to the line number + * containg the cursor position. 
+ */ + public String getLine (Cursor cursor) + { + int line; + int start; + int end; + + line = row (cursor); + start = mIndex.elementAt (line); + line++; + end = mIndex.last (); + if (end <= line) + end = mIndex.elementAt (end); + else + end = mSource.mOffset; + return (getText (start, end)); + } + + // todo refactor into common code method: + + /** + * Get the text line the position of the cursor lies on. + * @param cursor The position to calculate for. + * @return The contents of the URL or file corresponding to the line number + * containg the cursor position. + */ + public String getLine (int position) + { + int line; + int start; + int end; + + line = row (position); + start = mIndex.elementAt (line); + line++; + end = mIndex.last (); + if (end <= line) + end = mIndex.elementAt (end); + else + end = mSource.mOffset; + return (getText (start, end)); + } } + + // /** + // * The default charset. + // * This should be <code>ISO-8859-1</code>, + // * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 + // * Another alias is "8859_1". + // */ + // protected static final String DEFAULT_CHARSET = "ISO-8859-1"; + // + // /** + // * Trigger for charset detection. + // */ + // protected static final String CHARSET_STRING = "charset"; + // + // + // /** + // * Try and extract the character set from the HTTP header. + // * @param connection The connection with the charset info. + // * @return The character set name to use for this HTML page. + // */ + // protected String getCharacterSet (URLConnection connection) + // { + // final String field = "Content-Type"; + // + // String string; + // String ret; + // + // ret = DEFAULT_CHARSET; + // string = connection.getHeaderField (field); + // if (null != string) + // ret = getCharset (string); + // + // return (ret); + // } + // + // /** + // * Get a CharacterSet name corresponding to a charset parameter. 
+ // * @param content A text line of the form: + // * <pre> + // * text/html; charset=Shift_JIS + // * </pre> + // * which is applicable both to the HTTP header field Content-Type and + // * the meta tag http-equiv="Content-Type". + // * Note this method also handles non-compliant quoted charset directives such as: + // * <pre> + // * text/html; charset="UTF-8" + // * </pre> + // * and + // * <pre> + // * text/html; charset='UTF-8' + // * </pre> + // * @return The character set name to use when reading the input stream. + // * For JDKs that have the Charset class this is qualified by passing + // * the name to findCharset() to render it into canonical form. + // * If the charset parameter is not found in the given string, the default + // * character set is returned. + // * @see ParserHelper#findCharset + // * @see #DEFAULT_CHARSET + // */ + // protected String getCharset(String content) + // { + // int index; + // String ret; + // + // ret = DEFAULT_CHARSET; + // if (null != content) + // { + // index = content.indexOf(CHARSET_STRING); + // + // if (index != -1) + // { + // content = content.substring(index + CHARSET_STRING.length()).trim(); + // if (content.startsWith("=")) + // { + // content = content.substring(1).trim(); + // index = content.indexOf(";"); + // if (index != -1) + // content = content.substring(0, index); + // + // //remove any double quotes from around charset string + // if (content.startsWith ("\"") && content.endsWith ("\"") && (1 < content.length ())) + // content = content.substring (1, content.length () - 1); + // + // //remove any single quote from around charset string + // if (content.startsWith ("'") && content.endsWith ("'") && (1 < content.length ())) + // content = content.substring (1, content.length () - 1); + // + // ret = ParserHelper.findCharset(content, ret); + // // Charset names are not case-sensitive; + // // that is, case is always ignored when comparing charset names. 
+ // if (!ret.equalsIgnoreCase(content)) + // { + // feedback.info ( + // "detected charset \"" + // + content + // + "\", using \"" + // + ret + // + "\""); + // } + // } + // } + // } + // + // return (ret); + // } + // + Index: Source.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/Source.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** Source.java 22 Sep 2003 02:39:59 -0000 1.9 --- Source.java 28 Sep 2003 15:33:57 -0000 1.10 *************** *** 64,67 **** --- 64,72 ---- /** + * The character set in use. + */ + protected String mEncoding; + + /** * The converter from bytes to characters. */ *************** *** 123,129 **** --- 128,140 ---- mStream = stream; if (null == charset) + { mReader = new InputStreamReader (stream); + mEncoding = mReader.getEncoding (); + } else + { + mEncoding = charset; mReader = new InputStreamReader (stream, charset); + } mBuffer = new char[buffer_size]; mLevel = 0; *************** *** 133,136 **** --- 144,165 ---- /** + * Get the input stream being used. + * @return The current input stream. + */ + public InputStream getStream () + { + return (mStream); + } + + /** + * Get the encoding being used to convert characters. + * @return The current encoding. + */ + public String getEncoding () + { + return (mEncoding); + } + + /** * Fetch more characters from the underlying reader. * Has no effect if the underlying reader has been drained. *************** *** 279,297 **** /** ! * Reset the stream. If the stream has been marked, then attempt to ! * reposition it at the mark. If the stream has not been marked, then ! * attempt to reset it in some way appropriate to the particular stream, ! * for example by repositioning it to its starting point. Not all ! * character-input streams support the reset() operation, and some support ! * reset() without supporting mark(). ! 
* @exception IOException If the stream has not been marked, ! * or if the mark has been invalidated, ! * or if the stream does not support reset(), ! * or if some other I/O error occurs */ ! public void reset () throws IOException { if (null == mStream) // mStream goes null on close() ! throw new IOException ("reader is closed"); if (-1 != mMark) mOffset = mMark; --- 308,319 ---- /** ! * Reset the source. ! * Repositions the read point to begin at zero. ! * @exception IllegalStateException If the source has been closed. */ ! public void reset () { if (null == mStream) // mStream goes null on close() ! throw new IllegalStateException ("source is closed"); if (-1 != mMark) mOffset = mMark; |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv30684 Modified Files: AbstractNode.java Parser.java RemarkNode.java StringNode.java Removed Files: NodeReader.java RemarkNodeParser.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: AbstractNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/AbstractNode.java,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** AbstractNode.java 22 Sep 2003 02:39:58 -0000 1.14 --- AbstractNode.java 28 Sep 2003 15:33:57 -0000 1.15 *************** *** 156,163 **** * deep the links are embedded. */ ! public void collectInto(NodeList collectionList, Class nodeType) { ! if (nodeType.getName().equals(this.getClass().getName())) { collectionList.add(this); - } } --- 156,163 ---- * deep the links are embedded. */ ! public void collectInto(NodeList collectionList, Class nodeType) ! { ! 
if (nodeType.getName().equals(this.getClass().getName())) collectionList.add(this); } Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** Parser.java 22 Sep 2003 02:39:58 -0000 1.61 --- Parser.java 28 Sep 2003 15:33:57 -0000 1.62 *************** *** 44,50 **** import java.util.Hashtable; import java.util.Map; import org.htmlparser.parserHelper.ParserHelper; - import org.htmlparser.parserHelper.TagParser; import org.htmlparser.scanners.AppletScanner; import org.htmlparser.scanners.BodyScanner; --- 44,56 ---- import java.util.Hashtable; import java.util.Map; [...999 lines suppressed...] + name = ret.getTagName (); + scanner = (TagScanner)scanners.get (name); + save = getPreviousOpenScanner (); + if ((null != scanner) && scanner.evaluate (ret.getText (), save)) + { + setPreviousOpenScanner (scanner); + try + { + ret = scanner.createScannedNode (ret, lexer.getPage ().getUrl (), lexer); + } + finally + { + setPreviousOpenScanner (save); + } + } + } + + return (ret); } } Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** RemarkNode.java 22 Sep 2003 02:39:58 -0000 1.31 --- RemarkNode.java 28 Sep 2003 15:33:57 -0000 1.32 *************** *** 30,33 **** --- 30,34 ---- package org.htmlparser; + import org.htmlparser.lexer.Page; import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; *************** *** 36,73 **** * The remark tag is identified and represented by this class. */ ! public class RemarkNode extends AbstractNode { public final static String REMARK_NODE_FILTER="-r"; ! /** ! * Tag contents will have the contents of the comment tag. ! */ ! String tagContents; ! 
/** ! * The HTMLRemarkTag is constructed by providing the beginning posn, ending posn ! * and the tag contents. ! * @param nodeBegin beginning position of the tag ! * @param nodeEnd ending position of the tag ! * @param tagContents contents of the remark tag ! */ ! public RemarkNode(int nodeBegin, int nodeEnd, String tagContents) ! { ! super(nodeBegin,nodeEnd); ! this.tagContents = tagContents; ! } /** ! * Returns the text contents of the comment tag. */ ! public String getText() { ! return tagContents; ! } ! public String toPlainTextString() { ! return tagContents; ! } ! public String toHtml() { ! return "<!--"+tagContents+"-->"; } /** * Print the contents of the remark tag. --- 37,75 ---- * The remark tag is identified and represented by this class. */ ! public class RemarkNode ! extends ! org.htmlparser.lexer.nodes.RemarkNode { public final static String REMARK_NODE_FILTER="-r"; ! // /** ! // * Tag contents will have the contents of the comment tag. ! // */ ! // String tagContents; ! // ! // /** ! // * The HTMLRemarkTag is constructed by providing the beginning posn, ending posn ! // * and the tag contents. ! // * @param nodeBegin beginning position of the tag ! // * @param nodeEnd ending position of the tag ! // * @param tagContents contents of the remark tag ! // */ ! // public RemarkNode(int nodeBegin, int nodeEnd, String tagContents) ! // { ! // super(nodeBegin,nodeEnd); ! // this.tagContents = tagContents; ! // } /** ! * Constructor takes in the text string, beginning and ending posns. ! * @param page The page this string is on. ! * @param start The beginning position of the string. ! * @param end The ending positiong of the string. */ ! public RemarkNode (Page page, int start, int end) { ! super (page, start, end); } + /** * Print the contents of the remark tag. *************** *** 75,79 **** public String toString() { ! 
return "Comment Tag : "+tagContents+"; begins at : "+elementBegin()+"; ends at : "+elementEnd()+"\n"; } --- 77,81 ---- public String toString() { ! return "Comment Tag : "+getText()+"; begins at : "+elementBegin()+"; ends at : "+elementEnd()+"\n"; } *************** *** 82,87 **** } ! public void accept(Object visitor) { ! ((NodeVisitor)visitor).visitRemarkNode(this); } --- 84,90 ---- } ! public void accept(Object visitor) ! { ! ((NodeVisitor)visitor).visitRemarkNode (this); } Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** StringNode.java 22 Sep 2003 02:39:58 -0000 1.39 --- StringNode.java 28 Sep 2003 15:33:57 -0000 1.40 *************** *** 30,33 **** --- 30,34 ---- package org.htmlparser; + import org.htmlparser.lexer.Page; import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; *************** *** 36,48 **** * Normal text in the html document is identified and represented by this class. */ ! public class StringNode extends AbstractNode { public static final String STRING_FILTER="-string"; ! /** ! * The text of the string. ! */ ! protected StringBuffer textBuffer; ! /** * Constructor takes in the text string, beginning and ending posns. --- 37,51 ---- * Normal text in the html document is identified and represented by this class. */ ! public class StringNode ! extends ! org.htmlparser.lexer.nodes.StringNode { public static final String STRING_FILTER="-string"; ! // /** ! // * The text of the string. ! // */ ! // protected StringBuffer textBuffer; ! // /** * Constructor takes in the text string, beginning and ending posns. *************** *** 53,81 **** public StringNode (StringBuffer text, int textBegin,int textEnd) { ! super(textBegin,textEnd); ! this.textBuffer = text; } /** ! * Returns the text of the string line ! */ ! 
public String getText() { ! return textBuffer.toString(); ! } ! /** ! * Sets the string contents of the node. ! * @param text The new text for the node. */ ! public void setText(String text) { ! textBuffer = new StringBuffer (text); ! } ! ! public String toPlainTextString() { ! return textBuffer.toString(); ! } ! ! public String toHtml() { ! return textBuffer.toString(); } --- 56,71 ---- public StringNode (StringBuffer text, int textBegin,int textEnd) { ! super(new Page (text.toString ()), textBegin,textEnd); } /** ! * Constructor takes in the text string, beginning and ending posns. ! * @param page The page this string is on. ! * @param start The beginning position of the string. ! * @param end The ending positiong of the string. */ ! public StringNode (Page page, int start, int end) { ! super (page, start, end); } *************** *** 88,93 **** } ! public void accept(Object visitor) { ! ((NodeVisitor)visitor).visitStringNode(this); } } --- 78,84 ---- } ! public void accept(Object visitor) ! { ! ((NodeVisitor)visitor).visitStringNode (this); } } --- NodeReader.java DELETED --- --- RemarkNodeParser.java DELETED --- |
From: <der...@us...> - 2003-09-28 15:34:56
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1:/tmp/cvs-serv30684/lexer/nodes Modified Files: StringNode.java TagNode.java Added Files: NodeFactory.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. --- NEW FILE: NodeFactory.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2003 Derrick Oswald // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/NodeFactory.java,v $ // $Author: derrickoswald $ // $Date: 2003/09/28 15:33:58 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.lexer.nodes; import java.util.Vector; import org.htmlparser.lexer.Lexer; import org.htmlparser.Node; import org.htmlparser.util.ParserException; /** * This interface defines the methods needed to create new nodes. * The factory is used when lexing to generate the nodes passed * back to the caller. */ public interface NodeFactory { /** * Create a new string node. * @param lexer The lexer parsing this string. * @param start The beginning position of the string. * @param end The ending positiong of the string. */ public Node createStringNode (Lexer lexer, int start, int end) throws ParserException; /** * Create a new remark node. * @param lexer The lexer parsing this remark. * @param start The beginning position of the remark. * @param end The ending positiong of the remark. */ public Node createRemarkNode (Lexer lexer, int start, int end) throws ParserException; /** * Create a new tag node. * @param lexer The lexer parsing this tag. * @param start The beginning position of the tag. * @param end The ending positiong of the tag. * @param attributes The attributes contained in this tag. 
*/ public Node createTagNode (Lexer lexer, int start, int end, Vector attributes) throws ParserException; } Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/StringNode.java,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** StringNode.java 22 Sep 2003 02:39:59 -0000 1.7 --- StringNode.java 28 Sep 2003 15:33:57 -0000 1.8 *************** *** 67,79 **** public void setText (String text) { ! try ! { ! mPage = new Page (text); ! nodeBegin = 0; ! nodeEnd = text.length (); ! } ! catch (ParserException pe) ! { ! } } --- 67,73 ---- public void setText (String text) { ! mPage = new Page (text); ! nodeBegin = 0; ! nodeEnd = text.length (); } Index: TagNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/TagNode.java,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** TagNode.java 23 Sep 2003 03:41:33 -0000 1.11 --- TagNode.java 28 Sep 2003 15:33:57 -0000 1.12 *************** *** 358,362 **** public String getTagName () { ! return (getAttribute (TAGNAME).toUpperCase ()); } --- 358,368 ---- public String getTagName () { ! String ret; ! ! ret = getAttribute (TAGNAME).toUpperCase (); ! if (ret.startsWith ("/")) // end tag ! ret = ret.substring (1); ! ! return (ret); } *************** *** 483,495 **** public void setText (String text) { ! try ! { ! mPage = new Page (text); ! nodeBegin = 0; ! nodeEnd = text.length (); ! } ! catch (ParserException pe) ! { ! } } --- 489,495 ---- public void setText (String text) { ! mPage = new Page (text); ! nodeBegin = 0; ! nodeEnd = text.length (); } *************** *** 536,551 **** public String toString () { ! String tag; Cursor start; Cursor end; ! tag = getTagName (); ! if (tag.startsWith ("/")) ! tag = "End"; else ! 
tag = "Tag"; start = new Cursor (getPage (), elementBegin ()); end = new Cursor (getPage (), elementEnd ()); ! return (tag + " (" + start.toString () + "," + end.toString () + "): " + getText ()); } --- 536,550 ---- public String toString () { ! String type; Cursor start; Cursor end; ! if (isEndTag ()) ! type = "End"; else ! type = "Tag"; start = new Cursor (getPage (), elementBegin ()); end = new Cursor (getPage (), elementEnd ()); ! return (type + " (" + start.toString () + "," + end.toString () + "): " + getText ()); } *************** *** 557,561 **** public boolean breaksFlow () { ! return (breakTags.containsKey (getText ().toUpperCase ())); } --- 556,560 ---- public boolean breaksFlow () { ! return (breakTags.containsKey (getTagName ().toUpperCase ())); } *************** *** 581,600 **** } - /** - * Sometimes, a scanner may need to request a re-evaluation of the - * attributes in a tag. This may happen when there is some correction - * activity. An example of its usage can be found in ImageTag. - * <br> - * <B>Note:<B> This is an intensive task, hence call only when - * really necessary - * @return Hashtable - */ - public Hashtable redoParseAttributes () - { - mAttributes = null; - getAttributesEx (); - return (getAttributes ()); - } - public void accept (Object visitor) { --- 580,583 ---- *************** *** 621,623 **** --- 604,610 ---- } + public boolean isEndTag () + { + return ('/' == getAttribute (TAGNAME).toUpperCase ().charAt (0)); + } } |
From: <der...@us...> - 2003-09-28 15:34:56
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans In directory sc8-pr-cvs1:/tmp/cvs-serv30684/beans Modified Files: StringBean.java Log Message: Lexer Integration Removed old Parser classes. Removed EndTag, this class was replaced by a call to the new isEndTag() method on the Tag class The StringNode, RemarkNode and tags.Tag class now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: StringBean.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** StringBean.java 22 Sep 2003 02:39:58 -0000 1.27 --- StringBean.java 28 Sep 2003 15:33:57 -0000 1.28 *************** *** 36,40 **** import org.htmlparser.Parser; import org.htmlparser.StringNode; - import org.htmlparser.tags.EndTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.Tag; --- 36,39 ---- *************** *** 600,620 **** /** - * Possibly resets the state of the PRE and SCRIPT flags. - * @param end The end tag. 
- */ - public void visitEndTag (EndTag end) - { - String name; - - name = end.getTagName (); - if (name.equalsIgnoreCase ("PRE")) - mIsPre = false; - else if (name.equalsIgnoreCase ("SCRIPT")) - mIsScript = false; - } - - /** * Appends a newline to the output if the tag breaks flow, and * possibly sets the state of the PRE and SCRIPT flags. */ public void visitTag (Tag tag) --- 599,606 ---- /** * Appends a newline to the output if the tag breaks flow, and * possibly sets the state of the PRE and SCRIPT flags. + * Possibly resets the state of the PRE and SCRIPT flags if it's + * an end tag. */ public void visitTag (Tag tag) *************** *** 623,632 **** name = tag.getTagName (); ! if (name.equalsIgnoreCase ("PRE")) ! mIsPre = true; ! else if (name.equalsIgnoreCase ("SCRIPT")) ! mIsScript = true; ! if (tag.breaksFlow ()) ! carriage_return (); } --- 609,628 ---- name = tag.getTagName (); ! if (tag.isEndTag ()) ! { ! if (name.equalsIgnoreCase ("/PRE")) ! mIsPre = false; ! else if (name.equalsIgnoreCase ("/SCRIPT")) ! mIsScript = false; ! } ! else ! { ! if (name.equalsIgnoreCase ("PRE")) ! mIsPre = true; ! else if (name.equalsIgnoreCase ("SCRIPT")) ! mIsScript = true; ! if (tag.breaksFlow ()) ! carriage_return (); ! } } |
From: <der...@us...> - 2003-09-23 03:41:41
|
Update of /cvsroot/htmlparser/htmlparser/resources In directory sc8-pr-cvs1:/tmp/cvs-serv11902/resources Added Files: cvs2cl.pl htmlparser_checks.xml java.header Removed Files: fit.jar lexer runCrawler.bat runLexer.bat runParser.bat runRipper.bat runThumbelina.bat thumbelina Log Message: Distribution cleanup. - Removed duplicate documentation files from src.zip. - Jars are now built in lib, and stay there, rather than being deleted in the clean task. *** NOTE *** No more release directory. - Added checkstyle-all-3.1.jar to the lib directory, so others can run it too. - Moved executable scripts from resources to a new bin directory so they can be executed in a development environment. - Moved fit.jar from resources to the lib directory. This left the resources directory empty, but... - Moved cvs2cl and checkstyle files into the resources directory. - Eliminated staging of source files and release files just to construct a zip. These are now aggregated by their respective zip tasks. - Changed name of changeLog task to changelog. - Fixed a few javadoc warnings. - Removed the spurious 'run' from the front of all the names of the DOS batch files. The only files that aren't shipped now are the results, specs and .ssh directory, (whatever they are), and the development environment is identical to the unpacked zips except for maybe the built directories (distribution, javadocs). --- NEW FILE: cvs2cl.pl --- #!/bin/sh exec perl -w -x $0 ${1+"$@"} # -*- mode: perl; perl-indent-level: 2; -*- #!perl -w ############################################################## ### ### ### cvs2cl.pl: produce ChangeLog(s) from `cvs log` output. ### ### ### ############################################################## ## $Revision: 1.1 $ ## $Date: 2003/09/23 03:41:34 $ ## $Author: derrickoswald $ ## ## (C) 2001,2002,2003 Martyn J. Pearce <fl...@cp...>, under the GNU GPL. ## (C) 1999 Karl Fogel <kf...@re...>, under the GNU GPL. ## ## (Extensively hacked on by Melissa O'Neill <on...@cs...>.) 
[...2298 lines suppressed...] Anyway, rather than fix this in Text::Wrap, we might as well write a new wrap() which has the following much-needed features: * initial indentation, like current Text::Wrap() * subsequent line indentation, like current Text::Wrap() * user chooses among: force-break long words, leave them alone, or die()? * preserve existing indentation: chopped chunks from an indented line are indented by same (like this line, not counting the asterisk!) * optional list of things to preserve on line starts, default ">" Note that the last two are essentially the same concept, so unify in implementation and give a good interface to controlling them. And how about: Optionally, when encounter a line pre-indented by same as previous line, then strip the newline and refill, but indent by the same. Yeah... --- NEW FILE: htmlparser_checks.xml --- <?xml version="1.0"?> <!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.1//EN" "http://www.puppycrawl.com/dtds/configuration_1_1.dtd"> <!-- Checkstyle configuration that checks the sun coding conventions from: - the Java Language Specification at http://java.sun.com/docs/books/jls/second_edition/html/index.html - the Sun Code Conventions at http://java.sun.com/docs/codeconv/ - the Javadoc guidelines at http://java.sun.com/j2se/javadoc/writingdoccomments/index.html - the JDK Api documentation http://java.sun.com/j2se/docs/api/index.html - some best practices Checkstyle is very configurable. Be sure to read the documentation at http://checkstyle.sf.net (or in your downloaded distribution). Most Checks are configurable, be sure to consult the documentation. To completely disable a check, just comment it out or delete it from the file. Finally, it is worth reading the documentation. --> <module name="Checker"> <!-- Checks that a package.html file exists for each package. 
--> <!-- See http://checkstyle.sf.net/config_javadoc.html#PackageHtml --> <module name="PackageHtml"/> <!-- Checks whether files end with a new line. --> <!-- See http://checkstyle.sf.net/config_misc.html#NewlineAtEndOfFile --> <module name="NewlineAtEndOfFile"/> <!-- Checks that property files contain the same keys. --> <!-- See http://checkstyle.sf.net/config_misc.html#Translation --> <module name="Translation"/> <module name="TreeWalker"> <!-- Checks for Javadoc comments. --> <!-- See http://checkstyle.sf.net/config_javadoc.html --> <module name="JavadocMethod"> <property name="allowUndeclaredRTE" value="true"/> </module> <module name="JavadocType"/> <module name="JavadocVariable"/> <!-- Checks for Naming Conventions. --> <!-- See http://checkstyle.sf.net/config_naming.html --> <module name="ConstantName"/> <module name="LocalFinalVariableName"/> <module name="LocalVariableName"/> <module name="MemberName"/> <module name="MethodName"/> <module name="PackageName"/> <module name="ParameterName"/> <module name="StaticVariableName"/> <module name="TypeName"/> <!-- Checks for Headers --> <!-- See http://checkstyle.sf.net/config_header.html --> <module name="RegexpHeader"> <!-- The follow property value demonstrates the ability --> <!-- to have access to ANT properties. In this case it uses --> <!-- the ${basedir} property to allow Checkstyle to be run --> <!-- from any directory within a project. --> <property name="headerFile" value="${basedir}/resources/java.header"/> </module> <!-- Following interprets the header file as regular expressions. --> <!-- <module name="RegexpHeader"/> --> <!-- Checks for imports --> <!-- See http://checkstyle.sf.net/config_import.html --> <module name="AvoidStarImport"/> <module name="IllegalImport"/> <!-- defaults to sun.* packages --> <module name="RedundantImport"/> <module name="UnusedImports"/> <!-- Checks for Size Violations. 
--> <!-- See http://checkstyle.sf.net/config_sizes.html --> <module name="FileLength"/> <module name="LineLength"/> <module name="MethodLength"/> <module name="ParameterNumber"/> <!-- Checks for whitespace --> <!-- See http://checkstyle.sf.net/config_whitespace.html --> <module name="EmptyForIteratorPad"/> <module name="NoWhitespaceAfter"/> <module name="NoWhitespaceBefore"/> <module name="OperatorWrap"/> <module name="ParenPad"/> <module name="TabCharacter"/> <module name="WhitespaceAfter"> <property name="tokens" value="COMMA, SEMI"/> </module> <module name="WhitespaceAround"/> <!-- Modifier Checks --> <!-- See http://checkstyle.sf.net/config_modifiers.html --> <module name="ModifierOrder"/> <module name="RedundantModifier"/> <!-- Checks for blocks. You know, those {}'s --> <!-- See http://checkstyle.sf.net/config_blocks.html --> <module name="AvoidNestedBlocks"/> <module name="EmptyBlock"/> <module name="LeftCurly"> <property name="option" value="nl"/> </module> <!-- module name="NeedBraces"/--> <module name="RightCurly"> <property name="option" value="alone"/> </module> <!-- Checks for common coding problems --> <!-- See http://checkstyle.sf.net/config_coding.html --> <module name="AvoidInlineConditionals"/> <module name="DoubleCheckedLocking"/> <!-- MY FAVOURITE --> <module name="EmptyStatement"/> <module name="EqualsHashCode"/> <module name="HiddenField"/> <module name="IllegalInstantiation"/> <module name="InnerAssignment"/> <module name="MagicNumber"/> <module name="MissingSwitchDefault"/> <!--module name="RedundantThrows"/--> <module name="SimplifyBooleanExpression"/> <module name="SimplifyBooleanReturn"/> <!-- Checks for class design --> <!-- See http://checkstyle.sf.net/config_design.html --> <!--module name="DesignForExtension"/--> <module name="FinalClass"/> <module name="HideUtilityClassConstructor"/> <module name="InterfaceIsType"/> <module name="VisibilityModifier"> <property name="protectedAllowed" value="true"/> </module> <!-- Miscellaneous other 
checks. --> <!-- See http://checkstyle.sf.net/config_misc.html --> <module name="ArrayTypeStyle"/> <module name="FinalParameters"/> <module name="GenericIllegalRegexp"> <property name="format" value="\s+$"/> <property name="message" value="Line has trailing spaces."/> </module> <module name="TodoComment"/> <module name="UpperEll"/> </module> </module> --- NEW FILE: java.header --- // HTMLParser Library \$Name: .*\$ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright \(C\) \d\d\d\d .* // // Revision Control Information // // \$Source: .*\$ // \$Author: .*\$ // \$Date: .*\$ // \$Revision: .*\$ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or \(at your option\) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // --- fit.jar DELETED --- --- lexer DELETED --- --- runCrawler.bat DELETED --- --- runLexer.bat DELETED --- --- runParser.bat DELETED --- --- runRipper.bat DELETED --- --- runThumbelina.bat DELETED --- --- thumbelina DELETED --- |
From: <der...@us...> - 2003-09-23 03:41:41
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv11902/src/org/htmlparser/tags Modified Files: AppletTag.java Log Message: Distribution cleanup. - Removed duplicate documentation files from src.zip. - Jars are now built in lib, and stay there, rather than being deleted in the clean task. *** NOTE *** No more release directory. - Added checkstyle-all-3.1.jar to the lib directory, so others can run it too. - Moved executable scripts from resources to a new bin directory so they can be executed in a development environment. - Moved fit.jar from resources to the lib directory. This left the resources directory empty, but... - Moved cvs2cl and checkstyle files into the resources directory. - Eliminated staging of source files and release files just to construct a zip. These are now aggregated by their respective zip tasks. - Changed name of changeLog task to changelog. - Fixed a few javadoc warnings. - Removed the spurious 'run' from the front of all the names of the DOS batch files. The only files that aren't shipped now are the results, specs and .ssh directory, (whatever they are), and the development environment is identical to the unpacked zips except for maybe the built directories (distribution, javadocs). Index: AppletTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/AppletTag.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** AppletTag.java 22 Sep 2003 02:40:01 -0000 1.28 --- AppletTag.java 23 Sep 2003 03:41:33 -0000 1.29 *************** *** 149,153 **** /** * Set the <code>CODE<code> attribute. ! * @param The new applet class. */ public void setAppletClass (String newAppletClass) --- 149,153 ---- /** * Set the <code>CODE<code> attribute. ! * @param newAppletClass The new applet class. 
*/ public void setAppletClass (String newAppletClass) *************** *** 158,162 **** /** * Set the enclosed <code>PARM<code> children. ! * @param The new parameters. */ public void setAppletParams (Hashtable newAppletParams) --- 158,162 ---- /** * Set the enclosed <code>PARM<code> children. ! * @param newAppletParams The new parameters. */ public void setAppletParams (Hashtable newAppletParams) *************** *** 200,204 **** /** * Set the <code>ARCHIVE<code> attribute. ! * @param The new archive file. */ public void setArchive (String newArchive) --- 200,204 ---- /** * Set the <code>ARCHIVE<code> attribute. ! * @param newArchive The new archive file. */ public void setArchive (String newArchive) *************** *** 209,213 **** /** * Set the <code>CODEBASE<code> attribute. ! * @param The new applet code base. */ public void setCodeBase (String newCodeBase) --- 209,213 ---- /** * Set the <code>CODEBASE<code> attribute. ! * @param newCodeBase The new applet code base. */ public void setCodeBase (String newCodeBase) |