[Htmlparser-cvs] htmlparser/src/org/htmlparser/tests ParserTest.java,1.42,1.43 ParserTestCase.java,1
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-09-29 21:38:48
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs1:/tmp/cvs-serv30684/tests Modified Files: ParserTest.java ParserTestCase.java Log Message: Lexer Integration. Removed old Parser classes. Removed EndTag; this class was replaced by a call to the new isEndTag() method on the Tag class. The StringNode, RemarkNode and tags.Tag classes now derive from their lexeme counterparts in lexer.nodes instead of the other way around. The beginnings of a node factory interface are included. This was added so the lexer could return 'visitable' nodes to the parser. The parser acts as its own node factory, as does the Lexer. The node count for parsing goes up in most cases because every whitespace (i.e. newline) now counts as a StringNode. This has whacked out a lot of the tests that were expecting fewer nodes or a certain type of node at a particular index. Attributes now maintain their order and case. The count of attributes also went up because whitespace is maintained within tags too. The storage in a Vector means the element 0 Attribute is actually the name of the tag, rather than having the $TAGNAME entry in a HashTable. Index: ParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** ParserTest.java 22 Sep 2003 02:40:04 -0000 1.42 --- ParserTest.java 28 Sep 2003 15:33:58 -0000 1.43 *************** *** 41,44 **** --- 41,46 ---- import org.htmlparser.Parser; import org.htmlparser.StringNode; + import org.htmlparser.lexer.Lexer; + import org.htmlparser.lexer.Page; import org.htmlparser.scanners.FormScanner; import org.htmlparser.scanners.TagScanner; *************** *** 89,93 **** throw new ParserException("You must be offline! 
This test needs you to be connected to the internet.",e); } - parser.getReader().mark(5000); Node [] node = new AbstractNode[500]; --- 91,94 ---- *************** *** 98,102 **** } int cnt = i; ! parser.getReader().reset(); // Now try getting the elements again i = 0; --- 99,103 ---- } int cnt = i; ! parser.reset (); // Now try getting the elements again i = 0; *************** *** 349,353 **** i++; } ! assertEquals("Expected nodes",12,i); } catch (Exception e) --- 350,354 ---- i++; } ! assertEquals("Expected nodes",20,i); } catch (Exception e) *************** *** 421,425 **** for (NodeIterator e = parser.elements(); e.hasMoreNodes();) nodes[i++] = e.nextNode(); ! assertEquals ("Expected nodes", 14, i); } --- 422,426 ---- for (NodeIterator e = parser.elements(); e.hasMoreNodes();) nodes[i++] = e.nextNode(); ! assertEquals ("Expected nodes", 23, i); } *************** *** 476,479 **** --- 477,481 ---- URL url; URLConnection connection; + Page page; Parser parser; String idiots = "http://users.aol.com/geinster/rej.htm"; *************** *** 485,492 **** // this little subclass just gets around normal JDK 1.4 processing // that filters out bogus character sets ! parser = new Parser () { ! protected String getCharset(String content) { int index; String ret; --- 487,495 ---- // this little subclass just gets around normal JDK 1.4 processing // that filters out bogus character sets ! page = new Page ("") { ! public String getCharset(String content) { + final String CHARSET_STRING = "charset"; int index; String ret; *************** *** 495,509 **** if (null != content) { ! index = content.indexOf(CHARSET_STRING); if (index != -1) { ! content = content.substring(index + CHARSET_STRING.length()).trim(); ! if (content.startsWith("=")) { ! content = content.substring(1).trim(); ! index = content.indexOf(";"); if (index != -1) ! content = content.substring(0, index); //remove any double quotes from around charset string --- 498,512 ---- if (null != content) { ! 
index = content.indexOf (CHARSET_STRING); if (index != -1) { ! content = content.substring (index + CHARSET_STRING.length ()).trim (); ! if (content.startsWith ("=")) { ! content = content.substring (1).trim (); ! index = content.indexOf (";"); if (index != -1) ! content = content.substring (0, index); //remove any double quotes from around charset string *************** *** 523,527 **** } }; ! parser.setConnection (connection); // must be the default assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1")); --- 526,531 ---- } }; ! page.setConnection (connection); ! parser = new Parser (new Lexer (page)); // must be the default assertTrue ("Wrong encoding", parser.getEncoding ().equals ("ISO-8859-1")); *************** *** 538,546 **** public void testNullUrl() { Parser parser; ! try { ! parser = new Parser("http://someoneexisting.com", Parser.noFeedback); assertTrue("Should have thrown an exception!",false); } ! catch (ParserException e) { } --- 542,552 ---- public void testNullUrl() { Parser parser; ! try ! { ! parser = new Parser("http://none.existant.url.org", Parser.noFeedback); assertTrue("Should have thrown an exception!",false); } ! catch (ParserException e) ! { } *************** *** 559,564 **** } ! assertEquals("Expected nodes",12,i); } public void testLinkCollection() throws ParserException { createParser( --- 565,571 ---- } ! assertEquals("Expected nodes",20,i); } + public void testLinkCollection() throws ParserException { createParser( *************** *** 619,623 **** node.collectInto(collectionList,LinkTag.class); } ! assertEquals("Size of collection vector should be 11",11,collectionList.size()); // All items in collection vector should be links for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) { --- 626,631 ---- node.collectInto(collectionList,LinkTag.class); } ! // NOTE: the link within the script is also found... this may be debatable ! 
assertEquals("Size of collection vector should be 12",12,collectionList.size()); // All items in collection vector should be links for (SimpleNodeIterator e = collectionList.elements();e.hasMoreNodes();) { *************** *** 732,736 **** { Node node = e.nextNode(); ! if (7 == i) { assertTrue ("not a tag", node instanceof Tag); --- 740,744 ---- { Node node = e.nextNode(); ! if (10 == i) { assertTrue ("not a tag", node instanceof Tag); *************** *** 739,743 **** i++; } ! assertEquals("Expected nodes",16,i); } } --- 747,751 ---- i++; } ! assertEquals("Expected nodes",21,i); } } Index: ParserTestCase.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTestCase.java,v retrieving revision 1.28 retrieving revision 1.29 diff -C2 -d -r1.28 -r1.29 *** ParserTestCase.java 22 Sep 2003 02:40:04 -0000 1.28 --- ParserTestCase.java 28 Sep 2003 15:33:58 -0000 1.29 *************** *** 37,43 **** import org.htmlparser.AbstractNode; import org.htmlparser.Node; - import org.htmlparser.NodeReader; import org.htmlparser.Parser; import org.htmlparser.StringNode; import org.htmlparser.tags.FormTag; import org.htmlparser.tags.InputTag; --- 37,44 ---- import org.htmlparser.AbstractNode; import org.htmlparser.Node; import org.htmlparser.Parser; import org.htmlparser.StringNode; + import org.htmlparser.lexer.Lexer; + import org.htmlparser.lexer.Page; import org.htmlparser.tags.FormTag; import org.htmlparser.tags.InputTag; *************** *** 54,58 **** protected Node node []; protected int nodeCount; ! protected NodeReader reader; public ParserTestCase(String name) { --- 55,59 ---- protected Node node []; protected int nodeCount; ! protected Lexer mLexer; public ParserTestCase(String name) { *************** *** 67,98 **** protected void createParser(String inputHTML) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! 
reader = new NodeReader(new BufferedReader(sr),5000); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[40]; } ! protected void createParser(String inputHTML,int numNodes) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! reader = new NodeReader(new BufferedReader(sr),5000); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } protected void createParser(String inputHTML, String url) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! reader = new NodeReader(new BufferedReader(sr),url); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[40]; } protected void createParser(String inputHTML, String url,int numNodes) { ! String testHTML = new String(inputHTML); ! StringReader sr = new StringReader(testHTML); ! reader = new NodeReader(new BufferedReader(sr),url); ! parser = new Parser(reader,new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } --- 68,94 ---- protected void createParser(String inputHTML) { ! mLexer = new Lexer (new Page (inputHTML)); ! parser = new Parser(mLexer, new DefaultParserFeedback()); node = new AbstractNode[40]; } ! protected void createParser(String inputHTML,int numNodes) ! { ! Lexer lexer = new Lexer (inputHTML); ! parser = new Parser (lexer, new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } protected void createParser(String inputHTML, String url) { ! Lexer lexer = new Lexer (inputHTML); ! lexer.getPage ().setUrl (url); ! parser = new Parser (lexer, new DefaultParserFeedback()); node = new AbstractNode[40]; } protected void createParser(String inputHTML, String url,int numNodes) { ! Lexer lexer = new Lexer (inputHTML); ! lexer.getPage ().setUrl (url); ! 
parser = new Parser (lexer, new DefaultParserFeedback()); node = new AbstractNode[numNodes]; } *************** *** 295,306 **** Tag tag = (Tag)node; if (tag.isEmptyXmlTag()) { // Add end tag ! String currLine = parser.getReader().getCurrentLine(); ! int pos = parser.getReader().getLastReadPosition(); ! currLine = ! currLine.substring(0,pos+1)+ ! "</"+tag.getTagName()+">"+ ! currLine.substring(pos+1,currLine.length()); ! parser.getReader().changeLine(currLine); } } --- 291,303 ---- Tag tag = (Tag)node; if (tag.isEmptyXmlTag()) { + // oh crap... // Add end tag ! // String currLine = parser.getReader().getCurrentLine(); ! // int pos = parser.getReader().getLastReadPosition(); ! // currLine = ! // currLine.substring(0,pos+1)+ ! // "</"+tag.getTagName()+">"+ ! // currLine.substring(pos+1,currLine.length()); ! // parser.getReader().changeLine(currLine); } } |