[Htmlparser-cvs] htmlparser/src/org/htmlparser/tests/lexerTests AllTests.java,NONE,1.1 PageTests.jav
Brought to you by:
derrickoswald
From: <der...@us...> - 2003-07-17 01:42:23
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests
In directory sc8-pr-cvs1:/tmp/cvs-serv31631/org/htmlparser/tests/lexerTests

Added Files:
    AllTests.java PageTests.java SourceTests.java StreamTests.java
Log Message:
Initial drop of new io subsystem.

--- NEW FILE: AllTests.java ---
// HTMLParser Library v1_4_20030525 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
//
// Postal Address :
// Somik Raha
// Extreme Programmer & Coach
// Industrial Logic Corporation
// 2583 Cedar Street, Berkeley,
// CA 94708, USA
// Website : http://www.industriallogic.com

package org.htmlparser.tests.lexerTests;

import junit.framework.TestCase;
import junit.framework.TestSuite;

/**
 * Collects the lexer test cases (stream, source and page tests) into a
 * single suite and offers a command line entry point that hands the suite
 * to one of the JUnit test runners.
 */
public class AllTests extends TestCase
{
    /**
     * Create the test case.
     * @param name The name passed on to the JUnit <code>TestCase</code>.
     */
    public AllTests (String name)
    {
        super (name);
    }

    /**
     * Build the suite containing all lexer tests.
     * @return A suite named "Lexer Tests" holding the tests of
     * <code>StreamTests</code>, <code>SourceTests</code> and
     * <code>PageTests</code>, in that order.
     */
    public static TestSuite suite ()
    {
        TestSuite suite = new TestSuite ("Lexer Tests");
        suite.addTestSuite (StreamTests.class);
        suite.addTestSuite (SourceTests.class);
        suite.addTestSuite (PageTests.class);
        return suite;
    }

    /**
     * Mainline for all suites of tests.
     * @param args Command line arguments. The following options
     * are understood:
     * <pre>
     * -text  -- use junit.textui.TestRunner
     * -awt   -- use junit.awtui.TestRunner
     * -swing -- use junit.swingui.TestRunner (default)
     * </pre>
     * Only the first such option is recognized and removed;
     * all other options are passed on to the junit framework.
     */
    public static void main (String[] args)
    {
        String runner;
        int i;
        String arguments[];
        Class cls;

        runner = null;
        for (i = 0; (i < args.length) && (null == runner); i++)
        {
            if (args[i].equalsIgnoreCase ("-text"))
                runner = "junit.textui.TestRunner";
            else if (args[i].equalsIgnoreCase ("-awt"))
                runner = "junit.awtui.TestRunner";
            else if (args[i].equalsIgnoreCase ("-swing"))
                runner = "junit.swingui.TestRunner";
        }
        if (null != runner)
        {
            // remove it from the arguments;
            // the loop has already stepped past the option, so it sits at i - 1
            arguments = new String[args.length - 1];
            System.arraycopy (args, 0, arguments, 0, i - 1);
            System.arraycopy (args, i, arguments, i - 1, args.length - i);
            args = arguments;
        }
        else
            runner = "junit.swingui.TestRunner";

        /*
         * from http://www.mail-archive.com/commons-user%40jakarta.apache.org/msg02958.html
         *
         * The problem is within the UI test runners of JUnit. They bring
         * with them a custom classloader, which causes the
         * LogConfigurationException. Unfortunately Log4j doesn't work
         * either.
         *
         * Solution: Disable "Reload classes every run" or start JUnit with
         * command line option -noloading before the name of the Testsuite.
         */

        // append the test class
        arguments = new String[args.length + 2];
        System.arraycopy (args, 0, arguments, 0, args.length);
        arguments[arguments.length - 2] = "-noloading";
        arguments[arguments.length - 1] = "org.htmlparser.tests.lexerTests.AllTests";

        // invoke main() of the test runner
        try
        {
            cls = Class.forName (runner);
            java.lang.reflect.Method method = cls.getDeclaredMethod (
                "main", new Class[] { String[].class });
            method.invoke (
                null,
                new Object[] { arguments });
        }
        catch (java.lang.reflect.InvocationTargetException ite)
        {
            // the runner itself failed: the wrapper's own message is null,
            // so report the exception that was actually thrown
            Throwable cause = ite.getTargetException ();
            if (null == cause)
                cause = ite;
            System.err.println (
                "cannot run unit test (" + cause + ")");
            cause.printStackTrace ();
        }
        catch (Throwable t)
        {
            // use toString() so the exception type is shown, e.g. a bare
            // class name from ClassNotFoundException is otherwise cryptic
            System.err.println (
                "cannot run unit test (" + t + ")");
        }
    }
}

--- NEW FILE: PageTests.java ---
// HTMLParser Library v1_4_20030525 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
// // Postal Address : // Somik Raha // Extreme Programmer & Coach // Industrial Logic Corporation // 2583 Cedar Street, Berkeley, // CA 94708, USA // Website : http://www.industriallogic.com package org.htmlparser.tests.lexerTests; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; import junit.framework.TestCase; import org.htmlparser.lexer.Page; public class PageTests extends TestCase { /** * The default charset. * This should be <code>ISO-8859-1</code>, * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1 * Another alias is "8859_1". */ public static final String DEFAULT_CHARSET = "ISO-8859-1"; /** * Test the third level page class. */ public PageTests (String name) { super (name); } /** * Test initialization with a null value. */ public void testNull () throws IOException, UnsupportedEncodingException { Page page; try { page = new Page (null); assertTrue ("null value in constructor", false); } catch (IllegalArgumentException iae) { // expected outcome } } /** * Test initialization with a real value. */ public void testURLConnection () throws IOException, UnsupportedEncodingException { String link; URL url; Page page; link = "http://www.ibm.com/jp/"; url = new URL (link); page = new Page (url.openConnection ()); } } --- NEW FILE: SourceTests.java --- // HTMLParser Library v1_4_20030525 - A java-based parser for HTML // Copyright (C) Dec 31, 2000 Somik Raha // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
// See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
//
// Postal Address :
// Somik Raha
// Extreme Programmer & Coach
// Industrial Logic Corporation
// 2583 Cedar Street, Berkeley,
// CA 94708, USA
// Website : http://www.industriallogic.com

package org.htmlparser.tests.lexerTests;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Random;

import junit.framework.TestCase;

import org.htmlparser.lexer.Stream;
import org.htmlparser.lexer.Source;

/**
 * Tests for the second level of the lexer io subsystem, the
 * <code>Source</code> class, which is constructed here on top of a
 * <code>Stream</code>.
 */
public class SourceTests extends TestCase
{
    /**
     * The default charset.
     * This should be <code>ISO-8859-1</code>,
     * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
     * Another alias is "8859_1".
     */
    public static final String DEFAULT_CHARSET = "ISO-8859-1";

    /**
     * Create the test case.
     * @param name The name passed on to the JUnit <code>TestCase</code>.
     */
    public SourceTests (String name)
    {
        super (name);
    }

    /**
     * Build a source over an in-memory byte array, passing
     * <code>null</code> as the charset name.
     * @param bytes The bytes the source is to deliver.
     * @return A source reading from a stream wrapped around the bytes.
     * @exception IOException If the source cannot be constructed.
     */
    private Source createSource (byte[] bytes) throws IOException
    {
        return (new Source (new Stream (new ByteArrayInputStream (bytes)), null));
    }

    /**
     * Test initialization with a null value.
     * Reading from such a source must report end of input at once.
     */
    public void testNull () throws IOException
    {
        Source source;

        source = new Source (null);
        assertTrue ("erroneous character", -1 == source.read ());
    }

    /**
     * Test initialization with an empty input stream and a null charset name.
     */
    public void testEmpty () throws IOException
    {
        Source source;

        source = createSource (new byte[0]);
        assertTrue ("erroneous character", -1 == source.read ());
    }

    /**
     * Test initialization with an input stream having only one byte.
     */
    public void testOneByte () throws IOException
    {
        Source source;

        source = createSource (new byte[] { (byte)0x42 });
        assertTrue ("erroneous character", 'B' == source.read ());
        assertTrue ("extra character", -1 == source.read ());
    }

    /**
     * Test close.
     * Reading after close is expected to throw an <code>IOException</code>.
     */
    public void testClose () throws IOException
    {
        Source source;

        // encode with the same charset as the other tests, not the platform default
        source = createSource ("hello word".getBytes (DEFAULT_CHARSET));
        assertTrue ("no character", -1 != source.read ());
        source.close ();
        try
        {
            source.read ();
            fail ("not closed");
        }
        catch (IOException ioe)
        {
            // expected outcome
        }
    }

    /**
     * Test reset.
     * After reading everything, a reset must replay the same text.
     */
    public void testReset () throws IOException
    {
        String reference;
        Source source;
        StringBuffer buffer;
        int c;

        reference = "Now is the time for all good men to come to the aid of the party";
        source = createSource (reference.getBytes (DEFAULT_CHARSET));
        buffer = new StringBuffer (reference.length ());
        while (-1 != (c = source.read ()))
            buffer.append ((char)c);
        assertTrue ("string incorrect", reference.equals (buffer.toString ()));
        source.reset ();
        buffer.setLength (0);
        while (-1 != (c = source.read ()))
            buffer.append ((char)c);
        assertTrue ("string incorrect", reference.equals (buffer.toString ()));
        source.close ();
    }

    /**
     * Test reset in the middle of reading.
     * The first 25 characters are kept, the source is reset, the same 25
     * characters are read again and discarded, and the remainder appended.
     */
    public void testMidReset () throws IOException
    {
        String reference;
        Source source;
        StringBuffer buffer;
        int c;

        reference = "Now is the time for all good men to come to the aid of the party";
        source = createSource (reference.getBytes (DEFAULT_CHARSET));
        buffer = new StringBuffer (reference.length ());
        for (int i = 0; i < 25; i++)
            buffer.append ((char)source.read ());
        source.reset ();
        for (int i = 0; i < 25; i++)
            source.read ();
        while (-1 != (c = source.read ()))
            buffer.append ((char)c);
        assertTrue ("string incorrect", reference.equals (buffer.toString ()));
        source.close ();
    }

    /**
     * Test mark/reset in the middle of reading.
     * A mark is set after 25 characters; 25 more are read and discarded,
     * then reset must return to the mark so the remainder completes the text.
     */
    public void testMarkReset () throws IOException
    {
        String reference;
        Source source;
        StringBuffer buffer;
        int c;

        reference = "Now is the time for all good men to come to the aid of the party";
        source = createSource (reference.getBytes (DEFAULT_CHARSET));
        assertTrue ("not markable", source.markSupported ());
        buffer = new StringBuffer (reference.length ());
        for (int i = 0; i < 25; i++)
            buffer.append ((char)source.read ());
        source.mark (88);
        for (int i = 0; i < 25; i++)
            source.read ();
        source.reset ();
        while (-1 != (c = source.read ()))
            buffer.append ((char)c);
        assertTrue ("string incorrect", reference.equals (buffer.toString ()));
        source.close ();
    }

    /**
     * Test skip.
     * The middle part of the text is skipped, so only the first and last
     * parts may show up in the result.
     */
    public void testSkip () throws IOException
    {
        String part1;
        String part2;
        String part3;
        String reference;
        Source source;
        StringBuffer buffer;
        int c;

        part1 = "Now is the time ";
        part2 = "for all good men ";
        part3 = "to come to the aid of the party";
        reference = part1 + part2 + part3;
        source = createSource (reference.getBytes (DEFAULT_CHARSET));
        buffer = new StringBuffer (reference.length ());
        for (int i = 0; i < part1.length (); i++)
            buffer.append ((char)source.read ());
        source.skip (part2.length ());
        while (-1 != (c = source.read ()))
            buffer.append ((char)c);
        assertTrue ("string incorrect", (part1 + part3).equals (buffer.toString ()));
        source.close ();
    }

    /**
     * Test multi-byte read.
     * One bulk read is expected to deliver the whole text.
     */
    public void testMultByte () throws IOException
    {
        String reference;
        Source source;
        char[] buffer;
        int length;

        reference = "Now is the time for all good men to come to the aid of the party";
        source = createSource (reference.getBytes (DEFAULT_CHARSET));
        buffer = new char[reference.length ()];
        // check the reported count too, as testPositionedMultByte does
        length = source.read (buffer, 0, buffer.length);
        assertTrue ("incorrect length", buffer.length == length);
        assertTrue ("string incorrect", reference.equals (new String (buffer)));
        assertTrue ("extra character", -1 == source.read ());
        source.close ();
    }

    /**
     * Test positioned multi-byte read.
     * The first and last parts are read character by character, the
     * middle part with a bulk read at an offset into the buffer.
     */
    public void testPositionedMultByte () throws IOException
    {
        String part1;
        String part2;
        String part3;
        String reference;
        Source source;
        char[] buffer;
        int length;

        part1 = "Now is the time ";
        part2 = "for all good men ";
        part3 = "to come to the aid of the party";
        reference = part1 + part2 + part3;
        source = createSource (reference.getBytes (DEFAULT_CHARSET));
        buffer = new char[reference.length ()];
        for (int i = 0; i < part1.length (); i++)
            buffer[i] = (char)source.read ();
        length = source.read (buffer, part1.length (), part2.length ());
        assertTrue ("incorrect length", part2.length () == length);
        length += part1.length ();
        for (int i = 0; i < part3.length (); i++)
            buffer[i + length] = (char)source.read ();
        assertTrue ("string incorrect", reference.equals (new String (buffer)));
        assertTrue ("extra character", -1 == source.read ());
        source.close ();
    }

    /**
     * Test ready.
     * The source is expected to be not ready before the first read,
     * ready while a character is pending, and not ready again at the end.
     */
    public void testReady () throws IOException
    {
        Source source;

        source = createSource (new byte[] { (byte)0x42, (byte)0x62 });
        assertTrue ("ready?", !source.ready ());
        assertTrue ("erroneous character", 'B' == source.read ());
        assertTrue ("not ready", source.ready ());
        assertTrue ("erroneous character", 'b' == source.read ());
        assertTrue ("ready?", !source.ready ());
        assertTrue ("extra character", -1 == source.read ());
    }

    /**
     * Test that the same characters are returned as with another reader.
     * The URL is fetched twice, once through an
     * <code>InputStreamReader</code> and once through a <code>Source</code>,
     * and the two character sequences are compared. Requires network access.
     */
    public void testSameChars () throws IOException
    {
        String link;
        ArrayList chars1;
        ArrayList chars2;
        URL url;
        URLConnection connection;
        InputStreamReader in;
        int c;
        Source source;
        int index;

        // pick a big file
        link = "http://sourceforge.net/projects/htmlparser/HTMLParser_Coverage.html";
        chars1 = new ArrayList ();
        chars2 = new ArrayList ();
        try
        {
            url = new URL (link);
            connection = url.openConnection ();
            connection.connect ();
            in = new InputStreamReader (new BufferedInputStream (connection.getInputStream ()), DEFAULT_CHARSET);
            try
            {
                while (-1 != (c = in.read ()))
                    chars1.add (new Character ((char)c));
            }
            finally
            {
                // release the connection even if reading fails
                in.close ();
            }

            connection = url.openConnection ();
            connection.connect ();
            source = new Source (new Stream (connection.getInputStream ()));
            try
            {
                while (-1 != (c = source.read ()))
                    chars2.add (new Character ((char)c));
            }
            finally
            {
                source.close ();
            }

            index = 0;
            while (index < chars1.size ())
            {
                // fail cleanly instead of with an IndexOutOfBoundsException
                // when the source delivered fewer characters
                assertTrue ("missing characters at position " + index, index < chars2.size ());
                assertEquals ("characters differ at position " + index, chars1.get (index), chars2.get (index));
                index++;
            }
            assertTrue ("extra characters", index == chars2.size ());
        }
        catch (MalformedURLException murle)
        {
            fail ("bad url " + link);
        }
    }
}

--- NEW FILE: StreamTests.java ---
// HTMLParser Library v1_4_20030525 - A java-based parser for HTML
// Copyright (C) Dec 31, 2000 Somik Raha
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// For any questions or suggestions, you can write to me at :
// Email :so...@in...
//
// Postal Address :
// Somik Raha
// Extreme Programmer & Coach
// Industrial Logic Corporation
// 2583 Cedar Street, Berkeley,
// CA 94708, USA
// Website : http://www.industriallogic.com

package org.htmlparser.tests.lexerTests;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Random;

import junit.framework.TestCase;

import org.htmlparser.lexer.Stream;

/**
 * Tests for the first level of the lexer io subsystem, the
 * <code>Stream</code> class.
 */
public class StreamTests extends TestCase
{
    /**
     * Create the test case.
     * @param name The name passed on to the JUnit <code>TestCase</code>.
     */
    public StreamTests (String name)
    {
        super (name);
    }

    /**
     * Read a fixed number of bytes from the stream, appending each one
     * (narrowed to a <code>Byte</code>) to the list.
     * @param stream The stream to read from.
     * @param bytes The list receiving the bytes.
     * @param count The number of reads to perform.
     * @exception IOException If a read fails.
     */
    private void readBytes (Stream stream, ArrayList bytes, int count) throws IOException
    {
        for (int i = 0; i < count; i++)
            bytes.add (new Byte ((byte)stream.read ()));
    }

    /**
     * Assert that every expected byte has an equal counterpart at the same
     * index in the actual list. A shorter actual list is reported as an
     * assertion failure rather than an <code>IndexOutOfBoundsException</code>.
     * @param expected The reference bytes.
     * @param actual The bytes under test.
     * @param offset Added to the index in failure messages.
     */
    private void assertBytesMatch (ArrayList expected, ArrayList actual, int offset)
    {
        for (int index = 0; index < expected.size (); index++)
        {
            assertTrue ("missing bytes at position " + (index + offset), index < actual.size ());
            assertEquals ("bytes differ at position " + (index + offset), expected.get (index), actual.get (index));
        }
    }

    /**
     * Sleep for the given time.
     * An interruption ends the sleep early and is re-asserted on the
     * current thread instead of being swallowed.
     * @param millis The time to sleep in milliseconds.
     */
    private void pause (int millis)
    {
        try
        {
            Thread.sleep (millis);
        }
        catch (InterruptedException ie)
        {
            Thread.currentThread ().interrupt ();
        }
    }

    /**
     * Test initialization with a null value.
     */
    public void testNull () throws IOException
    {
        Stream stream;

        stream = new Stream (null);
        assertTrue ("erroneous character", -1 == stream.read ());
    }

    /**
     * Test initialization with an empty input stream.
     */
    public void testEmpty () throws IOException
    {
        Stream stream;

        stream = new Stream (new ByteArrayInputStream (new byte[0]));
        assertTrue ("erroneous character", -1 == stream.read ());
    }

    /**
     * Test initialization with an input stream having only one byte.
     */
    public void testOneByte () throws IOException
    {
        Stream stream;

        stream = new Stream (new ByteArrayInputStream (new byte[] { (byte)0x42 }));
        assertTrue ("erroneous character", 0x42 == stream.read ());
        assertTrue ("erroneous character", -1 == stream.read ());
    }

    /**
     * Test that the same bytes are returned as with a naked input stream.
     * The URL is fetched twice, once through a
     * <code>BufferedInputStream</code> and once through a
     * <code>Stream</code>, and the two byte sequences are compared.
     * Requires network access.
     */
    public void testSameBytes () throws IOException
    {
        String link;
        ArrayList bytes1;
        ArrayList bytes2;
        URL url;
        URLConnection connection;
        BufferedInputStream in;
        int b;
        Stream stream;

        // pick a big file
        link = "http://sourceforge.net/projects/htmlparser/HTMLParser_Coverage.html";
        bytes1 = new ArrayList ();
        bytes2 = new ArrayList ();
        try
        {
            url = new URL (link);
            connection = url.openConnection ();
            connection.connect ();
            in = new BufferedInputStream (connection.getInputStream ());
            try
            {
                while (-1 != (b = in.read ()))
                    bytes1.add (new Byte ((byte)b));
            }
            finally
            {
                // release the connection even if reading fails
                in.close ();
            }

            connection = url.openConnection ();
            connection.connect ();
            stream = new Stream (connection.getInputStream ());
            try
            {
                while (-1 != (b = stream.read ()))
                    bytes2.add (new Byte ((byte)b));
            }
            finally
            {
                stream.close ();
            }

            assertBytesMatch (bytes1, bytes2, 0);
            assertTrue ("extra bytes", bytes1.size () == bytes2.size ());
        }
        catch (MalformedURLException murle)
        {
            fail ("bad url " + link);
        }
    }

    /**
     * Test that threading works and is faster than a naked input stream.
     * This, admittedly contrived, test illustrates the following principles:
     * <ul>
     * <li>the underlying network code is already multi-threaded, so there may
     * not be a need to use application level threading in most cases</li>
     * <li>results may vary based on network connection speed, JVM, and
     * especially application usage patterns</li>
     * <li>issues only show up with large files, in my case greater than
     * about 72,400 bytes, since the underlying network code reads that far
     * into the socket before throttling back and waiting</li>
     * <li>this is only applicable to TCP/IP usage, disk access would not
     * have this problem, since the cost of reading disk is much less than
     * the round-trip cost of a TCP/IP handshake</li>
     * </ul>
     * So, what does it do? It sets up to read a URL two ways, once with a
     * naked input stream, and then with the Stream class. In each case, before
     * reading, it delays about 2 seconds (for me anyway) to allow the java.net
     * implementation to read ahead and then throttle back. The threaded Stream
     * though keeps reading while this delay is going on and hence gets a big
     * chunk of the file in memory. This advantage translates to a faster
     * spin through the bytes after the delay.
     * Requires network access.
     */
    public void testThreaded () throws IOException
    {
        String link;
        URL url;
        URLConnection connection;
        BufferedInputStream in;
        int index;
        long begin;
        long elapsed;
        double bytes_per_second;
        int delay;
        Stream stream;
        long time1;
        long time2;
        Thread thread;
        long available1;
        long available2;

        // pick a big file
        link = "http://htmlparser.sourceforge.net/javadoc_1_3/index-all.html";
        try
        {
            url = new URL (link);

            // estimate the connection speed
            System.gc ();
            index = 0;
            connection = url.openConnection ();
            connection.connect ();
            in = new BufferedInputStream (connection.getInputStream ());
            begin = System.currentTimeMillis ();
            while (-1 != in.read ())
                index++;
            // a zero interval would give an infinite rate and with it an
            // Integer.MAX_VALUE delay, so count at least one millisecond
            elapsed = Math.max (1L, System.currentTimeMillis () - begin);
            bytes_per_second = 1000.0 * index / elapsed;
            in.close ();

            // NOTE(review): the delay grows with connection speed; if the
            // intent is "time to read ahead 72400 bytes" the ratio looks
            // inverted (72400 / bytes_per_second) -- confirm before changing
            delay = (int)(1.5 * 1000 * bytes_per_second / 72400); // 72400 is the throttle limit on my machine

            // try the naked input stream
            System.gc ();
            index = 0;
            available1 = 0;
            connection = url.openConnection ();
            connection.connect ();
            in = new BufferedInputStream (connection.getInputStream ());
            pause (delay);
            begin = System.currentTimeMillis ();
            do
            {
                index++;
                if (0 == index % 1000)
                    available1 += in.available ();
            }
            while (-1 != in.read ());
            time1 = System.currentTimeMillis () - begin;
            in.close ();

            // try a threaded stream
            System.gc ();
            index = 0;
            available2 = 0;
            connection = url.openConnection ();
            connection.connect ();
            int length = connection.getContentLength ();
            stream = new Stream (connection.getInputStream (), length);
            thread = new Thread (stream);
            thread.setPriority (Thread.NORM_PRIORITY - 1);
            thread.start ();
            pause (delay);
            begin = System.currentTimeMillis ();
            do
            {
                index++;
                if (0 == index % 1000)
                    available2 += stream.available ();
            }
            while (-1 != stream.read ());
            time2 = System.currentTimeMillis () - begin;
            System.out.println ("fills: " + stream.fills);
            System.out.println ("reallocations: " + stream.reallocations);
            System.out.println ("synchronous: " + stream.synchronous);
            System.out.println ("buffer size: " + stream.mBuffer.length);
            System.out.println ("bytes: " + stream.mLevel);
            stream.close ();

            System.out.println ("time (" + time2 + ") vs. (" + time1 + ") for " + index + " bytes");
            // integer division on purpose: available() was sampled once per
            // full thousand iterations, so this is the number of samples
            double samples = index / 1000;
            System.out.println ("average available bytes (" + available2/samples + ") vs. (" + available1/samples + ")");
            assertTrue ("slower (" + time2 + ") vs. (" + time1 + ")", time2 < time1);
            assertTrue ("average available bytes not greater (" + available2/samples + ") vs. (" + available1/samples + ")", available2 > available1);
        }
        catch (MalformedURLException murle)
        {
            fail ("bad url " + link);
        }
    }

    /**
     * Shared body of the mark/reset tests.
     * Reads 1000 bytes, resets without a prior mark and reads 1000 bytes
     * again, expecting the same data; then marks, reads 1000 bytes, resets
     * and reads again, again expecting the same data.
     * @param threaded If <code>true</code>, the stream is started on its
     * own thread before any byte is read.
     * @exception IOException If the connection or a read fails.
     */
    private void checkMarkReset (boolean threaded) throws IOException
    {
        String link;
        ArrayList bytes1;
        ArrayList bytes2;
        URL url;
        URLConnection connection;
        Stream stream;

        // pick a small file > 2000 bytes
        link = "http://sourceforge.net/projects/htmlparser/overview-summary.html";
        bytes1 = new ArrayList ();
        bytes2 = new ArrayList ();
        try
        {
            url = new URL (link);
            connection = url.openConnection ();
            connection.connect ();
            stream = new Stream (connection.getInputStream ());
            try
            {
                if (threaded)
                    (new Thread (stream)).start ();
                assertTrue ("mark not supported", stream.markSupported ());

                readBytes (stream, bytes1, 1000);
                stream.reset ();
                readBytes (stream, bytes2, 1000);
                assertBytesMatch (bytes1, bytes2, 0);

                bytes1.clear ();
                bytes2.clear ();

                stream.mark (1000); // the 1000 is ignored
                readBytes (stream, bytes1, 1000);
                stream.reset ();
                readBytes (stream, bytes2, 1000);
            }
            finally
            {
                // release the connection even if an assertion fails
                stream.close ();
            }
            assertBytesMatch (bytes1, bytes2, 1000);
        }
        catch (MalformedURLException murle)
        {
            fail ("bad url " + link);
        }
    }

    /**
     * Test that mark and reset work as per the contract.
     * Requires network access.
     */
    public void testMarkReset () throws IOException
    {
        checkMarkReset (false);
    }

    /**
     * Test that mark and reset work as per the contract when threaded.
     * Requires network access.
     */
    public void testMarkResetThreaded () throws IOException
    {
        checkMarkReset (true);
    }

    /**
     * Test close.
     * After close, a read is expected to report end of input even though
     * an unread byte remained.
     */
    public void testClose () throws IOException
    {
        Stream stream;

        stream = new Stream (new ByteArrayInputStream (new byte[] { (byte)0x42, (byte)0x78 }));
        assertTrue ("erroneous character", 0x42 == stream.read ());
        stream.close ();
        assertTrue ("not closed", -1 == stream.read ());
    }
}