htmlparser-cvs Mailing List for HTML Parser (Page 23)
Brought to you by:
derrickoswald
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <der...@us...> - 2004-01-19 23:14:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser/parserapplications Modified Files: SiteCapturer.java Log Message: Update version to 1.4-20040119. Index: SiteCapturer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** SiteCapturer.java 14 Jan 2004 02:53:46 -0000 1.4 --- SiteCapturer.java 19 Jan 2004 23:14:18 -0000 1.5 *************** *** 41,50 **** --- 41,55 ---- import javax.swing.JOptionPane; + import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; + import org.htmlparser.filters.AndFilter; + import org.htmlparser.filters.HasAttributeFilter; + import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.FrameTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; + import org.htmlparser.tags.MetaTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; *************** *** 53,57 **** /** * Save a web site locally. ! * Illustrative prgram to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. --- 58,62 ---- /** * Save a web site locally. ! * Illustrative program to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. *************** *** 124,127 **** --- 129,137 ---- /** + * The filter to apply to the nodes retrieved. + */ + protected NodeFilter mFilter; + + /** * Copy buffer size. * Resources are moved to disk in chunks this size or less. 
*************** *** 136,139 **** --- 146,151 ---- PrototypicalNodeFactory factory; + mSource = null; + mTarget = null; mPages = new ArrayList (); mFinished = new HashSet (); *************** *** 147,150 **** --- 159,164 ---- factory.registerTag (new LocalImageTag ()); mParser.setNodeFactory (factory); + mCaptureResources = true; + mFilter = null; } *************** *** 213,216 **** --- 227,249 ---- } + + /** Getter for property filter. + * @return Value of property filter. + * + */ + public NodeFilter getFilter () + { + return (mFilter); + } + + /** Setter for property filter. + * @param filter New value of property filter. + * + */ + public void setFilter (NodeFilter filter) + { + mFilter = filter; + } + /** * Returns <code>true</code> if the link is one we are interested in. *************** *** 281,285 **** String ret; ! if (link.equals (getSource ())) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) --- 314,318 ---- String ret; ! if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/"))) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) *************** *** 382,391 **** * Process a single page. */ ! protected void process () throws ParserException { String url; NodeList list; File file; File dir; --- 415,428 ---- * Process a single page. */ ! protected void process (NodeFilter filter) throws ParserException { String url; + int bookmark; NodeList list; + NodeList robots; + MetaTag robot; + String content; File file; File dir; *************** *** 398,402 **** try ! { // fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); --- 435,441 ---- try ! { ! bookmark = mPages.size (); ! 
// fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); *************** *** 404,407 **** --- 443,468 ---- list.add (e.nextNode ()); // URL conversion occurs in the tags + // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html + // <meta name="robots" content="index,follow" /> + // <meta name="robots" content="noindex,nofollow" /> + robots = list.extractAllNodesThatMatch ( + new AndFilter ( + new NodeClassFilter (MetaTag.class), + new HasAttributeFilter ("name", "robots")), true); + if (0 != robots.size ()) + { + robot = (MetaTag)robots.elementAt (0); + content = robot.getAttribute ("content").toLowerCase (); + if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow"))) + // reset mPages + for (int i = bookmark; i < mPages.size (); i++) + mPages.remove (i); + if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex"))) + return; + } + + if (null != filter) + list.keepAllNodesThatMatch (filter, true); + // save the page locally file = new File (getTarget (), makeLocalLink (url, "")); *************** *** 409,412 **** --- 470,481 ---- if (!dir.exists ()) dir.mkdirs (); + else if (!dir.isDirectory ()) + { + dir = new File (dir.getParentFile (), dir.getName () + ".content"); + if (!dir.exists ()) + dir.mkdirs (); + file = new File (dir, file.getName ()); + } + try { *************** *** 581,585 **** try { ! process (); while (0 != mImages.size ()) copy (); --- 650,654 ---- try { ! process (getFilter ()); while (0 != mImages.size ()) copy (); |
From: <der...@us...> - 2004-01-19 23:14:25
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1:/tmp/cvs-serv32229/docs Modified Files: release.txt Log Message: Update version to 1.4-20040119. Index: release.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/release.txt,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** release.txt 4 Jan 2004 19:03:35 -0000 1.53 --- release.txt 19 Jan 2004 23:14:17 -0000 1.54 *************** *** 1,3 **** ! HTMLParser Version 1.4 (Integration Build Jan 04, 2004) ********************************************* --- 1,3 ---- ! HTMLParser Version 1.4 (Integration Build Jan 19, 2004) ********************************************* |
From: <der...@us...> - 2004-01-19 23:14:25
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser Modified Files: Parser.java Log Message: Update version to 1.4-20040119. Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.83 retrieving revision 1.84 diff -C2 -d -r1.83 -r1.84 *** Parser.java 14 Jan 2004 02:53:46 -0000 1.83 --- Parser.java 19 Jan 2004 23:14:18 -0000 1.84 *************** *** 85,89 **** */ public final static String ! VERSION_DATE = "Jan 04, 2004" ; --- 85,89 ---- */ public final static String ! VERSION_DATE = "Jan 19, 2004" ; |
From: <der...@us...> - 2004-01-19 23:13:08
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1:/tmp/cvs-serv31736 Modified Files: changes.txt Log Message: Update version to 1.4-20040119. Fix missed application file. Index: changes.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/changes.txt,v retrieving revision 1.194 retrieving revision 1.195 diff -C2 -d -r1.194 -r1.195 *** changes.txt 4 Jan 2004 19:03:35 -0000 1.194 --- changes.txt 19 Jan 2004 23:13:05 -0000 1.195 *************** *** 13,16 **** --- 13,95 ---- ******************************************************************************* + Integration Build 1.4 - 20040119 + -------------------------------- + + 2004-01-19 17:44 derrickoswald + + * src/org/htmlparser/tags/CompositeTag.java: + + Fix CompositeTag.toString() which caused java.lang.StackOverflowError for tags of the form <td width="69"/>. + In this case the end tag is 'this' tag which wasn't handled by the output code. + Added testXMLTypeToString() to ParserTest. 
+ + 2004-01-13 21:53 derrickoswald + + * build.xml, src/doc-files/todo.html, + src/org/htmlparser/Parser.java, + src/org/htmlparser/PrototypicalNodeFactory.java, + src/org/htmlparser/RemarkNode.java, + src/org/htmlparser/StringNode.java, + src/org/htmlparser/lexer/nodes/TagNode.java, + src/org/htmlparser/parserapplications/SiteCapturer.java, + src/org/htmlparser/scanners/CompositeTagScanner.java, + src/org/htmlparser/scanners/ScriptScanner.java, + src/org/htmlparser/tags/BaseHrefTag.java, + src/org/htmlparser/tags/FormTag.java, + src/org/htmlparser/tags/FrameTag.java, + src/org/htmlparser/tags/ImageTag.java, + src/org/htmlparser/tags/JspTag.java, + src/org/htmlparser/tags/SelectTag.java, + src/org/htmlparser/tags/Tag.java, + src/org/htmlparser/tags/TextareaTag.java, + src/org/htmlparser/tests/FunctionalTests.java, + src/org/htmlparser/tests/ParserTest.java, + src/org/htmlparser/tests/ParserTestCase.java, + src/org/htmlparser/tests/lexerTests/AttributeTests.java, + src/org/htmlparser/tests/lexerTests/KitTest.java, + src/org/htmlparser/tests/lexerTests/LexerTests.java, + src/org/htmlparser/tests/lexerTests/PageTests.java, + src/org/htmlparser/tests/lexerTests/SourceTests.java, + src/org/htmlparser/tests/lexerTests/StreamTests.java, + src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java, + src/org/htmlparser/tests/scannersTests/JspScannerTest.java, + src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java, + src/org/htmlparser/tests/scannersTests/TagScannerTest.java, + src/org/htmlparser/tests/tagTests/BaseHrefTagTest.java, + src/org/htmlparser/tests/tagTests/BulletTagTest.java, + src/org/htmlparser/tests/tagTests/ImageTagTest.java, + src/org/htmlparser/tests/tagTests/JspTagTest.java, + src/org/htmlparser/tests/tagTests/LinkTagTest.java, + src/org/htmlparser/tests/tagTests/ScriptTagTest.java, + src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java, + src/org/htmlparser/util/ParserUtils.java, + src/org/htmlparser/util/sort/Sort.java, + 
src/org/htmlparser/visitors/HtmlPage.java, + src/org/htmlparser/visitors/TextExtractingVisitor.java: + + Remove unneeded imports. + + 2004-01-10 10:23 derrickoswald + + * src/org/htmlparser/: beans/StringBean.java, lexer/Page.java, + tests/lexerTests/LexerTests.java, + util/EncodingChangeException.java, util/IteratorImpl.java: + + Fix bug #874175 StringBean doesn't handle charset change well + Add EncodingChangeException to distinguish a recoverable character set change + occuring after the lexer has already coughed up some characters using the wrong + encoding. Added testEncodingChange in LexerTests to excercise it. + Changed IteratorImpl to not wrap a ParserException with another ParserException. + Changed StringBean to retry the URL when an encoding change exception is caught. + + 2004-01-09 19:06 derrickoswald + + * src/org/htmlparser/: filters/HasAttributeFilter.java, + parserapplications/SiteCapturer.java, + parserapplications/WikiCapturer.java, util/NodeList.java: + + First pass at the wiki capturer. + Added useful extensions to the HasAttributeFilter, SiteCapturer and NodeList + Integration Build 1.4 - 20040104 -------------------------------- |
From: <der...@us...> - 2004-01-19 22:45:02
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv22765 Modified Files: CompositeTag.java Log Message: Fix CompositeTag.toString() which caused java.lang.StackOverflowError for tags of the form <td width="69"/>. In this case the end tag is 'this' tag which wasn't handled by the output code. Added testXMLTypeToString() to ParserTest. Index: CompositeTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v retrieving revision 1.71 retrieving revision 1.72 diff -C2 -d -r1.71 -r1.72 *** CompositeTag.java 2 Jan 2004 16:24:54 -0000 1.71 --- CompositeTag.java 19 Jan 2004 22:44:59 -0000 1.72 *************** *** 484,488 **** } ! if (null != getEndTag ()) // eliminate virtual tags // if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ())) --- 484,488 ---- } ! if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/> // eliminate virtual tags // if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ())) |
From: <der...@us...> - 2004-01-14 03:20:04
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs1:/tmp/cvs-serv32457 Modified Files: CharacterTranslationTest.java Log Message: Index: CharacterTranslationTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** CharacterTranslationTest.java 14 Jan 2004 03:10:55 -0000 1.40 --- CharacterTranslationTest.java 14 Jan 2004 03:20:01 -0000 1.41 *************** *** 65,83 **** } ! public void testInitialCharacterEntityReferenceWithoutSemi () ! { ! assertEquals ( ! "character entity reference without a semicolon at start of string doesn't work", ! "\u00f7 is the division sign.", ! Translate.decode ("&divide is the division sign.")); ! } ! ! public void testInitialNumericCharacterReferenceWithoutSemi () ! { ! assertEquals ( ! "numeric character reference without a semicolon at start of string doesn't work", ! "\u00f7 is the division sign.", ! Translate.decode ("&#247 is the division sign.")); ! } public void testFinalCharacterEntityReference () --- 65,83 ---- } ! // public void testInitialCharacterEntityReferenceWithoutSemi () ! // { ! // assertEquals ( ! // "character entity reference without a semicolon at start of string doesn't work", ! // "\u00f7 is the division sign.", ! // Translate.decode ("&divide is the division sign.")); ! // } ! // ! // public void testInitialNumericCharacterReferenceWithoutSemi () ! // { ! // assertEquals ( ! // "numeric character reference without a semicolon at start of string doesn't work", ! // "\u00f7 is the division sign.", ! // Translate.decode ("&#247 is the division sign.")); ! // } public void testFinalCharacterEntityReference () |
From: <der...@us...> - 2004-01-14 03:10:58
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs1:/tmp/cvs-serv31253 Modified Files: CharacterTranslationTest.java Log Message: Index: CharacterTranslationTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** CharacterTranslationTest.java 14 Jan 2004 02:53:47 -0000 1.39 --- CharacterTranslationTest.java 14 Jan 2004 03:10:55 -0000 1.40 *************** *** 153,241 **** } ! public byte[] encodedecode (byte[] bytes) ! throws ! IOException ! { ! InputStream in; ! ByteArrayOutputStream out; ! ! // encode ! in = new ByteArrayInputStream (bytes); ! out = new ByteArrayOutputStream (); ! Translate.encode (in, new PrintStream (out)); ! in.close (); ! out.close (); ! ! // decode ! in = new ByteArrayInputStream (out.toByteArray ()); ! out = new ByteArrayOutputStream (); ! Translate.decode (in, new PrintStream (out)); ! in.close (); ! out.close (); ! ! return (out.toByteArray ()); ! } ! ! public void check (byte[] reference, byte[] result) ! throws ! IOException ! { ! InputStream ref; ! InputStream in; ! int i; ! int i1; ! int i2; ! ! ref = new ByteArrayInputStream (reference); ! in = new ByteArrayInputStream (result); ! i = 0; ! do ! { ! i1 = ref.read (); ! i2 = in.read (); ! if (i1 != i2) ! fail ("byte difference detected at offset " + i); ! i++; ! } ! while (-1 != i1); ! ref.close (); ! in.close (); ! } ! ! // public void testInitialCharacterEntityReferenceCodec () // throws // IOException // { ! // byte[] data = "\u00f7 is the division sign.".getBytes (); ! 
// check (data, encodedecode (data)); // } - - public void testEncodeDecodePage () throws IOException - { - URL url; - URLConnection connection; - InputStream in; - ByteArrayOutputStream out; - byte[] bytes; - byte[] result; - int c; - - // get some bytes - url = new URL ("http://sourceforge.net/projects/htmlparser"); - connection = url.openConnection (); - in = connection.getInputStream (); - out = new ByteArrayOutputStream (); - while (-1 != (c = in.read ())) - out.write (c); - in.close (); - out.close (); - bytes = out.toByteArray (); - - // run it through - result = encodedecode (bytes); - - // check - check (bytes, result); - } } --- 153,241 ---- } ! // public byte[] encodedecode (byte[] bytes) // throws // IOException // { ! // InputStream in; ! // ByteArrayOutputStream out; ! // ! // // encode ! // in = new ByteArrayInputStream (bytes); ! // out = new ByteArrayOutputStream (); ! // Translate.encode (in, new PrintStream (out)); ! // in.close (); ! // out.close (); ! // ! // // decode ! // in = new ByteArrayInputStream (out.toByteArray ()); ! // out = new ByteArrayOutputStream (); ! // Translate.decode (in, new PrintStream (out)); ! // in.close (); ! // out.close (); ! // ! // return (out.toByteArray ()); ! // } ! // ! // public void check (byte[] reference, byte[] result) ! // throws ! // IOException ! // { ! // InputStream ref; ! // InputStream in; ! // int i; ! // int i1; ! // int i2; ! // ! // ref = new ByteArrayInputStream (reference); ! // in = new ByteArrayInputStream (result); ! // i = 0; ! // do ! // { ! // i1 = ref.read (); ! // i2 = in.read (); ! // if (i1 != i2) ! // fail ("byte difference detected at offset " + i); ! // i++; ! // } ! // while (-1 != i1); ! // ref.close (); ! // in.close (); ! // } ! // ! //// public void testInitialCharacterEntityReferenceCodec () ! //// throws ! //// IOException ! //// { ! //// byte[] data = "\u00f7 is the division sign.".getBytes (); ! //// check (data, encodedecode (data)); ! //// } ! // ! 
// public void testEncodeDecodePage () throws IOException ! // { ! // URL url; ! // URLConnection connection; ! // InputStream in; ! // ByteArrayOutputStream out; ! // byte[] bytes; ! // byte[] result; ! // int c; ! // ! // // get some bytes ! // url = new URL ("http://sourceforge.net/projects/htmlparser"); ! // connection = url.openConnection (); ! // in = connection.getInputStream (); ! // out = new ByteArrayOutputStream (); ! // while (-1 != (c = in.read ())) ! // out.write (c); ! // in.close (); ! // out.close (); ! // bytes = out.toByteArray (); ! // ! // // run it through ! // result = encodedecode (bytes); ! // ! // // check ! // check (bytes, result); // } } |
From: <der...@us...> - 2004-01-14 02:58:09
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv28895 Modified Files: InputTag.java Log Message: Remove unneeded imports. Index: InputTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/InputTag.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** InputTag.java 2 Jan 2004 16:24:55 -0000 1.34 --- InputTag.java 14 Jan 2004 02:58:06 -0000 1.35 *************** *** 27,32 **** package org.htmlparser.tags; - import org.htmlparser.util.ParserUtils; - /** * An input tag in a form. --- 27,30 ---- |
From: <der...@us...> - 2004-01-14 02:54:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/sort In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/util/sort Modified Files: Sort.java Log Message: Index: Sort.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/sort/Sort.java,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** Sort.java 2 Jan 2004 16:24:58 -0000 1.11 --- Sort.java 14 Jan 2004 02:53:47 -0000 1.12 *************** *** 485,488 **** --- 485,489 ---- /** * Binary search for an object + * @param vector The vector of <code>Ordered</code> objects. * @param ref The name to search for. * @return The index at which reference was found or is to be inserted. *************** *** 492,495 **** --- 493,549 ---- return (bsearch (vector, ref, 0, vector.size () - 1)); } + + /** + * Binary search for an object + * @param array The array of <code>Ordered</code> objects. + * @param ref The name to search for. + * @param lo The lower index within which to look. + * @param hi The upper index within which to look. + * @return The index at which reference was found or is to be inserted. + */ + public static int bsearch (Ordered[] array, Ordered ref, int lo, int hi) + { int num; + int mid; + int half; + int result; + int ret; + + ret = -1; + + num = (hi - lo) + 1; + while ((-1 == ret) && (lo <= hi)) + { + half = num / 2; + mid = lo + ((0 != (num & 1)) ? half : half - 1); + result = ref.compare (array[mid]); + if (0 == result) + ret = mid; + else if (0 > result) + { + hi = mid - 1; + num = ((0 != (num & 1)) ? half : half - 1); + } + else + { + lo = mid + 1; + num = half; + } + } + if (-1 == ret) + ret = lo; + + return (ret); + } + + /** + * Binary search for an object + * @param array The array of <code>Ordered</code> objects. + * @param ref The name to search for. + * @return The index at which reference was found or is to be inserted. 
+ */ + public static int bsearch (Ordered[] array, Ordered ref) + { + return (bsearch (array, ref, 0, array.length - 1)); + } } |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/lexerTests Modified Files: AttributeTests.java KitTest.java LexerTests.java PageTests.java SourceTests.java StreamTests.java Log Message: Index: AttributeTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/AttributeTests.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** AttributeTests.java 2 Jan 2004 16:24:55 -0000 1.10 --- AttributeTests.java 14 Jan 2004 02:53:47 -0000 1.11 *************** *** 29,36 **** import java.util.Hashtable; import java.util.Vector; - import junit.framework.TestSuite; import org.htmlparser.Node; - import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.lexer.nodes.Attribute; --- 29,34 ---- Index: KitTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/KitTest.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** KitTest.java 20 Oct 2003 01:28:03 -0000 1.5 --- KitTest.java 14 Jan 2004 02:53:47 -0000 1.6 *************** *** 31,38 **** import java.util.Vector; import javax.swing.text.BadLocationException; - import javax.swing.text.Document; - import javax.swing.text.EditorKit; import javax.swing.text.Element; - import javax.swing.text.ElementIterator; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; --- 31,35 ---- *************** *** 43,47 **** import org.htmlparser.lexer.Cursor; import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; import org.htmlparser.AbstractNode; import org.htmlparser.lexer.nodes.Attribute; --- 40,43 ---- *************** *** 608,611 **** --- 604,610 ---- * * $Log$ + * Revision 1.6 2004/01/14 02:53:47 derrickoswald + * *** 
empty log message *** + * * Revision 1.5 2003/10/20 01:28:03 derrickoswald * Removed lexer level AbstractNode. Index: LexerTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** LexerTests.java 10 Jan 2004 15:23:33 -0000 1.16 --- LexerTests.java 14 Jan 2004 02:53:47 -0000 1.17 *************** *** 27,39 **** package org.htmlparser.tests.lexerTests; - import java.io.BufferedReader; - import java.io.ByteArrayInputStream; import java.io.IOException; - import java.io.InputStream; - import java.io.InputStreamReader; - import java.io.StringReader; - import java.io.UnsupportedEncodingException; import java.net.URL; - import java.net.URLConnection; import java.util.HashSet; --- 27,32 ---- *************** *** 41,48 **** import org.htmlparser.Parser; import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; - import org.htmlparser.lexer.PageIndex; - import org.htmlparser.lexer.Source; - import org.htmlparser.lexer.Stream; import org.htmlparser.lexer.nodes.RemarkNode; import org.htmlparser.lexer.nodes.StringNode; --- 34,37 ---- *************** *** 52,56 **** import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; - import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.ParserException; --- 41,44 ---- *************** *** 621,629 **** * causes spurious tags. * The root cause is characters bracketed by [esc]$B and [esc](J (contrary ! * to what is indicated in the j_s_nightingale analysis of the problem) that * sometimes have an angle bracket (< or 0x3c) embedded in them. These * are taken to be tags by the parser, instead of being considered strings. * <p> ! * The URL http://www.009.com/ has an ISO-8859-1 encoding (the default), but * Japanese characters intermixed on the page with English, using the JIS * encoding. 
We detect failure by looking for weird tag names which were --- 609,617 ---- * causes spurious tags. * The root cause is characters bracketed by [esc]$B and [esc](J (contrary ! * to what is indicated in then j_s_nightingale analysis of the problem) that * sometimes have an angle bracket (< or 0x3c) embedded in them. These * are taken to be tags by the parser, instead of being considered strings. * <p> ! * The URL refrenced has an ISO-8859-1 encoding (the default), but * Japanese characters intermixed on the page with English, using the JIS * encoding. We detect failure by looking for weird tag names which were *************** *** 667,671 **** NodeIterator iterator; ! parser = new Parser ("http://htmlparser.sourceforge.net/test/www_009_com.html"); iterator = parser.elements (); while (iterator.hasMoreNodes ()) --- 655,659 ---- NodeIterator iterator; ! parser = new Parser ("http://www.009.com/"); iterator = parser.elements (); while (iterator.hasMoreNodes ()) *************** *** 746,784 **** } - /** - * See bug #874175 StringBean doesn't handle charset change well - * Force an encoding change exception, reset and re-read. 
- */ - public void testEncodingChange () - throws - ParserException - { - NodeIterator iterator; - Node node; - boolean success; - - parser = new Parser ("http://htmlparser.sourceforge.net/test/www_china-pub_com.html"); - success = false; - try - { - for (iterator = parser.elements (); iterator.hasMoreNodes (); ) - node = iterator.nextNode (); - } - catch (EncodingChangeException ece) - { - success = true; - try - { - parser.reset (); - for (iterator = parser.elements (); iterator.hasMoreNodes (); ) - node = iterator.nextNode (); - } - catch (ParserException pe) - { - success = false; - } - } - assertTrue ("encoding change failed", success); - } } --- 734,737 ---- Index: PageTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/PageTests.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** PageTests.java 2 Jan 2004 16:24:56 -0000 1.15 --- PageTests.java 14 Jan 2004 02:53:47 -0000 1.16 *************** *** 28,32 **** import java.io.IOException; - import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; --- 28,31 ---- Index: SourceTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/SourceTests.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** SourceTests.java 2 Jan 2004 16:24:56 -0000 1.15 --- SourceTests.java 14 Jan 2004 02:53:47 -0000 1.16 *************** *** 30,40 **** import java.io.ByteArrayInputStream; import java.io.IOException; - import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; - import java.util.ArrayList; - import java.util.Random; import org.htmlparser.lexer.Stream; --- 30,37 ---- Index: StreamTests.java 
=================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/StreamTests.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** StreamTests.java 2 Jan 2004 16:24:56 -0000 1.15 --- StreamTests.java 14 Jan 2004 02:53:47 -0000 1.16 *************** *** 30,39 **** import java.io.ByteArrayInputStream; import java.io.IOException; - import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; - import java.util.Random; import org.htmlparser.lexer.Stream; --- 30,37 ---- |
From: <der...@us...> - 2004-01-14 02:54:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/utilTests Modified Files: CharacterTranslationTest.java Log Message: Index: CharacterTranslationTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** CharacterTranslationTest.java 2 Jan 2004 16:24:57 -0000 1.38 --- CharacterTranslationTest.java 14 Jan 2004 02:53:47 -0000 1.39 *************** *** 27,30 **** --- 27,37 ---- package org.htmlparser.tests.utilTests; + import java.io.ByteArrayInputStream; + import java.io.ByteArrayOutputStream; + import java.io.IOException; + import java.io.InputStream; + import java.io.PrintStream; + import java.net.URL; + import java.net.URLConnection; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.util.Translate; *************** *** 63,67 **** "character entity reference without a semicolon at start of string doesn't work", "\u00f7 is the division sign.", ! Translate.decode ("÷ is the division sign.")); } --- 70,74 ---- "character entity reference without a semicolon at start of string doesn't work", "\u00f7 is the division sign.", ! Translate.decode ("÷ is the division sign.")); } *************** *** 71,75 **** "numeric character reference without a semicolon at start of string doesn't work", "\u00f7 is the division sign.", ! Translate.decode ("÷ is the division sign.")); } --- 78,82 ---- "numeric character reference without a semicolon at start of string doesn't work", "\u00f7 is the division sign.", ! 
Translate.decode ("÷ is the division sign.")); } *************** *** 145,148 **** --- 152,241 ---- Translate.encode ("<a href=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>")); } + + public byte[] encodedecode (byte[] bytes) + throws + IOException + { + InputStream in; + ByteArrayOutputStream out; + + // encode + in = new ByteArrayInputStream (bytes); + out = new ByteArrayOutputStream (); + Translate.encode (in, new PrintStream (out)); + in.close (); + out.close (); + + // decode + in = new ByteArrayInputStream (out.toByteArray ()); + out = new ByteArrayOutputStream (); + Translate.decode (in, new PrintStream (out)); + in.close (); + out.close (); + + return (out.toByteArray ()); + } + + public void check (byte[] reference, byte[] result) + throws + IOException + { + InputStream ref; + InputStream in; + int i; + int i1; + int i2; + + ref = new ByteArrayInputStream (reference); + in = new ByteArrayInputStream (result); + i = 0; + do + { + i1 = ref.read (); + i2 = in.read (); + if (i1 != i2) + fail ("byte difference detected at offset " + i); + i++; + } + while (-1 != i1); + ref.close (); + in.close (); + } + + // public void testInitialCharacterEntityReferenceCodec () + // throws + // IOException + // { + // byte[] data = "\u00f7 is the division sign.".getBytes (); + // check (data, encodedecode (data)); + // } + + public void testEncodeDecodePage () throws IOException + { + URL url; + URLConnection connection; + InputStream in; + ByteArrayOutputStream out; + byte[] bytes; + byte[] result; + int c; + + // get some bytes + url = new URL ("http://sourceforge.net/projects/htmlparser"); + connection = url.openConnection (); + in = connection.getInputStream (); + out = new ByteArrayOutputStream (); + while (-1 != (c = in.read ())) + out.write (c); + in.close (); + out.close (); + bytes = out.toByteArray (); + + // run it through + result = encodedecode (bytes); + + // check + check (bytes, result); + } } |
From: <der...@us...> - 2004-01-14 02:54:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/visitors Modified Files: HtmlPage.java TextExtractingVisitor.java Log Message: Index: HtmlPage.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/HtmlPage.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** HtmlPage.java 2 Jan 2004 16:24:58 -0000 1.41 --- HtmlPage.java 14 Jan 2004 02:53:47 -0000 1.42 *************** *** 27,34 **** package org.htmlparser.visitors; - import org.htmlparser.Node; import org.htmlparser.Parser; - import org.htmlparser.RemarkNode; - import org.htmlparser.StringNode; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.TableTag; --- 27,31 ---- Index: TextExtractingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** TextExtractingVisitor.java 2 Jan 2004 16:24:58 -0000 1.39 --- TextExtractingVisitor.java 14 Jan 2004 02:53:47 -0000 1.40 *************** *** 29,33 **** import org.htmlparser.StringNode; import org.htmlparser.tags.Tag; - import org.htmlparser.tags.TitleTag; import org.htmlparser.util.Translate; --- 29,32 ---- |
From: <der...@us...> - 2004-01-14 02:54:00
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests Modified Files: FunctionalTests.java ParserTest.java ParserTestCase.java Log Message: Index: FunctionalTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/FunctionalTests.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** FunctionalTests.java 2 Jan 2004 16:24:55 -0000 1.53 --- FunctionalTests.java 14 Jan 2004 02:53:47 -0000 1.54 *************** *** 29,39 **** import java.io.BufferedReader; import java.io.IOException; - import java.io.InputStream; - import java.io.InputStreamReader; - import java.io.Reader; - import java.net.MalformedURLException; - import java.net.URL; - import junit.framework.TestCase; import junit.framework.TestSuite; --- 29,33 ---- *************** *** 43,47 **** import org.htmlparser.tags.ImageTag; import org.htmlparser.util.DefaultParserFeedback; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; --- 37,40 ---- Index: ParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** ParserTest.java 2 Jan 2004 16:24:55 -0000 1.53 --- ParserTest.java 14 Jan 2004 02:53:47 -0000 1.54 *************** *** 36,40 **** import java.net.URL; import java.net.URLConnection; - import java.util.Map; import org.htmlparser.AbstractNode; --- 36,39 ---- *************** *** 47,51 **** import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; - import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.ImageTag; --- 46,49 ---- Index: ParserTestCase.java 
=================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTestCase.java,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** ParserTestCase.java 2 Jan 2004 16:24:55 -0000 1.43 --- ParserTestCase.java 14 Jan 2004 02:53:47 -0000 1.44 *************** *** 27,36 **** package org.htmlparser.tests; - import java.io.BufferedReader; - import java.io.StringReader; import java.util.Enumeration; import java.util.Iterator; import java.util.Properties; - import java.util.Vector; import junit.framework.TestCase; --- 27,33 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/scannersTests Modified Files: CompositeTagScannerTest.java JspScannerTest.java ScriptScannerTest.java TagScannerTest.java Log Message: Index: CompositeTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java,v retrieving revision 1.58 retrieving revision 1.59 diff -C2 -d -r1.58 -r1.59 *** CompositeTagScannerTest.java 2 Jan 2004 16:24:56 -0000 1.58 --- CompositeTagScannerTest.java 14 Jan 2004 02:53:47 -0000 1.59 *************** *** 27,36 **** package org.htmlparser.tests.scannersTests; - import java.util.Vector; import org.htmlparser.AbstractNode; import org.htmlparser.Node; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.StringNode; - import org.htmlparser.lexer.Page; import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; --- 27,34 ---- *************** *** 42,46 **** import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; --- 40,43 ---- Index: JspScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/JspScannerTest.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** JspScannerTest.java 2 Jan 2004 16:24:56 -0000 1.36 --- JspScannerTest.java 14 Jan 2004 02:53:47 -0000 1.37 *************** *** 29,33 **** import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.scanners.JspScanner; import org.htmlparser.tags.JspTag; import org.htmlparser.tests.ParserTestCase; --- 29,32 ---- Index: ScriptScannerTest.java 
=================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** ScriptScannerTest.java 2 Jan 2004 16:24:56 -0000 1.51 --- ScriptScannerTest.java 14 Jan 2004 02:53:47 -0000 1.52 *************** *** 31,35 **** import org.htmlparser.Node; import org.htmlparser.Parser; - import org.htmlparser.scanners.ScriptScanner; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.ScriptTag; --- 31,34 ---- Index: TagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/TagScannerTest.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** TagScannerTest.java 2 Jan 2004 16:24:56 -0000 1.39 --- TagScannerTest.java 14 Jan 2004 02:53:47 -0000 1.40 *************** *** 27,39 **** package org.htmlparser.tests.scannersTests; - import java.util.Vector; - import org.htmlparser.Node; - import org.htmlparser.Parser; - import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; - import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; --- 27,32 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/tagTests Modified Files: BaseHrefTagTest.java BulletTagTest.java ImageTagTest.java JspTagTest.java LinkTagTest.java ScriptTagTest.java Log Message: Index: BaseHrefTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/BaseHrefTagTest.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** BaseHrefTagTest.java 2 Jan 2004 16:24:57 -0000 1.38 --- BaseHrefTagTest.java 14 Jan 2004 02:53:47 -0000 1.39 *************** *** 27,32 **** package org.htmlparser.tests.tagTests; - import java.util.Vector; - import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.tags.BaseHrefTag; --- 27,30 ---- Index: BulletTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/BulletTagTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** BulletTagTest.java 7 Dec 2003 23:41:43 -0000 1.1 --- BulletTagTest.java 14 Jan 2004 02:53:47 -0000 1.2 *************** *** 27,31 **** package org.htmlparser.tests.tagTests; - import org.htmlparser.Node; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.tags.Bullet; --- 27,30 ---- Index: ImageTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/ImageTagTest.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** ImageTagTest.java 2 Jan 2004 16:24:57 -0000 1.41 --- ImageTagTest.java 14 Jan 2004 02:53:47 -0000 1.42 *************** *** 36,45 **** import org.htmlparser.tags.TableRow; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.LinkProcessor; import 
org.htmlparser.util.NodeIterator; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; - import org.htmlparser.util.SimpleNodeIterator; public class ImageTagTest extends ParserTestCase --- 36,42 ---- Index: JspTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/JspTagTest.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** JspTagTest.java 2 Jan 2004 16:24:57 -0000 1.42 --- JspTagTest.java 14 Jan 2004 02:53:47 -0000 1.43 *************** *** 29,33 **** import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.scanners.JspScanner; import org.htmlparser.tags.JspTag; import org.htmlparser.tags.Tag; --- 29,32 ---- Index: LinkTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/LinkTagTest.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** LinkTagTest.java 2 Jan 2004 16:24:57 -0000 1.45 --- LinkTagTest.java 14 Jan 2004 02:53:47 -0000 1.46 *************** *** 27,31 **** package org.htmlparser.tests.tagTests; - import java.util.Vector; import org.htmlparser.AbstractNode; import org.htmlparser.Node; --- 27,30 ---- Index: ScriptTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/ScriptTagTest.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** ScriptTagTest.java 2 Jan 2004 16:24:57 -0000 1.42 --- ScriptTagTest.java 14 Jan 2004 02:53:47 -0000 1.43 *************** *** 29,37 **** import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.StringNode; import org.htmlparser.scanners.ScriptScanner; import 
org.htmlparser.tags.ScriptTag; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; --- 29,35 ---- |
From: <der...@us...> - 2004-01-14 02:54:00
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/util Modified Files: ParserUtils.java Log Message: Index: ParserUtils.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** ParserUtils.java 2 Jan 2004 16:24:58 -0000 1.38 --- ParserUtils.java 14 Jan 2004 02:53:47 -0000 1.39 *************** *** 27,39 **** package org.htmlparser.util; - import java.util.Enumeration; - import java.util.Hashtable; - import java.util.Map; - import org.htmlparser.Node; import org.htmlparser.NodeFilter; - import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; - import org.htmlparser.tags.Tag; public class ParserUtils --- 27,33 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tags Modified Files: BaseHrefTag.java FormTag.java FrameTag.java ImageTag.java JspTag.java SelectTag.java Tag.java TextareaTag.java Log Message: Index: BaseHrefTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/BaseHrefTag.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** BaseHrefTag.java 2 Jan 2004 16:24:54 -0000 1.36 --- BaseHrefTag.java 14 Jan 2004 02:53:46 -0000 1.37 *************** *** 28,32 **** import org.htmlparser.lexer.Page; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.ParserException; --- 28,31 ---- Index: FormTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/FormTag.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** FormTag.java 2 Jan 2004 16:24:54 -0000 1.45 --- FormTag.java 14 Jan 2004 02:53:46 -0000 1.46 *************** *** 28,32 **** import org.htmlparser.util.NodeList; - import org.htmlparser.util.ParserException; import org.htmlparser.util.SimpleNodeIterator; --- 28,31 ---- Index: FrameTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/FrameTag.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** FrameTag.java 2 Jan 2004 16:24:54 -0000 1.34 --- FrameTag.java 14 Jan 2004 02:53:46 -0000 1.35 *************** *** 27,32 **** package org.htmlparser.tags; - import org.htmlparser.util.LinkProcessor; - /** * Identifies a frame tag --- 27,30 ---- Index: ImageTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ImageTag.java,v retrieving revision 
1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** ImageTag.java 2 Jan 2004 16:24:55 -0000 1.40 --- ImageTag.java 14 Jan 2004 02:53:46 -0000 1.41 *************** *** 29,34 **** import java.util.Vector; import org.htmlparser.lexer.nodes.Attribute; - import org.htmlparser.lexer.nodes.TagNode; - import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; import org.htmlparser.visitors.NodeVisitor; --- 29,32 ---- Index: JspTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/JspTag.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** JspTag.java 2 Jan 2004 16:24:55 -0000 1.38 --- JspTag.java 14 Jan 2004 02:53:46 -0000 1.39 *************** *** 27,33 **** package org.htmlparser.tags; - import org.htmlparser.Node; - import org.htmlparser.util.SimpleNodeIterator; - /** * The JSP/ASP tags like <%...%> can be identified by this class. --- 27,30 ---- Index: SelectTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/SelectTag.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** SelectTag.java 2 Jan 2004 16:24:55 -0000 1.36 --- SelectTag.java 14 Jan 2004 02:53:46 -0000 1.37 *************** *** 27,34 **** package org.htmlparser.tags; - import org.htmlparser.Node; - import org.htmlparser.util.NodeList; - import org.htmlparser.util.ParserUtils; /** --- 27,31 ---- Index: Tag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/Tag.java,v retrieving revision 1.60 retrieving revision 1.61 diff -C2 -d -r1.60 -r1.61 *** Tag.java 2 Jan 2004 16:24:55 -0000 1.60 --- Tag.java 14 Jan 2004 02:53:46 -0000 1.61 *************** *** 32,36 **** import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.scanners.TagScanner; - import 
org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; --- 32,35 ---- Index: TextareaTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/TextareaTag.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** TextareaTag.java 2 Jan 2004 16:24:55 -0000 1.33 --- TextareaTag.java 14 Jan 2004 02:53:46 -0000 1.34 *************** *** 27,32 **** package org.htmlparser.tags; - import org.htmlparser.util.ParserUtils; - /** * A text area tag within a form. --- 27,30 ---- |
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser Modified Files: Parser.java PrototypicalNodeFactory.java RemarkNode.java StringNode.java Log Message: Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.82 retrieving revision 1.83 diff -C2 -d -r1.82 -r1.83 *** Parser.java 4 Jan 2004 19:03:36 -0000 1.82 --- Parser.java 14 Jan 2004 02:53:46 -0000 1.83 *************** *** 33,40 **** import java.net.URL; import java.net.URLConnection; - import java.util.Hashtable; - import java.util.Iterator; - import java.util.Map; - import java.util.Vector; import org.htmlparser.filters.TagNameFilter; --- 33,36 ---- *************** *** 42,51 **** import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; - import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; - import org.htmlparser.nodeDecorators.DecodingNode; - import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; - import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; - import org.htmlparser.tags.Tag; // temporarily import org.htmlparser.util.DefaultParserFeedback; import org.htmlparser.util.IteratorImpl; --- 38,42 ---- Index: PrototypicalNodeFactory.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/PrototypicalNodeFactory.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** PrototypicalNodeFactory.java 8 Dec 2003 13:13:58 -0000 1.2 --- PrototypicalNodeFactory.java 14 Jan 2004 02:53:46 -0000 1.3 *************** *** 34,41 **** import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; - import org.htmlparser.nodeDecorators.DecodingNode; - import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; - import 
org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; - //import org.htmlparser.tags.Tag; import org.htmlparser.tags.*; // import everything for now import org.htmlparser.util.ParserException; --- 34,37 ---- Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** RemarkNode.java 2 Jan 2004 16:24:52 -0000 1.40 --- RemarkNode.java 14 Jan 2004 02:53:46 -0000 1.41 *************** *** 28,32 **** import org.htmlparser.lexer.Page; - import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; --- 28,31 ---- Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** StringNode.java 2 Jan 2004 16:24:52 -0000 1.48 --- StringNode.java 14 Jan 2004 02:53:46 -0000 1.49 *************** *** 28,32 **** import org.htmlparser.lexer.Page; - import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; --- 28,31 ---- |
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/lexer/nodes Modified Files: TagNode.java Log Message: Index: TagNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/TagNode.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** TagNode.java 2 Jan 2004 16:24:53 -0000 1.27 --- TagNode.java 14 Jan 2004 02:53:46 -0000 1.28 *************** *** 35,39 **** import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.SpecialHashtable; --- 35,38 ---- |
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/scanners Modified Files: CompositeTagScanner.java ScriptScanner.java Log Message: Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.84 retrieving revision 1.85 diff -C2 -d -r1.84 -r1.85 *** CompositeTagScanner.java 20 Dec 2003 23:47:55 -0000 1.84 --- CompositeTagScanner.java 14 Jan 2004 02:53:46 -0000 1.85 *************** *** 27,32 **** package org.htmlparser.scanners; - import java.util.HashSet; - import java.util.Set; import java.util.Vector; --- 27,30 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.54 retrieving revision 1.55 diff -C2 -d -r1.54 -r1.55 *** ScriptScanner.java 20 Dec 2003 23:47:55 -0000 1.54 --- ScriptScanner.java 14 Jan 2004 02:53:46 -0000 1.55 *************** *** 30,42 **** import org.htmlparser.Node; - import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; import org.htmlparser.lexer.nodes.NodeFactory; import org.htmlparser.tags.CompositeTag; - import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeList; --- 30,39 ---- |
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/parserapplications Modified Files: SiteCapturer.java Log Message: Index: SiteCapturer.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** SiteCapturer.java 10 Jan 2004 00:06:03 -0000 1.3 --- SiteCapturer.java 14 Jan 2004 02:53:46 -0000 1.4 *************** *** 41,57 **** import javax.swing.JOptionPane; - import org.htmlparser.Node; - import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.filters.AndFilter; - import org.htmlparser.filters.HasAttributeFilter; - import org.htmlparser.filters.NodeClassFilter; - import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.tags.BaseHrefTag; import org.htmlparser.tags.FrameTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; - import org.htmlparser.tags.MetaTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; --- 41,50 ---- *************** *** 60,64 **** /** * Save a web site locally. ! * Illustrative program to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. --- 53,57 ---- /** * Save a web site locally. ! * Illustrative prgram to save a web site contents locally. * It was created to demonstrate URL rewriting in it's simplest form. * It uses customized tags in the NodeFactory to alter the URLs. *************** *** 131,139 **** /** - * The filter to apply to the nodes retrieved. - */ - protected NodeFilter mFilter; - - /** * Copy buffer size. * Resources are moved to disk in chunks this size or less. 
--- 124,127 ---- *************** *** 148,153 **** PrototypicalNodeFactory factory; - mSource = null; - mTarget = null; mPages = new ArrayList (); mFinished = new HashSet (); --- 136,139 ---- *************** *** 161,166 **** factory.registerTag (new LocalImageTag ()); mParser.setNodeFactory (factory); - mCaptureResources = true; - mFilter = null; } --- 147,150 ---- *************** *** 229,251 **** } - - /** Getter for property filter. - * @return Value of property filter. - * - */ - public NodeFilter getFilter () - { - return (mFilter); - } - - /** Setter for property filter. - * @param filter New value of property filter. - * - */ - public void setFilter (NodeFilter filter) - { - mFilter = filter; - } - /** * Returns <code>true</code> if the link is one we are interested in. --- 213,216 ---- *************** *** 316,320 **** String ret; ! if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/"))) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) --- 281,285 ---- String ret; ! if (link.equals (getSource ())) ret = "index.html"; // handle the root page specially else if (link.startsWith (getSource ()) *************** *** 417,430 **** * Process a single page. */ ! protected void process (NodeFilter filter) throws ParserException { String url; - int bookmark; NodeList list; - NodeList robots; - MetaTag robot; - String content; File file; File dir; --- 382,391 ---- * Process a single page. */ ! protected void process () throws ParserException { String url; NodeList list; File file; File dir; *************** *** 437,443 **** try ! { ! bookmark = mPages.size (); ! // fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); --- 398,402 ---- try ! 
{ // fetch the page and gather the list of nodes mParser.setURL (url); list = new NodeList (); *************** *** 445,470 **** list.add (e.nextNode ()); // URL conversion occurs in the tags - // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html - // <meta name="robots" content="index,follow" /> - // <meta name="robots" content="noindex,nofollow" /> - robots = list.extractAllNodesThatMatch ( - new AndFilter ( - new NodeClassFilter (MetaTag.class), - new HasAttributeFilter ("name", "robots")), true); - if (0 != robots.size ()) - { - robot = (MetaTag)robots.elementAt (0); - content = robot.getAttribute ("content").toLowerCase (); - if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow"))) - // reset mPages - for (int i = bookmark; i < mPages.size (); i++) - mPages.remove (i); - if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex"))) - return; - } - - if (null != filter) - list.keepAllNodesThatMatch (filter, true); - // save the page locally file = new File (getTarget (), makeLocalLink (url, "")); --- 404,407 ---- *************** *** 472,483 **** if (!dir.exists ()) dir.mkdirs (); - else if (!dir.isDirectory ()) - { - dir = new File (dir.getParentFile (), dir.getName () + ".content"); - if (!dir.exists ()) - dir.mkdirs (); - file = new File (dir, file.getName ()); - } - try { --- 409,412 ---- *************** *** 652,656 **** try { ! process (getFilter ()); while (0 != mImages.size ()) copy (); --- 581,585 ---- try { ! process (); while (0 != mImages.size ()) copy (); |
From: <der...@us...> - 2004-01-14 02:53:50
|
Update of /cvsroot/htmlparser/htmlparser/src/doc-files In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/doc-files Modified Files: todo.html Log Message: Index: todo.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/doc-files/todo.html,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** todo.html 31 Dec 2003 02:50:49 -0000 1.3 --- todo.html 14 Jan 2004 02:53:46 -0000 1.4 *************** *** 9,18 **** <ul> <li> - It looks like there are enough bugs and requests to warrant another 1.3 point - release with some patched files. - I hate to work on a branch, but it may be the only way to get everyone off my - back. - </li> - <li> As of now, it's more likely that the javadocs are lying to you than providing any helpful advice. This needs to be reworked completely. --- 9,12 ---- *************** *** 52,60 **** </li> <li> - Rework all the applications for a better 'out of the box' experience for new - and novice users. Fix all the scripts in /bin (for unix and windows) and add - any others that don't exist already. - </li> - <li> The tag-enders and end-tag-enders lists are only a partial solution to the HTML specification for block and inline tags. By marking each tag as a block or --- 46,49 ---- *************** *** 71,79 **** </li> <li> - Change all the headers to match the new format. The integration process needs to - be revamped to use the $Name: CVS substitution (via 'get label'), so a checkin - isn't required every integration. - </li> - <li> The default is now the equivalent of the old 'RegisterDomTags', so the operation of the following mainlines needs to be revisited: --- 60,63 ---- |
From: <der...@us...> - 2004-01-14 02:53:49
|
Update of /cvsroot/htmlparser/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv28098 Modified Files: build.xml Log Message: Index: build.xml =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/build.xml,v retrieving revision 1.57 retrieving revision 1.58 diff -C2 -d -r1.57 -r1.58 *** build.xml 4 Jan 2004 03:23:08 -0000 1.57 --- build.xml 14 Jan 2004 02:53:46 -0000 1.58 *************** *** 237,240 **** --- 237,241 ---- <include name="org/htmlparser/util/LinkProcessor.class"/> <include name="org/htmlparser/util/Translate.class"/> + <include name="org/htmlparser/util/EncodingChangeException.class"/> <include name="org/htmlparser/util/sort/**/*.class"/> <include name="org/htmlparser/parserHelper/SpecialHashtable.class"/> |
From: <der...@us...> - 2004-01-10 15:23:36
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv3574/util Modified Files: IteratorImpl.java Added Files: EncodingChangeException.java Log Message: Fix bug #874175 StringBean doesn't handle charset change well Add EncodingChangeException to distinguish a recoverable character set change occurring after the lexer has already coughed up some characters using the wrong encoding. Added testEncodingChange in LexerTests to exercise it. Changed IteratorImpl to not wrap a ParserException with another ParserException. Changed StringBean to retry the URL when an encoding change exception is caught. --- NEW FILE: EncodingChangeException.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Claude Duguay // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/EncodingChangeException.java,v $ // $Author: derrickoswald $ // $Date: 2004/01/10 15:23:33 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.util; /** * The encoding is changed invalidating already scanned characters.
* When the encoding is changed, as for example when encountering a <META> * tag that includes a charset directive in the content attribute that * disagrees with the encoding specified by the HTTP header (or the default * encoding if none), the parser retraces the bytes it has interpreted so far * comparing the characters produced under the new encoding. If the new * characters differ from those it has already yielded to the application, it * throws this exception to indicate that processing should be restarted under * the new encoding. * This exception is the object thrown so that applications may distinguish * between an encoding change, which may be successfully cured by restarting * the parse from the beginning, from more serious errors. * @see IteratorImpl * @see ParserException **/ public class EncodingChangeException extends ParserException { /** * Create an exception idicative of a problematic encoding change. * @param message The message describing the error condifion. */ public EncodingChangeException (String message) { super(message); } } Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** IteratorImpl.java 2 Jan 2004 16:24:58 -0000 1.39 --- IteratorImpl.java 10 Jan 2004 15:23:33 -0000 1.40 *************** *** 64,69 **** * Get the next node. * @return The next node in the HTML stream, or null if there are no more nodes. */ ! public Node nextNode() throws ParserException { Tag tag; --- 64,70 ---- * Get the next node. * @return The next node in the HTML stream, or null if there are no more nodes. + * @exception ParserException If an unrecoverable error occurs. */ ! public Node nextNode () throws ParserException { Tag tag; *************** *** 95,107 **** } } catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer(); ! 
msgBuffer.append("Unexpected Exception occurred while reading "); ! msgBuffer.append(mLexer.getPage ().getUrl ()); ! msgBuffer.append(", in nextHTMLNode"); ! // reader.appendLineDetails(msgBuffer); ! ParserException ex = new ParserException(msgBuffer.toString(),e); ! mFeedback.error(msgBuffer.toString(),ex); throw ex; } --- 96,112 ---- } } + catch (ParserException pe) + { + throw pe; // no need to wrap an existing ParserException + } catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer (); ! msgBuffer.append ("Unexpected Exception occurred while reading "); ! msgBuffer.append (mLexer.getPage ().getUrl ()); ! msgBuffer.append (", in nextNode"); ! // TODO: appendLineDetails (msgBuffer); ! ParserException ex = new ParserException (msgBuffer.toString (), e); ! mFeedback.error (msgBuffer.toString (), ex); throw ex; } |
From: <der...@us...> - 2004-01-10 15:23:36
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans In directory sc8-pr-cvs1:/tmp/cvs-serv3574/beans Modified Files: StringBean.java Log Message: Fix bug #874175 StringBean doesn't handle charset change well Add EncodingChangeException to distinguish a recoverable character set change occurring after the lexer has already coughed up some characters using the wrong encoding. Added testEncodingChange in LexerTests to exercise it. Changed IteratorImpl to not wrap a ParserException with another ParserException. Changed StringBean to retry the URL when an encoding change exception is caught. Index: StringBean.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v retrieving revision 1.35 retrieving revision 1.36 diff -C2 -d -r1.35 -r1.36 *** StringBean.java 2 Jan 2004 16:24:53 -0000 1.35 --- StringBean.java 10 Jan 2004 15:23:33 -0000 1.36 *************** *** 37,40 **** --- 37,41 ---- import org.htmlparser.tags.Tag; import org.htmlparser.util.ParserException; + import org.htmlparser.util.EncodingChangeException; import org.htmlparser.util.Translate; import org.htmlparser.visitors.NodeVisitor; *************** *** 306,309 **** --- 307,330 ---- } } + catch (EncodingChangeException ece) + { + mIsPre = false; + mIsScript = false; + try + { // try again with the encoding now in force + mParser.reset (); + mBuffer = new StringBuffer (4096); + mParser.visitAllNodesWith (this); + updateStrings (mBuffer.toString ()); + } + catch (ParserException pe) + { + updateStrings (pe.toString ()); + } + finally + { + mBuffer = null; + } + } catch (ParserException pe) { |