htmlparser-cvs Mailing List for HTML Parser (Page 23)
Brought to you by:
derrickoswald
You can subscribe to this list here.
| 2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
(141) |
Jun
(108) |
Jul
(66) |
Aug
(127) |
Sep
(155) |
Oct
(149) |
Nov
(72) |
Dec
(72) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2004 |
Jan
(100) |
Feb
(36) |
Mar
(21) |
Apr
(3) |
May
(87) |
Jun
(28) |
Jul
(84) |
Aug
(5) |
Sep
(14) |
Oct
|
Nov
|
Dec
|
| 2005 |
Jan
(1) |
Feb
(39) |
Mar
(26) |
Apr
(38) |
May
(14) |
Jun
(10) |
Jul
|
Aug
|
Sep
(13) |
Oct
(8) |
Nov
(10) |
Dec
|
| 2006 |
Jan
|
Feb
(1) |
Mar
(17) |
Apr
(20) |
May
(28) |
Jun
(24) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
| 2015 |
Jan
|
Feb
|
Mar
(1) |
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
|
From: <der...@us...> - 2004-01-19 23:14:26
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser/parserapplications
Modified Files:
SiteCapturer.java
Log Message:
Update version to 1.4-20040119.
Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** SiteCapturer.java 14 Jan 2004 02:53:46 -0000 1.4
--- SiteCapturer.java 19 Jan 2004 23:14:18 -0000 1.5
***************
*** 41,50 ****
--- 41,55 ----
import javax.swing.JOptionPane;
+ import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
+ import org.htmlparser.filters.AndFilter;
+ import org.htmlparser.filters.HasAttributeFilter;
+ import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
+ import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
***************
*** 53,57 ****
/**
* Save a web site locally.
! * Illustrative prgram to save a web site contents locally.
* It was created to demonstrate URL rewriting in it's simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
--- 58,62 ----
/**
* Save a web site locally.
! * Illustrative program to save a web site contents locally.
* It was created to demonstrate URL rewriting in it's simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 124,127 ****
--- 129,137 ----
/**
+ * The filter to apply to the nodes retrieved.
+ */
+ protected NodeFilter mFilter;
+
+ /**
* Copy buffer size.
* Resources are moved to disk in chunks this size or less.
***************
*** 136,139 ****
--- 146,151 ----
PrototypicalNodeFactory factory;
+ mSource = null;
+ mTarget = null;
mPages = new ArrayList ();
mFinished = new HashSet ();
***************
*** 147,150 ****
--- 159,164 ----
factory.registerTag (new LocalImageTag ());
mParser.setNodeFactory (factory);
+ mCaptureResources = true;
+ mFilter = null;
}
***************
*** 213,216 ****
--- 227,249 ----
}
+
+ /** Getter for property filter.
+ * @return Value of property filter.
+ *
+ */
+ public NodeFilter getFilter ()
+ {
+ return (mFilter);
+ }
+
+ /** Setter for property filter.
+ * @param filter New value of property filter.
+ *
+ */
+ public void setFilter (NodeFilter filter)
+ {
+ mFilter = filter;
+ }
+
/**
* Returns <code>true</code> if the link is one we are interested in.
***************
*** 281,285 ****
String ret;
! if (link.equals (getSource ()))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
--- 314,318 ----
String ret;
! if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
***************
*** 382,391 ****
* Process a single page.
*/
! protected void process ()
throws
ParserException
{
String url;
NodeList list;
File file;
File dir;
--- 415,428 ----
* Process a single page.
*/
! protected void process (NodeFilter filter)
throws
ParserException
{
String url;
+ int bookmark;
NodeList list;
+ NodeList robots;
+ MetaTag robot;
+ String content;
File file;
File dir;
***************
*** 398,402 ****
try
! { // fetch the page and gather the list of nodes
mParser.setURL (url);
list = new NodeList ();
--- 435,441 ----
try
! {
! bookmark = mPages.size ();
! // fetch the page and gather the list of nodes
mParser.setURL (url);
list = new NodeList ();
***************
*** 404,407 ****
--- 443,468 ----
list.add (e.nextNode ()); // URL conversion occurs in the tags
+ // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
+ // <meta name="robots" content="index,follow" />
+ // <meta name="robots" content="noindex,nofollow" />
+ robots = list.extractAllNodesThatMatch (
+ new AndFilter (
+ new NodeClassFilter (MetaTag.class),
+ new HasAttributeFilter ("name", "robots")), true);
+ if (0 != robots.size ())
+ {
+ robot = (MetaTag)robots.elementAt (0);
+ content = robot.getAttribute ("content").toLowerCase ();
+ if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
+ // reset mPages
+ for (int i = bookmark; i < mPages.size (); i++)
+ mPages.remove (i);
+ if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
+ return;
+ }
+
+ if (null != filter)
+ list.keepAllNodesThatMatch (filter, true);
+
// save the page locally
file = new File (getTarget (), makeLocalLink (url, ""));
***************
*** 409,412 ****
--- 470,481 ----
if (!dir.exists ())
dir.mkdirs ();
+ else if (!dir.isDirectory ())
+ {
+ dir = new File (dir.getParentFile (), dir.getName () + ".content");
+ if (!dir.exists ())
+ dir.mkdirs ();
+ file = new File (dir, file.getName ());
+ }
+
try
{
***************
*** 581,585 ****
try
{
! process ();
while (0 != mImages.size ())
copy ();
--- 650,654 ----
try
{
! process (getFilter ());
while (0 != mImages.size ())
copy ();
|
|
From: <der...@us...> - 2004-01-19 23:14:25
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1:/tmp/cvs-serv32229/docs Modified Files: release.txt Log Message: Update version to 1.4-20040119. Index: release.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/release.txt,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** release.txt 4 Jan 2004 19:03:35 -0000 1.53 --- release.txt 19 Jan 2004 23:14:17 -0000 1.54 *************** *** 1,3 **** ! HTMLParser Version 1.4 (Integration Build Jan 04, 2004) ********************************************* --- 1,3 ---- ! HTMLParser Version 1.4 (Integration Build Jan 19, 2004) ********************************************* |
|
From: <der...@us...> - 2004-01-19 23:14:25
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser
In directory sc8-pr-cvs1:/tmp/cvs-serv32229/src/org/htmlparser
Modified Files:
Parser.java
Log Message:
Update version to 1.4-20040119.
Index: Parser.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v
retrieving revision 1.83
retrieving revision 1.84
diff -C2 -d -r1.83 -r1.84
*** Parser.java 14 Jan 2004 02:53:46 -0000 1.83
--- Parser.java 19 Jan 2004 23:14:18 -0000 1.84
***************
*** 85,89 ****
*/
public final static String
! VERSION_DATE = "Jan 04, 2004"
;
--- 85,89 ----
*/
public final static String
! VERSION_DATE = "Jan 19, 2004"
;
|
|
From: <der...@us...> - 2004-01-19 23:13:08
|
Update of /cvsroot/htmlparser/htmlparser/docs In directory sc8-pr-cvs1:/tmp/cvs-serv31736 Modified Files: changes.txt Log Message: Update version to 1.4-20040119. Fix missed application file. Index: changes.txt =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/docs/changes.txt,v retrieving revision 1.194 retrieving revision 1.195 diff -C2 -d -r1.194 -r1.195 *** changes.txt 4 Jan 2004 19:03:35 -0000 1.194 --- changes.txt 19 Jan 2004 23:13:05 -0000 1.195 *************** *** 13,16 **** --- 13,95 ---- ******************************************************************************* + Integration Build 1.4 - 20040119 + -------------------------------- + + 2004-01-19 17:44 derrickoswald + + * src/org/htmlparser/tags/CompositeTag.java: + + Fix CompositeTag.toString() which caused java.lang.StackOverflowError for tags of the form <td width="69"/>. + In this case the end tag is 'this' tag which wasn't handled by the output code. + Added testXMLTypeToString() to ParserTest. 
+ + 2004-01-13 21:53 derrickoswald + + * build.xml, src/doc-files/todo.html, + src/org/htmlparser/Parser.java, + src/org/htmlparser/PrototypicalNodeFactory.java, + src/org/htmlparser/RemarkNode.java, + src/org/htmlparser/StringNode.java, + src/org/htmlparser/lexer/nodes/TagNode.java, + src/org/htmlparser/parserapplications/SiteCapturer.java, + src/org/htmlparser/scanners/CompositeTagScanner.java, + src/org/htmlparser/scanners/ScriptScanner.java, + src/org/htmlparser/tags/BaseHrefTag.java, + src/org/htmlparser/tags/FormTag.java, + src/org/htmlparser/tags/FrameTag.java, + src/org/htmlparser/tags/ImageTag.java, + src/org/htmlparser/tags/JspTag.java, + src/org/htmlparser/tags/SelectTag.java, + src/org/htmlparser/tags/Tag.java, + src/org/htmlparser/tags/TextareaTag.java, + src/org/htmlparser/tests/FunctionalTests.java, + src/org/htmlparser/tests/ParserTest.java, + src/org/htmlparser/tests/ParserTestCase.java, + src/org/htmlparser/tests/lexerTests/AttributeTests.java, + src/org/htmlparser/tests/lexerTests/KitTest.java, + src/org/htmlparser/tests/lexerTests/LexerTests.java, + src/org/htmlparser/tests/lexerTests/PageTests.java, + src/org/htmlparser/tests/lexerTests/SourceTests.java, + src/org/htmlparser/tests/lexerTests/StreamTests.java, + src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java, + src/org/htmlparser/tests/scannersTests/JspScannerTest.java, + src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java, + src/org/htmlparser/tests/scannersTests/TagScannerTest.java, + src/org/htmlparser/tests/tagTests/BaseHrefTagTest.java, + src/org/htmlparser/tests/tagTests/BulletTagTest.java, + src/org/htmlparser/tests/tagTests/ImageTagTest.java, + src/org/htmlparser/tests/tagTests/JspTagTest.java, + src/org/htmlparser/tests/tagTests/LinkTagTest.java, + src/org/htmlparser/tests/tagTests/ScriptTagTest.java, + src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java, + src/org/htmlparser/util/ParserUtils.java, + src/org/htmlparser/util/sort/Sort.java, + 
src/org/htmlparser/visitors/HtmlPage.java, + src/org/htmlparser/visitors/TextExtractingVisitor.java: + + Remove unneeded imports. + + 2004-01-10 10:23 derrickoswald + + * src/org/htmlparser/: beans/StringBean.java, lexer/Page.java, + tests/lexerTests/LexerTests.java, + util/EncodingChangeException.java, util/IteratorImpl.java: + + Fix bug #874175 StringBean doesn't handle charset change well + Add EncodingChangeException to distinguish a recoverable character set change + occuring after the lexer has already coughed up some characters using the wrong + encoding. Added testEncodingChange in LexerTests to excercise it. + Changed IteratorImpl to not wrap a ParserException with another ParserException. + Changed StringBean to retry the URL when an encoding change exception is caught. + + 2004-01-09 19:06 derrickoswald + + * src/org/htmlparser/: filters/HasAttributeFilter.java, + parserapplications/SiteCapturer.java, + parserapplications/WikiCapturer.java, util/NodeList.java: + + First pass at the wiki capturer. + Added useful extensions to the HasAttributeFilter, SiteCapturer and NodeList + Integration Build 1.4 - 20040104 -------------------------------- |
|
From: <der...@us...> - 2004-01-19 22:45:02
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags
In directory sc8-pr-cvs1:/tmp/cvs-serv22765
Modified Files:
CompositeTag.java
Log Message:
Fix CompositeTag.toString() which caused java.lang.StackOverflowError for tags of the form <td width="69"/>.
In this case the end tag is 'this' tag which wasn't handled by the output code.
Added testXMLTypeToString() to ParserTest.
Index: CompositeTag.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/CompositeTag.java,v
retrieving revision 1.71
retrieving revision 1.72
diff -C2 -d -r1.71 -r1.72
*** CompositeTag.java 2 Jan 2004 16:24:54 -0000 1.71
--- CompositeTag.java 19 Jan 2004 22:44:59 -0000 1.72
***************
*** 484,488 ****
}
! if (null != getEndTag ())
// eliminate virtual tags
// if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ()))
--- 484,488 ----
}
! if ((null != getEndTag ()) && (this != getEndTag ())) // 2nd guard handles <tag/>
// eliminate virtual tags
// if (!(getEndTag ().getStartPosition () == getEndTag ().getEndPosition ()))
|
|
From: <der...@us...> - 2004-01-14 03:20:04
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests
In directory sc8-pr-cvs1:/tmp/cvs-serv32457
Modified Files:
CharacterTranslationTest.java
Log Message:
Index: CharacterTranslationTest.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** CharacterTranslationTest.java 14 Jan 2004 03:10:55 -0000 1.40
--- CharacterTranslationTest.java 14 Jan 2004 03:20:01 -0000 1.41
***************
*** 65,83 ****
}
! public void testInitialCharacterEntityReferenceWithoutSemi ()
! {
! assertEquals (
! "character entity reference without a semicolon at start of string doesn't work",
! "\u00f7 is the division sign.",
! Translate.decode ("&divide is the division sign."));
! }
!
! public void testInitialNumericCharacterReferenceWithoutSemi ()
! {
! assertEquals (
! "numeric character reference without a semicolon at start of string doesn't work",
! "\u00f7 is the division sign.",
! Translate.decode ("&#247 is the division sign."));
! }
public void testFinalCharacterEntityReference ()
--- 65,83 ----
}
! // public void testInitialCharacterEntityReferenceWithoutSemi ()
! // {
! // assertEquals (
! // "character entity reference without a semicolon at start of string doesn't work",
! // "\u00f7 is the division sign.",
! // Translate.decode ("&divide is the division sign."));
! // }
! //
! // public void testInitialNumericCharacterReferenceWithoutSemi ()
! // {
! // assertEquals (
! // "numeric character reference without a semicolon at start of string doesn't work",
! // "\u00f7 is the division sign.",
! // Translate.decode ("&#247 is the division sign."));
! // }
public void testFinalCharacterEntityReference ()
|
|
From: <der...@us...> - 2004-01-14 03:10:58
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests
In directory sc8-pr-cvs1:/tmp/cvs-serv31253
Modified Files:
CharacterTranslationTest.java
Log Message:
Index: CharacterTranslationTest.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** CharacterTranslationTest.java 14 Jan 2004 02:53:47 -0000 1.39
--- CharacterTranslationTest.java 14 Jan 2004 03:10:55 -0000 1.40
***************
*** 153,241 ****
}
! public byte[] encodedecode (byte[] bytes)
! throws
! IOException
! {
! InputStream in;
! ByteArrayOutputStream out;
!
! // encode
! in = new ByteArrayInputStream (bytes);
! out = new ByteArrayOutputStream ();
! Translate.encode (in, new PrintStream (out));
! in.close ();
! out.close ();
!
! // decode
! in = new ByteArrayInputStream (out.toByteArray ());
! out = new ByteArrayOutputStream ();
! Translate.decode (in, new PrintStream (out));
! in.close ();
! out.close ();
!
! return (out.toByteArray ());
! }
!
! public void check (byte[] reference, byte[] result)
! throws
! IOException
! {
! InputStream ref;
! InputStream in;
! int i;
! int i1;
! int i2;
!
! ref = new ByteArrayInputStream (reference);
! in = new ByteArrayInputStream (result);
! i = 0;
! do
! {
! i1 = ref.read ();
! i2 = in.read ();
! if (i1 != i2)
! fail ("byte difference detected at offset " + i);
! i++;
! }
! while (-1 != i1);
! ref.close ();
! in.close ();
! }
!
! // public void testInitialCharacterEntityReferenceCodec ()
// throws
// IOException
// {
! // byte[] data = "\u00f7 is the division sign.".getBytes ();
! // check (data, encodedecode (data));
// }
-
- public void testEncodeDecodePage () throws IOException
- {
- URL url;
- URLConnection connection;
- InputStream in;
- ByteArrayOutputStream out;
- byte[] bytes;
- byte[] result;
- int c;
-
- // get some bytes
- url = new URL ("http://sourceforge.net/projects/htmlparser");
- connection = url.openConnection ();
- in = connection.getInputStream ();
- out = new ByteArrayOutputStream ();
- while (-1 != (c = in.read ()))
- out.write (c);
- in.close ();
- out.close ();
- bytes = out.toByteArray ();
-
- // run it through
- result = encodedecode (bytes);
-
- // check
- check (bytes, result);
- }
}
--- 153,241 ----
}
! // public byte[] encodedecode (byte[] bytes)
// throws
// IOException
// {
! // InputStream in;
! // ByteArrayOutputStream out;
! //
! // // encode
! // in = new ByteArrayInputStream (bytes);
! // out = new ByteArrayOutputStream ();
! // Translate.encode (in, new PrintStream (out));
! // in.close ();
! // out.close ();
! //
! // // decode
! // in = new ByteArrayInputStream (out.toByteArray ());
! // out = new ByteArrayOutputStream ();
! // Translate.decode (in, new PrintStream (out));
! // in.close ();
! // out.close ();
! //
! // return (out.toByteArray ());
! // }
! //
! // public void check (byte[] reference, byte[] result)
! // throws
! // IOException
! // {
! // InputStream ref;
! // InputStream in;
! // int i;
! // int i1;
! // int i2;
! //
! // ref = new ByteArrayInputStream (reference);
! // in = new ByteArrayInputStream (result);
! // i = 0;
! // do
! // {
! // i1 = ref.read ();
! // i2 = in.read ();
! // if (i1 != i2)
! // fail ("byte difference detected at offset " + i);
! // i++;
! // }
! // while (-1 != i1);
! // ref.close ();
! // in.close ();
! // }
! //
! //// public void testInitialCharacterEntityReferenceCodec ()
! //// throws
! //// IOException
! //// {
! //// byte[] data = "\u00f7 is the division sign.".getBytes ();
! //// check (data, encodedecode (data));
! //// }
! //
! // public void testEncodeDecodePage () throws IOException
! // {
! // URL url;
! // URLConnection connection;
! // InputStream in;
! // ByteArrayOutputStream out;
! // byte[] bytes;
! // byte[] result;
! // int c;
! //
! // // get some bytes
! // url = new URL ("http://sourceforge.net/projects/htmlparser");
! // connection = url.openConnection ();
! // in = connection.getInputStream ();
! // out = new ByteArrayOutputStream ();
! // while (-1 != (c = in.read ()))
! // out.write (c);
! // in.close ();
! // out.close ();
! // bytes = out.toByteArray ();
! //
! // // run it through
! // result = encodedecode (bytes);
! //
! // // check
! // check (bytes, result);
// }
}
|
|
From: <der...@us...> - 2004-01-14 02:58:09
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv28895 Modified Files: InputTag.java Log Message: Remove unneeded imports. Index: InputTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/InputTag.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** InputTag.java 2 Jan 2004 16:24:55 -0000 1.34 --- InputTag.java 14 Jan 2004 02:58:06 -0000 1.35 *************** *** 27,32 **** package org.htmlparser.tags; - import org.htmlparser.util.ParserUtils; - /** * An input tag in a form. --- 27,30 ---- |
|
From: <der...@us...> - 2004-01-14 02:54:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/sort
In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/util/sort
Modified Files:
Sort.java
Log Message:
Index: Sort.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/sort/Sort.java,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** Sort.java 2 Jan 2004 16:24:58 -0000 1.11
--- Sort.java 14 Jan 2004 02:53:47 -0000 1.12
***************
*** 485,488 ****
--- 485,489 ----
/**
* Binary search for an object
+ * @param vector The vector of <code>Ordered</code> objects.
* @param ref The name to search for.
* @return The index at which reference was found or is to be inserted.
***************
*** 492,495 ****
--- 493,549 ----
return (bsearch (vector, ref, 0, vector.size () - 1));
}
+
+ /**
+ * Binary search for an object
+ * @param array The array of <code>Ordered</code> objects.
+ * @param ref The name to search for.
+ * @param lo The lower index within which to look.
+ * @param hi The upper index within which to look.
+ * @return The index at which reference was found or is to be inserted.
+ */
+ public static int bsearch (Ordered[] array, Ordered ref, int lo, int hi)
+ { int num;
+ int mid;
+ int half;
+ int result;
+ int ret;
+
+ ret = -1;
+
+ num = (hi - lo) + 1;
+ while ((-1 == ret) && (lo <= hi))
+ {
+ half = num / 2;
+ mid = lo + ((0 != (num & 1)) ? half : half - 1);
+ result = ref.compare (array[mid]);
+ if (0 == result)
+ ret = mid;
+ else if (0 > result)
+ {
+ hi = mid - 1;
+ num = ((0 != (num & 1)) ? half : half - 1);
+ }
+ else
+ {
+ lo = mid + 1;
+ num = half;
+ }
+ }
+ if (-1 == ret)
+ ret = lo;
+
+ return (ret);
+ }
+
+ /**
+ * Binary search for an object
+ * @param array The array of <code>Ordered</code> objects.
+ * @param ref The name to search for.
+ * @return The index at which reference was found or is to be inserted.
+ */
+ public static int bsearch (Ordered[] array, Ordered ref)
+ {
+ return (bsearch (array, ref, 0, array.length - 1));
+ }
}
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests
In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/lexerTests
Modified Files:
AttributeTests.java KitTest.java LexerTests.java
PageTests.java SourceTests.java StreamTests.java
Log Message:
Index: AttributeTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/AttributeTests.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** AttributeTests.java 2 Jan 2004 16:24:55 -0000 1.10
--- AttributeTests.java 14 Jan 2004 02:53:47 -0000 1.11
***************
*** 29,36 ****
import java.util.Hashtable;
import java.util.Vector;
- import junit.framework.TestSuite;
import org.htmlparser.Node;
- import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.lexer.nodes.Attribute;
--- 29,34 ----
Index: KitTest.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/KitTest.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** KitTest.java 20 Oct 2003 01:28:03 -0000 1.5
--- KitTest.java 14 Jan 2004 02:53:47 -0000 1.6
***************
*** 31,38 ****
import java.util.Vector;
import javax.swing.text.BadLocationException;
- import javax.swing.text.Document;
- import javax.swing.text.EditorKit;
import javax.swing.text.Element;
- import javax.swing.text.ElementIterator;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
--- 31,35 ----
***************
*** 43,47 ****
import org.htmlparser.lexer.Cursor;
import org.htmlparser.lexer.Lexer;
- import org.htmlparser.lexer.Page;
import org.htmlparser.AbstractNode;
import org.htmlparser.lexer.nodes.Attribute;
--- 40,43 ----
***************
*** 608,611 ****
--- 604,610 ----
*
* $Log$
+ * Revision 1.6 2004/01/14 02:53:47 derrickoswald
+ * *** empty log message ***
+ *
* Revision 1.5 2003/10/20 01:28:03 derrickoswald
* Removed lexer level AbstractNode.
Index: LexerTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/LexerTests.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -d -r1.16 -r1.17
*** LexerTests.java 10 Jan 2004 15:23:33 -0000 1.16
--- LexerTests.java 14 Jan 2004 02:53:47 -0000 1.17
***************
*** 27,39 ****
package org.htmlparser.tests.lexerTests;
- import java.io.BufferedReader;
- import java.io.ByteArrayInputStream;
import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.StringReader;
- import java.io.UnsupportedEncodingException;
import java.net.URL;
- import java.net.URLConnection;
import java.util.HashSet;
--- 27,32 ----
***************
*** 41,48 ****
import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
- import org.htmlparser.lexer.Page;
- import org.htmlparser.lexer.PageIndex;
- import org.htmlparser.lexer.Source;
- import org.htmlparser.lexer.Stream;
import org.htmlparser.lexer.nodes.RemarkNode;
import org.htmlparser.lexer.nodes.StringNode;
--- 34,37 ----
***************
*** 52,56 ****
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
- import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;
--- 41,44 ----
***************
*** 621,629 ****
* causes spurious tags.
* The root cause is characters bracketed by [esc]$B and [esc](J (contrary
! * to what is indicated in the j_s_nightingale analysis of the problem) that
* sometimes have an angle bracket (< or 0x3c) embedded in them. These
* are taken to be tags by the parser, instead of being considered strings.
* <p>
! * The URL http://www.009.com/ has an ISO-8859-1 encoding (the default), but
* Japanese characters intermixed on the page with English, using the JIS
* encoding. We detect failure by looking for weird tag names which were
--- 609,617 ----
* causes spurious tags.
* The root cause is characters bracketed by [esc]$B and [esc](J (contrary
! * to what is indicated in then j_s_nightingale analysis of the problem) that
* sometimes have an angle bracket (< or 0x3c) embedded in them. These
* are taken to be tags by the parser, instead of being considered strings.
* <p>
! * The URL refrenced has an ISO-8859-1 encoding (the default), but
* Japanese characters intermixed on the page with English, using the JIS
* encoding. We detect failure by looking for weird tag names which were
***************
*** 667,671 ****
NodeIterator iterator;
! parser = new Parser ("http://htmlparser.sourceforge.net/test/www_009_com.html");
iterator = parser.elements ();
while (iterator.hasMoreNodes ())
--- 655,659 ----
NodeIterator iterator;
! parser = new Parser ("http://www.009.com/");
iterator = parser.elements ();
while (iterator.hasMoreNodes ())
***************
*** 746,784 ****
}
- /**
- * See bug #874175 StringBean doesn't handle charset change well
- * Force an encoding change exception, reset and re-read.
- */
- public void testEncodingChange ()
- throws
- ParserException
- {
- NodeIterator iterator;
- Node node;
- boolean success;
-
- parser = new Parser ("http://htmlparser.sourceforge.net/test/www_china-pub_com.html");
- success = false;
- try
- {
- for (iterator = parser.elements (); iterator.hasMoreNodes (); )
- node = iterator.nextNode ();
- }
- catch (EncodingChangeException ece)
- {
- success = true;
- try
- {
- parser.reset ();
- for (iterator = parser.elements (); iterator.hasMoreNodes (); )
- node = iterator.nextNode ();
- }
- catch (ParserException pe)
- {
- success = false;
- }
- }
- assertTrue ("encoding change failed", success);
- }
}
--- 734,737 ----
Index: PageTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/PageTests.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** PageTests.java 2 Jan 2004 16:24:56 -0000 1.15
--- PageTests.java 14 Jan 2004 02:53:47 -0000 1.16
***************
*** 28,32 ****
import java.io.IOException;
- import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
--- 28,31 ----
Index: SourceTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/SourceTests.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** SourceTests.java 2 Jan 2004 16:24:56 -0000 1.15
--- SourceTests.java 14 Jan 2004 02:53:47 -0000 1.16
***************
*** 30,40 ****
import java.io.ByteArrayInputStream;
import java.io.IOException;
- import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.Random;
import org.htmlparser.lexer.Stream;
--- 30,37 ----
Index: StreamTests.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/lexerTests/StreamTests.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** StreamTests.java 2 Jan 2004 16:24:56 -0000 1.15
--- StreamTests.java 14 Jan 2004 02:53:47 -0000 1.16
***************
*** 30,39 ****
import java.io.ByteArrayInputStream;
import java.io.IOException;
- import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
- import java.util.Random;
import org.htmlparser.lexer.Stream;
--- 30,37 ----
|
|
From: <der...@us...> - 2004-01-14 02:54:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests
In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/utilTests
Modified Files:
CharacterTranslationTest.java
Log Message:
Index: CharacterTranslationTest.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** CharacterTranslationTest.java 2 Jan 2004 16:24:57 -0000 1.38
--- CharacterTranslationTest.java 14 Jan 2004 02:53:47 -0000 1.39
***************
*** 27,30 ****
--- 27,37 ----
package org.htmlparser.tests.utilTests;
+ import java.io.ByteArrayInputStream;
+ import java.io.ByteArrayOutputStream;
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.io.PrintStream;
+ import java.net.URL;
+ import java.net.URLConnection;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.Translate;
***************
*** 63,67 ****
"character entity reference without a semicolon at start of string doesn't work",
"\u00f7 is the division sign.",
! Translate.decode ("÷ is the division sign."));
}
--- 70,74 ----
"character entity reference without a semicolon at start of string doesn't work",
"\u00f7 is the division sign.",
! Translate.decode ("&divide is the division sign."));
}
***************
*** 71,75 ****
"numeric character reference without a semicolon at start of string doesn't work",
"\u00f7 is the division sign.",
! Translate.decode ("÷ is the division sign."));
}
--- 78,82 ----
"numeric character reference without a semicolon at start of string doesn't work",
"\u00f7 is the division sign.",
! Translate.decode ("&#247 is the division sign."));
}
***************
*** 145,148 ****
--- 152,241 ----
Translate.encode ("<a href=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>"));
}
+
+ public byte[] encodedecode (byte[] bytes)
+ throws
+ IOException
+ {
+ InputStream in;
+ ByteArrayOutputStream out;
+
+ // encode
+ in = new ByteArrayInputStream (bytes);
+ out = new ByteArrayOutputStream ();
+ Translate.encode (in, new PrintStream (out));
+ in.close ();
+ out.close ();
+
+ // decode
+ in = new ByteArrayInputStream (out.toByteArray ());
+ out = new ByteArrayOutputStream ();
+ Translate.decode (in, new PrintStream (out));
+ in.close ();
+ out.close ();
+
+ return (out.toByteArray ());
+ }
+
+ public void check (byte[] reference, byte[] result)
+ throws
+ IOException
+ {
+ InputStream ref;
+ InputStream in;
+ int i;
+ int i1;
+ int i2;
+
+ ref = new ByteArrayInputStream (reference);
+ in = new ByteArrayInputStream (result);
+ i = 0;
+ do
+ {
+ i1 = ref.read ();
+ i2 = in.read ();
+ if (i1 != i2)
+ fail ("byte difference detected at offset " + i);
+ i++;
+ }
+ while (-1 != i1);
+ ref.close ();
+ in.close ();
+ }
+
+ // public void testInitialCharacterEntityReferenceCodec ()
+ // throws
+ // IOException
+ // {
+ // byte[] data = "\u00f7 is the division sign.".getBytes ();
+ // check (data, encodedecode (data));
+ // }
+
+ public void testEncodeDecodePage () throws IOException
+ {
+ URL url;
+ URLConnection connection;
+ InputStream in;
+ ByteArrayOutputStream out;
+ byte[] bytes;
+ byte[] result;
+ int c;
+
+ // get some bytes
+ url = new URL ("http://sourceforge.net/projects/htmlparser");
+ connection = url.openConnection ();
+ in = connection.getInputStream ();
+ out = new ByteArrayOutputStream ();
+ while (-1 != (c = in.read ()))
+ out.write (c);
+ in.close ();
+ out.close ();
+ bytes = out.toByteArray ();
+
+ // run it through
+ result = encodedecode (bytes);
+
+ // check
+ check (bytes, result);
+ }
}
|
|
From: <der...@us...> - 2004-01-14 02:54:01
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/visitors Modified Files: HtmlPage.java TextExtractingVisitor.java Log Message: Index: HtmlPage.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/HtmlPage.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** HtmlPage.java 2 Jan 2004 16:24:58 -0000 1.41 --- HtmlPage.java 14 Jan 2004 02:53:47 -0000 1.42 *************** *** 27,34 **** package org.htmlparser.visitors; - import org.htmlparser.Node; import org.htmlparser.Parser; - import org.htmlparser.RemarkNode; - import org.htmlparser.StringNode; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.TableTag; --- 27,31 ---- Index: TextExtractingVisitor.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/visitors/TextExtractingVisitor.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** TextExtractingVisitor.java 2 Jan 2004 16:24:58 -0000 1.39 --- TextExtractingVisitor.java 14 Jan 2004 02:53:47 -0000 1.40 *************** *** 29,33 **** import org.htmlparser.StringNode; import org.htmlparser.tags.Tag; - import org.htmlparser.tags.TitleTag; import org.htmlparser.util.Translate; --- 29,32 ---- |
|
From: <der...@us...> - 2004-01-14 02:54:00
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests Modified Files: FunctionalTests.java ParserTest.java ParserTestCase.java Log Message: Index: FunctionalTests.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/FunctionalTests.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** FunctionalTests.java 2 Jan 2004 16:24:55 -0000 1.53 --- FunctionalTests.java 14 Jan 2004 02:53:47 -0000 1.54 *************** *** 29,39 **** import java.io.BufferedReader; import java.io.IOException; - import java.io.InputStream; - import java.io.InputStreamReader; - import java.io.Reader; - import java.net.MalformedURLException; - import java.net.URL; - import junit.framework.TestCase; import junit.framework.TestSuite; --- 29,33 ---- *************** *** 43,47 **** import org.htmlparser.tags.ImageTag; import org.htmlparser.util.DefaultParserFeedback; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; --- 37,40 ---- Index: ParserTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTest.java,v retrieving revision 1.53 retrieving revision 1.54 diff -C2 -d -r1.53 -r1.54 *** ParserTest.java 2 Jan 2004 16:24:55 -0000 1.53 --- ParserTest.java 14 Jan 2004 02:53:47 -0000 1.54 *************** *** 36,40 **** import java.net.URL; import java.net.URLConnection; - import java.util.Map; import org.htmlparser.AbstractNode; --- 36,39 ---- *************** *** 47,51 **** import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; - import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.ImageTag; --- 46,49 ---- Index: ParserTestCase.java 
=================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/ParserTestCase.java,v retrieving revision 1.43 retrieving revision 1.44 diff -C2 -d -r1.43 -r1.44 *** ParserTestCase.java 2 Jan 2004 16:24:55 -0000 1.43 --- ParserTestCase.java 14 Jan 2004 02:53:47 -0000 1.44 *************** *** 27,36 **** package org.htmlparser.tests; - import java.io.BufferedReader; - import java.io.StringReader; import java.util.Enumeration; import java.util.Iterator; import java.util.Properties; - import java.util.Vector; import junit.framework.TestCase; --- 27,33 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/scannersTests Modified Files: CompositeTagScannerTest.java JspScannerTest.java ScriptScannerTest.java TagScannerTest.java Log Message: Index: CompositeTagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/CompositeTagScannerTest.java,v retrieving revision 1.58 retrieving revision 1.59 diff -C2 -d -r1.58 -r1.59 *** CompositeTagScannerTest.java 2 Jan 2004 16:24:56 -0000 1.58 --- CompositeTagScannerTest.java 14 Jan 2004 02:53:47 -0000 1.59 *************** *** 27,36 **** package org.htmlparser.tests.scannersTests; - import java.util.Vector; import org.htmlparser.AbstractNode; import org.htmlparser.Node; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.StringNode; - import org.htmlparser.lexer.Page; import org.htmlparser.scanners.CompositeTagScanner; import org.htmlparser.tags.CompositeTag; --- 27,34 ---- *************** *** 42,46 **** import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; --- 40,43 ---- Index: JspScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/JspScannerTest.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** JspScannerTest.java 2 Jan 2004 16:24:56 -0000 1.36 --- JspScannerTest.java 14 Jan 2004 02:53:47 -0000 1.37 *************** *** 29,33 **** import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.scanners.JspScanner; import org.htmlparser.tags.JspTag; import org.htmlparser.tests.ParserTestCase; --- 29,32 ---- Index: ScriptScannerTest.java 
=================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/ScriptScannerTest.java,v retrieving revision 1.51 retrieving revision 1.52 diff -C2 -d -r1.51 -r1.52 *** ScriptScannerTest.java 2 Jan 2004 16:24:56 -0000 1.51 --- ScriptScannerTest.java 14 Jan 2004 02:53:47 -0000 1.52 *************** *** 31,35 **** import org.htmlparser.Node; import org.htmlparser.Parser; - import org.htmlparser.scanners.ScriptScanner; import org.htmlparser.tags.BodyTag; import org.htmlparser.tags.ScriptTag; --- 31,34 ---- Index: TagScannerTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/scannersTests/TagScannerTest.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** TagScannerTest.java 2 Jan 2004 16:24:56 -0000 1.39 --- TagScannerTest.java 14 Jan 2004 02:53:47 -0000 1.40 *************** *** 27,39 **** package org.htmlparser.tests.scannersTests; - import java.util.Vector; - import org.htmlparser.Node; - import org.htmlparser.Parser; - import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; - import org.htmlparser.scanners.TagScanner; import org.htmlparser.tags.Tag; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.NodeIterator; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; --- 27,32 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tests/tagTests Modified Files: BaseHrefTagTest.java BulletTagTest.java ImageTagTest.java JspTagTest.java LinkTagTest.java ScriptTagTest.java Log Message: Index: BaseHrefTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/BaseHrefTagTest.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** BaseHrefTagTest.java 2 Jan 2004 16:24:57 -0000 1.38 --- BaseHrefTagTest.java 14 Jan 2004 02:53:47 -0000 1.39 *************** *** 27,32 **** package org.htmlparser.tests.tagTests; - import java.util.Vector; - import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.tags.BaseHrefTag; --- 27,30 ---- Index: BulletTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/BulletTagTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** BulletTagTest.java 7 Dec 2003 23:41:43 -0000 1.1 --- BulletTagTest.java 14 Jan 2004 02:53:47 -0000 1.2 *************** *** 27,31 **** package org.htmlparser.tests.tagTests; - import org.htmlparser.Node; import org.htmlparser.tests.ParserTestCase; import org.htmlparser.tags.Bullet; --- 27,30 ---- Index: ImageTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/ImageTagTest.java,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** ImageTagTest.java 2 Jan 2004 16:24:57 -0000 1.41 --- ImageTagTest.java 14 Jan 2004 02:53:47 -0000 1.42 *************** *** 36,45 **** import org.htmlparser.tags.TableRow; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.LinkProcessor; import 
org.htmlparser.util.NodeIterator; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; - import org.htmlparser.util.SimpleNodeIterator; public class ImageTagTest extends ParserTestCase --- 36,42 ---- Index: JspTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/JspTagTest.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** JspTagTest.java 2 Jan 2004 16:24:57 -0000 1.42 --- JspTagTest.java 14 Jan 2004 02:53:47 -0000 1.43 *************** *** 29,33 **** import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.scanners.JspScanner; import org.htmlparser.tags.JspTag; import org.htmlparser.tags.Tag; --- 29,32 ---- Index: LinkTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/LinkTagTest.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** LinkTagTest.java 2 Jan 2004 16:24:57 -0000 1.45 --- LinkTagTest.java 14 Jan 2004 02:53:47 -0000 1.46 *************** *** 27,31 **** package org.htmlparser.tests.tagTests; - import java.util.Vector; import org.htmlparser.AbstractNode; import org.htmlparser.Node; --- 27,30 ---- Index: ScriptTagTest.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/tagTests/ScriptTagTest.java,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** ScriptTagTest.java 2 Jan 2004 16:24:57 -0000 1.42 --- ScriptTagTest.java 14 Jan 2004 02:53:47 -0000 1.43 *************** *** 29,37 **** import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; - import org.htmlparser.StringNode; import org.htmlparser.scanners.ScriptScanner; import 
org.htmlparser.tags.ScriptTag; import org.htmlparser.tests.ParserTestCase; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; --- 29,35 ---- |
|
From: <der...@us...> - 2004-01-14 02:54:00
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/util Modified Files: ParserUtils.java Log Message: Index: ParserUtils.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** ParserUtils.java 2 Jan 2004 16:24:58 -0000 1.38 --- ParserUtils.java 14 Jan 2004 02:53:47 -0000 1.39 *************** *** 27,39 **** package org.htmlparser.util; - import java.util.Enumeration; - import java.util.Hashtable; - import java.util.Map; - import org.htmlparser.Node; import org.htmlparser.NodeFilter; - import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; - import org.htmlparser.tags.Tag; public class ParserUtils --- 27,33 ---- |
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/tags Modified Files: BaseHrefTag.java FormTag.java FrameTag.java ImageTag.java JspTag.java SelectTag.java Tag.java TextareaTag.java Log Message: Index: BaseHrefTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/BaseHrefTag.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** BaseHrefTag.java 2 Jan 2004 16:24:54 -0000 1.36 --- BaseHrefTag.java 14 Jan 2004 02:53:46 -0000 1.37 *************** *** 28,32 **** import org.htmlparser.lexer.Page; - import org.htmlparser.util.LinkProcessor; import org.htmlparser.util.ParserException; --- 28,31 ---- Index: FormTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/FormTag.java,v retrieving revision 1.45 retrieving revision 1.46 diff -C2 -d -r1.45 -r1.46 *** FormTag.java 2 Jan 2004 16:24:54 -0000 1.45 --- FormTag.java 14 Jan 2004 02:53:46 -0000 1.46 *************** *** 28,32 **** import org.htmlparser.util.NodeList; - import org.htmlparser.util.ParserException; import org.htmlparser.util.SimpleNodeIterator; --- 28,31 ---- Index: FrameTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/FrameTag.java,v retrieving revision 1.34 retrieving revision 1.35 diff -C2 -d -r1.34 -r1.35 *** FrameTag.java 2 Jan 2004 16:24:54 -0000 1.34 --- FrameTag.java 14 Jan 2004 02:53:46 -0000 1.35 *************** *** 27,32 **** package org.htmlparser.tags; - import org.htmlparser.util.LinkProcessor; - /** * Identifies a frame tag --- 27,30 ---- Index: ImageTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/ImageTag.java,v retrieving revision 
1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** ImageTag.java 2 Jan 2004 16:24:55 -0000 1.40 --- ImageTag.java 14 Jan 2004 02:53:46 -0000 1.41 *************** *** 29,34 **** import java.util.Vector; import org.htmlparser.lexer.nodes.Attribute; - import org.htmlparser.lexer.nodes.TagNode; - import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserUtils; import org.htmlparser.visitors.NodeVisitor; --- 29,32 ---- Index: JspTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/JspTag.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** JspTag.java 2 Jan 2004 16:24:55 -0000 1.38 --- JspTag.java 14 Jan 2004 02:53:46 -0000 1.39 *************** *** 27,33 **** package org.htmlparser.tags; - import org.htmlparser.Node; - import org.htmlparser.util.SimpleNodeIterator; - /** * The JSP/ASP tags like <%...%> can be identified by this class. --- 27,30 ---- Index: SelectTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/SelectTag.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** SelectTag.java 2 Jan 2004 16:24:55 -0000 1.36 --- SelectTag.java 14 Jan 2004 02:53:46 -0000 1.37 *************** *** 27,34 **** package org.htmlparser.tags; - import org.htmlparser.Node; - import org.htmlparser.util.NodeList; - import org.htmlparser.util.ParserUtils; /** --- 27,31 ---- Index: Tag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/Tag.java,v retrieving revision 1.60 retrieving revision 1.61 diff -C2 -d -r1.60 -r1.61 *** Tag.java 2 Jan 2004 16:24:55 -0000 1.60 --- Tag.java 14 Jan 2004 02:53:46 -0000 1.61 *************** *** 32,36 **** import org.htmlparser.lexer.nodes.TagNode; import org.htmlparser.scanners.TagScanner; - import 
org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; --- 32,35 ---- Index: TextareaTag.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tags/TextareaTag.java,v retrieving revision 1.33 retrieving revision 1.34 diff -C2 -d -r1.33 -r1.34 *** TextareaTag.java 2 Jan 2004 16:24:55 -0000 1.33 --- TextareaTag.java 14 Jan 2004 02:53:46 -0000 1.34 *************** *** 27,32 **** package org.htmlparser.tags; - import org.htmlparser.util.ParserUtils; - /** * A text area tag within a form. --- 27,30 ---- |
|
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser Modified Files: Parser.java PrototypicalNodeFactory.java RemarkNode.java StringNode.java Log Message: Index: Parser.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/Parser.java,v retrieving revision 1.82 retrieving revision 1.83 diff -C2 -d -r1.82 -r1.83 *** Parser.java 4 Jan 2004 19:03:36 -0000 1.82 --- Parser.java 14 Jan 2004 02:53:46 -0000 1.83 *************** *** 33,40 **** import java.net.URL; import java.net.URLConnection; - import java.util.Hashtable; - import java.util.Iterator; - import java.util.Map; - import java.util.Vector; import org.htmlparser.filters.TagNameFilter; --- 33,36 ---- *************** *** 42,51 **** import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; - import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; - import org.htmlparser.nodeDecorators.DecodingNode; - import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; - import org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; - import org.htmlparser.tags.Tag; // temporarily import org.htmlparser.util.DefaultParserFeedback; import org.htmlparser.util.IteratorImpl; --- 38,42 ---- Index: PrototypicalNodeFactory.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/PrototypicalNodeFactory.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** PrototypicalNodeFactory.java 8 Dec 2003 13:13:58 -0000 1.2 --- PrototypicalNodeFactory.java 14 Jan 2004 02:53:46 -0000 1.3 *************** *** 34,41 **** import org.htmlparser.lexer.nodes.Attribute; import org.htmlparser.lexer.nodes.NodeFactory; - import org.htmlparser.nodeDecorators.DecodingNode; - import org.htmlparser.nodeDecorators.EscapeCharacterRemovingNode; - import 
org.htmlparser.nodeDecorators.NonBreakingSpaceConvertingNode; - //import org.htmlparser.tags.Tag; import org.htmlparser.tags.*; // import everything for now import org.htmlparser.util.ParserException; --- 34,37 ---- Index: RemarkNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/RemarkNode.java,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** RemarkNode.java 2 Jan 2004 16:24:52 -0000 1.40 --- RemarkNode.java 14 Jan 2004 02:53:46 -0000 1.41 *************** *** 28,32 **** import org.htmlparser.lexer.Page; - import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; --- 28,31 ---- Index: StringNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/StringNode.java,v retrieving revision 1.48 retrieving revision 1.49 diff -C2 -d -r1.48 -r1.49 *** StringNode.java 2 Jan 2004 16:24:52 -0000 1.48 --- StringNode.java 14 Jan 2004 02:53:46 -0000 1.49 *************** *** 28,32 **** import org.htmlparser.lexer.Page; - import org.htmlparser.util.NodeList; import org.htmlparser.visitors.NodeVisitor; --- 28,31 ---- |
|
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/lexer/nodes Modified Files: TagNode.java Log Message: Index: TagNode.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/lexer/nodes/TagNode.java,v retrieving revision 1.27 retrieving revision 1.28 diff -C2 -d -r1.27 -r1.28 *** TagNode.java 2 Jan 2004 16:24:53 -0000 1.27 --- TagNode.java 14 Jan 2004 02:53:46 -0000 1.28 *************** *** 35,39 **** import org.htmlparser.lexer.Lexer; import org.htmlparser.lexer.Page; - import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.SpecialHashtable; --- 35,38 ---- |
|
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/scanners Modified Files: CompositeTagScanner.java ScriptScanner.java Log Message: Index: CompositeTagScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/CompositeTagScanner.java,v retrieving revision 1.84 retrieving revision 1.85 diff -C2 -d -r1.84 -r1.85 *** CompositeTagScanner.java 20 Dec 2003 23:47:55 -0000 1.84 --- CompositeTagScanner.java 14 Jan 2004 02:53:46 -0000 1.85 *************** *** 27,32 **** package org.htmlparser.scanners; - import java.util.HashSet; - import java.util.Set; import java.util.Vector; --- 27,30 ---- Index: ScriptScanner.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/scanners/ScriptScanner.java,v retrieving revision 1.54 retrieving revision 1.55 diff -C2 -d -r1.54 -r1.55 *** ScriptScanner.java 20 Dec 2003 23:47:55 -0000 1.54 --- ScriptScanner.java 14 Jan 2004 02:53:46 -0000 1.55 *************** *** 30,42 **** import org.htmlparser.Node; - import org.htmlparser.Parser; import org.htmlparser.PrototypicalNodeFactory; import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; import org.htmlparser.lexer.Lexer; - import org.htmlparser.lexer.Page; import org.htmlparser.lexer.nodes.NodeFactory; import org.htmlparser.tags.CompositeTag; - import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.NodeList; --- 30,39 ---- |
|
From: <der...@us...> - 2004-01-14 02:53:51
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications
In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/org/htmlparser/parserapplications
Modified Files:
SiteCapturer.java
Log Message:
Index: SiteCapturer.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/parserapplications/SiteCapturer.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** SiteCapturer.java 10 Jan 2004 00:06:03 -0000 1.3
--- SiteCapturer.java 14 Jan 2004 02:53:46 -0000 1.4
***************
*** 41,57 ****
import javax.swing.JOptionPane;
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.lexer.nodes.Attribute;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
--- 41,50 ----
***************
*** 60,64 ****
/**
* Save a web site locally.
! * Illustrative program to save a web site contents locally.
* It was created to demonstrate URL rewriting in it's simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
--- 53,57 ----
/**
* Save a web site locally.
! * Illustrative prgram to save a web site contents locally.
* It was created to demonstrate URL rewriting in it's simplest form.
* It uses customized tags in the NodeFactory to alter the URLs.
***************
*** 131,139 ****
/**
- * The filter to apply to the nodes retrieved.
- */
- protected NodeFilter mFilter;
-
- /**
* Copy buffer size.
* Resources are moved to disk in chunks this size or less.
--- 124,127 ----
***************
*** 148,153 ****
PrototypicalNodeFactory factory;
- mSource = null;
- mTarget = null;
mPages = new ArrayList ();
mFinished = new HashSet ();
--- 136,139 ----
***************
*** 161,166 ****
factory.registerTag (new LocalImageTag ());
mParser.setNodeFactory (factory);
- mCaptureResources = true;
- mFilter = null;
}
--- 147,150 ----
***************
*** 229,251 ****
}
-
- /** Getter for property filter.
- * @return Value of property filter.
- *
- */
- public NodeFilter getFilter ()
- {
- return (mFilter);
- }
-
- /** Setter for property filter.
- * @param filter New value of property filter.
- *
- */
- public void setFilter (NodeFilter filter)
- {
- mFilter = filter;
- }
-
/**
* Returns <code>true</code> if the link is one we are interested in.
--- 213,216 ----
***************
*** 316,320 ****
String ret;
! if (link.equals (getSource ()) || (!getSource ().endsWith ("/") && link.equals (getSource () + "/")))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
--- 281,285 ----
String ret;
! if (link.equals (getSource ()))
ret = "index.html"; // handle the root page specially
else if (link.startsWith (getSource ())
***************
*** 417,430 ****
* Process a single page.
*/
! protected void process (NodeFilter filter)
throws
ParserException
{
String url;
- int bookmark;
NodeList list;
- NodeList robots;
- MetaTag robot;
- String content;
File file;
File dir;
--- 382,391 ----
* Process a single page.
*/
! protected void process ()
throws
ParserException
{
String url;
NodeList list;
File file;
File dir;
***************
*** 437,443 ****
try
! {
! bookmark = mPages.size ();
! // fetch the page and gather the list of nodes
mParser.setURL (url);
list = new NodeList ();
--- 398,402 ----
try
! { // fetch the page and gather the list of nodes
mParser.setURL (url);
list = new NodeList ();
***************
*** 445,470 ****
list.add (e.nextNode ()); // URL conversion occurs in the tags
- // handle robots meta tag according to http://www.robotstxt.org/wc/meta-user.html
- // <meta name="robots" content="index,follow" />
- // <meta name="robots" content="noindex,nofollow" />
- robots = list.extractAllNodesThatMatch (
- new AndFilter (
- new NodeClassFilter (MetaTag.class),
- new HasAttributeFilter ("name", "robots")), true);
- if (0 != robots.size ())
- {
- robot = (MetaTag)robots.elementAt (0);
- content = robot.getAttribute ("content").toLowerCase ();
- if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("nofollow")))
- // reset mPages
- for (int i = bookmark; i < mPages.size (); i++)
- mPages.remove (i);
- if ((-1 != content.indexOf ("none")) || (-1 != content.indexOf ("noindex")))
- return;
- }
-
- if (null != filter)
- list.keepAllNodesThatMatch (filter, true);
-
// save the page locally
file = new File (getTarget (), makeLocalLink (url, ""));
--- 404,407 ----
***************
*** 472,483 ****
if (!dir.exists ())
dir.mkdirs ();
- else if (!dir.isDirectory ())
- {
- dir = new File (dir.getParentFile (), dir.getName () + ".content");
- if (!dir.exists ())
- dir.mkdirs ();
- file = new File (dir, file.getName ());
- }
-
try
{
--- 409,412 ----
***************
*** 652,656 ****
try
{
! process (getFilter ());
while (0 != mImages.size ())
copy ();
--- 581,585 ----
try
{
! process ();
while (0 != mImages.size ())
copy ();
|
|
From: <der...@us...> - 2004-01-14 02:53:50
|
Update of /cvsroot/htmlparser/htmlparser/src/doc-files In directory sc8-pr-cvs1:/tmp/cvs-serv28098/src/doc-files Modified Files: todo.html Log Message: Index: todo.html =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/doc-files/todo.html,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** todo.html 31 Dec 2003 02:50:49 -0000 1.3 --- todo.html 14 Jan 2004 02:53:46 -0000 1.4 *************** *** 9,18 **** <ul> <li> - It looks like there are enough bugs and requests to warrant another 1.3 point - release with some patched files. - I hate to work on a branch, but it may be the only way to get everyone off my - back. - </li> - <li> As of now, it's more likely that the javadocs are lying to you than providing any helpful advice. This needs to be reworked completely. --- 9,12 ---- *************** *** 52,60 **** </li> <li> - Rework all the applications for a better 'out of the box' experience for new - and novice users. Fix all the scripts in /bin (for unix and windows) and add - any others that don't exist already. - </li> - <li> The tag-enders and end-tag-enders lists are only a partial solution to the HTML specification for block and inline tags. By marking each tag as a block or --- 46,49 ---- *************** *** 71,79 **** </li> <li> - Change all the headers to match the new format. The integration process needs to - be revamped to use the $Name: CVS substitution (via 'get label'), so a checkin - isn't required every integration. - </li> - <li> The default is now the equivalent of the old 'RegisterDomTags', so the operation of the following mainlines needs to be revisited: --- 60,63 ---- |
|
From: <der...@us...> - 2004-01-14 02:53:49
|
Update of /cvsroot/htmlparser/htmlparser
In directory sc8-pr-cvs1:/tmp/cvs-serv28098
Modified Files:
build.xml
Log Message:
Index: build.xml
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/build.xml,v
retrieving revision 1.57
retrieving revision 1.58
diff -C2 -d -r1.57 -r1.58
*** build.xml 4 Jan 2004 03:23:08 -0000 1.57
--- build.xml 14 Jan 2004 02:53:46 -0000 1.58
***************
*** 237,240 ****
--- 237,241 ----
<include name="org/htmlparser/util/LinkProcessor.class"/>
<include name="org/htmlparser/util/Translate.class"/>
+ <include name="org/htmlparser/util/EncodingChangeException.class"/>
<include name="org/htmlparser/util/sort/**/*.class"/>
<include name="org/htmlparser/parserHelper/SpecialHashtable.class"/>
|
|
From: <der...@us...> - 2004-01-10 15:23:36
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util In directory sc8-pr-cvs1:/tmp/cvs-serv3574/util Modified Files: IteratorImpl.java Added Files: EncodingChangeException.java Log Message: Fix bug #874175 StringBean doesn't handle charset change well Add EncodingChangeException to distinguish a recoverable character set change occurring after the lexer has already coughed up some characters using the wrong encoding. Added testEncodingChange in LexerTests to exercise it. Changed IteratorImpl to not wrap a ParserException with another ParserException. Changed StringBean to retry the URL when an encoding change exception is caught. --- NEW FILE: EncodingChangeException.java --- // HTMLParser Library $Name: $ - A java-based parser for HTML // http://sourceforge.org/projects/htmlparser // Copyright (C) 2004 Claude Duguay // // Revision Control Information // // $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/EncodingChangeException.java,v $ // $Author: derrickoswald $ // $Date: 2004/01/10 15:23:33 $ // $Revision: 1.1 $ // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package org.htmlparser.util; /** * The encoding is changed invalidating already scanned characters. 
* When the encoding is changed, as for example when encountering a <META> * tag that includes a charset directive in the content attribute that * disagrees with the encoding specified by the HTTP header (or the default * encoding if none), the parser retraces the bytes it has interpreted so far * comparing the characters produced under the new encoding. If the new * characters differ from those it has already yielded to the application, it * throws this exception to indicate that processing should be restarted under * the new encoding. * This exception is the object thrown so that applications may distinguish * between an encoding change, which may be successfully cured by restarting * the parse from the beginning, from more serious errors. * @see IteratorImpl * @see ParserException **/ public class EncodingChangeException extends ParserException { /** * Create an exception indicative of a problematic encoding change. * @param message The message describing the error condition. */ public EncodingChangeException (String message) { super(message); } } Index: IteratorImpl.java =================================================================== RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/IteratorImpl.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** IteratorImpl.java 2 Jan 2004 16:24:58 -0000 1.39 --- IteratorImpl.java 10 Jan 2004 15:23:33 -0000 1.40 *************** *** 64,69 **** * Get the next node. * @return The next node in the HTML stream, or null if there are no more nodes. */ ! public Node nextNode() throws ParserException { Tag tag; --- 64,70 ---- * Get the next node. * @return The next node in the HTML stream, or null if there are no more nodes. + * @exception ParserException If an unrecoverable error occurs. */ ! public Node nextNode () throws ParserException { Tag tag; *************** *** 95,107 **** } } catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer(); ! 
msgBuffer.append("Unexpected Exception occurred while reading "); ! msgBuffer.append(mLexer.getPage ().getUrl ()); ! msgBuffer.append(", in nextHTMLNode"); ! // reader.appendLineDetails(msgBuffer); ! ParserException ex = new ParserException(msgBuffer.toString(),e); ! mFeedback.error(msgBuffer.toString(),ex); throw ex; } --- 96,112 ---- } } + catch (ParserException pe) + { + throw pe; // no need to wrap an existing ParserException + } catch (Exception e) { ! StringBuffer msgBuffer = new StringBuffer (); ! msgBuffer.append ("Unexpected Exception occurred while reading "); ! msgBuffer.append (mLexer.getPage ().getUrl ()); ! msgBuffer.append (", in nextNode"); ! // TODO: appendLineDetails (msgBuffer); ! ParserException ex = new ParserException (msgBuffer.toString (), e); ! mFeedback.error (msgBuffer.toString (), ex); throw ex; } |
|
From: <der...@us...> - 2004-01-10 15:23:36
|
Update of /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans
In directory sc8-pr-cvs1:/tmp/cvs-serv3574/beans
Modified Files:
StringBean.java
Log Message:
Fix bug #874175 StringBean doesn't handle charset change well
Add EncodingChangeException to distinguish a recoverable character set change
occurring after the lexer has already coughed up some characters using the wrong
encoding. Added testEncodingChange in LexerTests to exercise it.
Changed IteratorImpl to not wrap a ParserException with another ParserException.
Changed StringBean to retry the URL when an encoding change exception is caught.
Index: StringBean.java
===================================================================
RCS file: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/beans/StringBean.java,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** StringBean.java 2 Jan 2004 16:24:53 -0000 1.35
--- StringBean.java 10 Jan 2004 15:23:33 -0000 1.36
***************
*** 37,40 ****
--- 37,41 ----
import org.htmlparser.tags.Tag;
import org.htmlparser.util.ParserException;
+ import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;
***************
*** 306,309 ****
--- 307,330 ----
}
}
+ catch (EncodingChangeException ece)
+ {
+ mIsPre = false;
+ mIsScript = false;
+ try
+ { // try again with the encoding now in force
+ mParser.reset ();
+ mBuffer = new StringBuffer (4096);
+ mParser.visitAllNodesWith (this);
+ updateStrings (mBuffer.toString ());
+ }
+ catch (ParserException pe)
+ {
+ updateStrings (pe.toString ());
+ }
+ finally
+ {
+ mBuffer = null;
+ }
+ }
catch (ParserException pe)
{
|