From: <my...@us...> - 2009-11-03 22:25:19
|
Revision: 2111
          http://aperture.svn.sourceforge.net/aperture/?rev=2111&view=rev
Author:   mylka
Date:     2009-11-03 22:25:10 +0000 (Tue, 03 Nov 2009)

Log Message:
-----------
2881842 - applied the patch by Christian Spurk, the HtmlParserUtil uses the proper lexer Stream class from the htmlparser library - which works correctly with changing encodings in the middle of the file

Modified Paths:
--------------
    aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java

Modified: aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java
===================================================================
--- aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java 2009-11-03 21:51:38 UTC (rev 2110)
+++ aperture/trunk/core/helper/html/src/main/java/org/semanticdesktop/aperture/helper/html/HtmlParserUtil.java 2009-11-03 22:25:10 UTC (rev 2111)
@@ -20,6 +20,7 @@
 import org.htmlparser.lexer.InputStreamSource;
 import org.htmlparser.lexer.Lexer;
 import org.htmlparser.lexer.Page;
+import org.htmlparser.lexer.Stream;
 import org.htmlparser.tags.MetaTag;
 import org.htmlparser.util.EncodingChangeException;
 import org.htmlparser.util.ParserException;
@@ -71,22 +72,10 @@
         // tells him to use a different charset
         String charsetName = (charset == null) ? Page.DEFAULT_CHARSET : charset.displayName();
 
-        // wrap the InputStream in a BufferedInputStream if it does not support mark and reset
-        if (!stream.markSupported()) {
-            stream = new BufferedInputStream(stream, BUFFER_SIZE);
-        }
-
-        // mark the stream with a sufficiently high read limit so that the Parser can do a reset after it
-        // encounteres a <meta http-equiv="content-type" content="..."> statement. Apparently the Parser
-        // does a reset in this case but does not do a mark beforehand. The chosen read limit should be
-        // greater than or equal to the buffer size of the InputStreamSource created later on as the
-        // latter will fill its entire buffer.
-        stream.mark(BUFFER_SIZE);
-
         // parse the document
         try {
             // setup some data structures
-            InputStreamSource source = new InputStreamSource(stream, charsetName, BUFFER_SIZE);
+            InputStreamSource source = new InputStreamSource(new Stream(stream), charsetName, BUFFER_SIZE);
             Page page = new Page(source);
             Lexer lexer = new Lexer(page);
             Parser parser = new Parser(lexer, FEEDBACK_LOGGER);

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |