From: <mgu...@us...> - 2008-08-29 12:47:19
|
Revision: 190 http://nekohtml.svn.sourceforge.net/nekohtml/?rev=190&view=rev Author: mguillem Date: 2008-08-29 12:47:26 +0000 (Fri, 29 Aug 2008) Log Message: ----------- added new feature "http://cyberneko.org/html/features/parse-noscript-content" (default to true) to turn off <noscript> content parsing Modified Paths: -------------- trunk/doc/changes.html trunk/doc/settings.html trunk/src/org/cyberneko/html/HTMLScanner.java Added Paths: ----------- trunk/data/canonical/test-noscript-parseit.html trunk/data/canonical/test-noscript.html trunk/data/test-noscript-parseit.html trunk/data/test-noscript.html trunk/data/test-noscript.html.settings Added: trunk/data/canonical/test-noscript-parseit.html =================================================================== --- trunk/data/canonical/test-noscript-parseit.html (rev 0) +++ trunk/data/canonical/test-noscript-parseit.html 2008-08-29 12:47:26 UTC (rev 190) @@ -0,0 +1,12 @@ +(HTML +(BODY +(NOSCRIPT +(DIV +"hello +(SPAN +"world +)SPAN +)DIV +)NOSCRIPT +)BODY +)HTML \ No newline at end of file Property changes on: trunk/data/canonical/test-noscript-parseit.html ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native Added: trunk/data/canonical/test-noscript.html =================================================================== --- trunk/data/canonical/test-noscript.html (rev 0) +++ trunk/data/canonical/test-noscript.html 2008-08-29 12:47:26 UTC (rev 190) @@ -0,0 +1,7 @@ +(HTML +(BODY +(NOSCRIPT +"<div>hello <span>world</span> +)NOSCRIPT +)BODY +)HTML \ No newline at end of file Property changes on: trunk/data/canonical/test-noscript.html ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native Added: trunk/data/test-noscript-parseit.html =================================================================== --- trunk/data/test-noscript-parseit.html (rev 0) +++ trunk/data/test-noscript-parseit.html 2008-08-29 12:47:26 UTC (rev 190) @@ -0,0 +1 @@ +<noscript><div>hello <span>world</span></noscript> \ No newline at end of file Property changes on: trunk/data/test-noscript-parseit.html ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native Added: trunk/data/test-noscript.html =================================================================== --- trunk/data/test-noscript.html (rev 0) +++ trunk/data/test-noscript.html 2008-08-29 12:47:26 UTC (rev 190) @@ -0,0 +1 @@ +<noscript><div>hello <span>world</span></noscript> \ No newline at end of file Property changes on: trunk/data/test-noscript.html ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native Added: trunk/data/test-noscript.html.settings =================================================================== --- trunk/data/test-noscript.html.settings (rev 0) +++ trunk/data/test-noscript.html.settings 2008-08-29 12:47:26 UTC (rev 190) @@ -0,0 +1 @@ +feature http://cyberneko.org/html/features/parse-noscript-content false Modified: trunk/doc/changes.html =================================================================== --- trunk/doc/changes.html 2008-08-27 11:26:45 UTC (rev 189) +++ trunk/doc/changes.html 2008-08-29 12:47:26 UTC (rev 190) @@ -27,7 +27,8 @@ <dt>Future version <dd>Fixed bugs #2059466 and #2051091 (accepting unknown tags within inline elements as well as as containers, don't accept any container in head), #2039483 (wrong augmentation when attribute value contains a newline, patch from Ian Roberts), - #2039915 (failed skip() does not back up columnNumber, patch from Ian Roberts) + #2039915 (failed skip() does not back up columnNumber, patch from Ian Roberts), + added new feature <code>http://cyberneko.org/html/features/parse-noscript-content</code> to turn off <noscript> content parsing <dt>Version 1.9.8 (22 Jul 2008) <dd>Fixed bugs #1949460 (handling of uppercase 'X' for entities in hexadecimal format), Modified: trunk/doc/settings.html =================================================================== --- trunk/doc/settings.html 2008-08-27 11:26:45 UTC (rev 189) +++ trunk/doc/settings.html 2008-08-29 12:47:26 UTC (rev 190) @@ -320,6 +320,16 @@ to know about errors in the parsed HTML document, this feature can be set to <code>true</code>. <td align='center'>false + <tr> + <td> + <a name='parse-noscript-content'></a> + <span class='id'>http://cyberneko.org/html/features/parse-noscript-content</span> + <br> + Specifies whether the content of a <noscript>...</noscript> node should be parsed or not. + When set to <code>false</code> the content will be considered as plain text whereas when set to <code>true</code>, + tags will be parsed normally.</td> + <td align='center'>true</td> + </tr> </table> <h2>Properties</h2> Modified: trunk/src/org/cyberneko/html/HTMLScanner.java =================================================================== --- trunk/src/org/cyberneko/html/HTMLScanner.java 2008-08-27 11:26:45 UTC (rev 189) +++ trunk/src/org/cyberneko/html/HTMLScanner.java 2008-08-29 12:47:26 UTC (rev 190) @@ -69,6 +69,7 @@ * <li>http://cyberneko.org/html/features/scanner/cdata-sections * <li>http://cyberneko.org/html/features/override-doctype * <li>http://cyberneko.org/html/features/insert-doctype + * <li>http://cyberneko.org/html/features/parse-noscript-content * </ul> * <p> * This component recognizes the following properties: @@ -197,6 +198,9 @@ /** Insert document type declaration. */ public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype"; + + /** Parse <noscript>...</noscript> content */ + public static final String PARSE_NOSCRIPT_CONTENT = "http://cyberneko.org/html/features/parse-noscript-content"; /** Normalize attribute values. */ protected static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs"; @@ -218,6 +222,7 @@ OVERRIDE_DOCTYPE, INSERT_DOCTYPE, NORMALIZE_ATTRIBUTES, + PARSE_NOSCRIPT_CONTENT, }; /** Recognized features defaults. */ @@ -237,6 +242,7 @@ Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, + Boolean.TRUE, }; // properties @@ -382,6 +388,9 @@ /** Normalize attribute values. */ protected boolean fNormalizeAttributes; + + /** Parse noscript content. */ + protected boolean fParseNoScriptContent; // properties @@ -723,6 +732,7 @@ fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE); fInsertDoctype = manager.getFeature(INSERT_DOCTYPE); fNormalizeAttributes = manager.getFeature(NORMALIZE_ATTRIBUTES); + fParseNoScriptContent = manager.getFeature(PARSE_NOSCRIPT_CONTENT); // get properties fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); @@ -771,6 +781,9 @@ else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { fIgnoreSpecifiedCharset = state; } + else if (featureId.equals(PARSE_NOSCRIPT_CONTENT)) { + fParseNoScriptContent = state; + } } // setFeature(String,boolean) @@ -1960,6 +1973,9 @@ if ("script".equalsIgnoreCase(ename)) { scanScriptContent(); } + else if (!fParseNoScriptContent && "noscript".equalsIgnoreCase(ename)) { + scanNoScriptContent(); + } else if (ename != null && !fSingleBoolean[0] && HTMLElements.getElement(ename).isSpecial() && (!ename.equalsIgnoreCase("TITLE") || isEnded(ename))) { @@ -2022,6 +2038,47 @@ return true; } // scan(boolean):boolean + /** + * Scans the content of <noscript>: it doesn't get parsed but is considered as plain text + * when feature {@link HTMLScanner#PARSE_NOSCRIPT_CONTENT} is set to false. + * @throws IOException + */ + private void scanNoScriptContent() throws IOException { + final XMLStringBuffer buffer = new XMLStringBuffer(); + + while (true) { + int c = read(); + if (c == -1) { + break; + } + if (c == '<') { + final String next = nextContent(10) + " "; + if (next.length() >= 10 && "/noscript".equalsIgnoreCase(next.substring(0, 9)) + && ('>' == next.charAt(9) || Character.isWhitespace(next.charAt(9)))) { + fCurrentEntity.offset--; + fCurrentEntity.columnNumber--; + break; + } + } + if (c == '\r' || c == '\n') { + fCurrentEntity.offset--; + fCurrentEntity.columnNumber--; + int newlines = skipNewlines(); + for (int i = 0; i < newlines; i++) { + buffer.append('\n'); + } + } + else { + buffer.append((char)c); + } + } + if (buffer.length > 0 && fDocumentHandler != null) { + fEndLineNumber = fCurrentEntity.lineNumber; + fEndColumnNumber = fCurrentEntity.columnNumber; + fDocumentHandler.characters(buffer, locationAugs()); + } + } + private void scanScriptContent() throws IOException { final XMLStringBuffer buffer = new XMLStringBuffer(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |