[nekohtml-dev] SF.net SVN: nekohtml:[190] trunk

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 190
          http://nekohtml.svn.sourceforge.net/nekohtml/?rev=190&view=rev
Author:   mguillem
Date:     2008-08-29 12:47:26 +0000 (Fri, 29 Aug 2008)

Log Message:
-----------
Added new feature "http://cyberneko.org/html/features/parse-noscript-content" (defaults to true) that can be set to false to turn off <noscript> content parsing.

Modified Paths:
--------------
    trunk/doc/changes.html
    trunk/doc/settings.html
    trunk/src/org/cyberneko/html/HTMLScanner.java

Added Paths:
-----------
    trunk/data/canonical/test-noscript-parseit.html
    trunk/data/canonical/test-noscript.html
    trunk/data/test-noscript-parseit.html
    trunk/data/test-noscript.html
    trunk/data/test-noscript.html.settings

Added: trunk/data/canonical/test-noscript-parseit.html
===================================================================

--- trunk/data/canonical/test-noscript-parseit.html	                        (rev 0)
+++ trunk/data/canonical/test-noscript-parseit.html	2008-08-29 12:47:26 UTC (rev 190)
@@ -0,0 +1,12 @@
+(HTML
+(BODY
+(NOSCRIPT
+(DIV
+"hello 
+(SPAN
+"world
+)SPAN
+)DIV
+)NOSCRIPT
+)BODY
+)HTML
\ No newline at end of file


Property changes on: trunk/data/canonical/test-noscript-parseit.html
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision
Added: svn:eol-style
   + native

Added: trunk/data/canonical/test-noscript.html
===================================================================
--- trunk/data/canonical/test-noscript.html	                        (rev 0)
+++ trunk/data/canonical/test-noscript.html	2008-08-29 12:47:26 UTC (rev 190)
@@ -0,0 +1,7 @@
+(HTML
+(BODY
+(NOSCRIPT
+"<div>hello <span>world</span>
+)NOSCRIPT
+)BODY
+)HTML
\ No newline at end of file


Property changes on: trunk/data/canonical/test-noscript.html
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision
Added: svn:eol-style
   + native

Added: trunk/data/test-noscript-parseit.html
===================================================================
--- trunk/data/test-noscript-parseit.html	                        (rev 0)
+++ trunk/data/test-noscript-parseit.html	2008-08-29 12:47:26 UTC (rev 190)
@@ -0,0 +1 @@
+<noscript><div>hello <span>world</span></noscript>
\ No newline at end of file


Property changes on: trunk/data/test-noscript-parseit.html
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision
Added: svn:eol-style
   + native

Added: trunk/data/test-noscript.html
===================================================================
--- trunk/data/test-noscript.html	                        (rev 0)
+++ trunk/data/test-noscript.html	2008-08-29 12:47:26 UTC (rev 190)
@@ -0,0 +1 @@
+<noscript><div>hello <span>world</span></noscript>
\ No newline at end of file


Property changes on: trunk/data/test-noscript.html
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision
Added: svn:eol-style
   + native

Added: trunk/data/test-noscript.html.settings
===================================================================
--- trunk/data/test-noscript.html.settings	                        (rev 0)
+++ trunk/data/test-noscript.html.settings	2008-08-29 12:47:26 UTC (rev 190)
@@ -0,0 +1 @@
+feature http://cyberneko.org/html/features/parse-noscript-content false

Modified: trunk/doc/changes.html
===================================================================
--- trunk/doc/changes.html	2008-08-27 11:26:45 UTC (rev 189)
+++ trunk/doc/changes.html	2008-08-29 12:47:26 UTC (rev 190)
@@ -27,7 +27,8 @@
  <dt>Future version
  <dd>Fixed bugs #2059466 and #2051091 (accepting unknown tags within inline elements as well as as containers, don't accept any container in head),
  #2039483 (wrong augmentation when attribute value contains a newline, patch from Ian Roberts), 
- #2039915 (failed skip() does not back up columnNumber, patch from Ian Roberts)
+ #2039915 (failed skip() does not back up columnNumber, patch from Ian Roberts),
+ added new feature <code>http://cyberneko.org/html/features/parse-noscript-content</code> to turn off &lt;noscript&gt; content parsing
  
  <dt>Version 1.9.8 (22 Jul 2008)
  <dd>Fixed bugs #1949460 (handling of uppercase 'X' for entities in hexadecimal format),

Modified: trunk/doc/settings.html
===================================================================
--- trunk/doc/settings.html	2008-08-27 11:26:45 UTC (rev 189)
+++ trunk/doc/settings.html	2008-08-29 12:47:26 UTC (rev 190)
@@ -320,6 +320,16 @@
    to know about errors in the parsed HTML document, this feature
    can be set to <code>true</code>.
   <td align='center'>false
+  <tr>
+  <td>
+   <a name='parse-noscript-content'></a>
+   <span class='id'>http://cyberneko.org/html/features/parse-noscript-content</span>
+   <br>
+   Specifies whether the content of a &lt;noscript&gt;...&lt;/noscript&gt; node should be parsed or not.
+   When set to <code>false</code> the content will be considered as plain text whereas when set to <code>true</code>,
+   tags will be parsed normally.</td>
+  <td align='center'>true</td>
+  </tr>
 </table>
 
 <h2>Properties</h2>

Modified: trunk/src/org/cyberneko/html/HTMLScanner.java
===================================================================
--- trunk/src/org/cyberneko/html/HTMLScanner.java	2008-08-27 11:26:45 UTC (rev 189)
+++ trunk/src/org/cyberneko/html/HTMLScanner.java	2008-08-29 12:47:26 UTC (rev 190)
@@ -69,6 +69,7 @@
  * <li>http://cyberneko.org/html/features/scanner/cdata-sections
  * <li>http://cyberneko.org/html/features/override-doctype
  * <li>http://cyberneko.org/html/features/insert-doctype
+ * <li>http://cyberneko.org/html/features/parse-noscript-content
  * </ul>
  * <p>
  * This component recognizes the following properties:
@@ -197,6 +198,9 @@
 
     /** Insert document type declaration. */
     public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";
+    
+    /** Parse &lt;noscript&gt;...&lt;/noscript&gt; content */
+    public static final String PARSE_NOSCRIPT_CONTENT = "http://cyberneko.org/html/features/parse-noscript-content";
 
     /** Normalize attribute values. */
     protected static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs";
@@ -218,6 +222,7 @@
         OVERRIDE_DOCTYPE,
         INSERT_DOCTYPE,
         NORMALIZE_ATTRIBUTES,
+        PARSE_NOSCRIPT_CONTENT,
     };
 
     /** Recognized features defaults. */
@@ -237,6 +242,7 @@
         Boolean.FALSE,
         Boolean.FALSE,
         Boolean.FALSE,
+        Boolean.TRUE,
     };
 
     // properties
@@ -382,6 +388,9 @@
 
     /** Normalize attribute values. */
     protected boolean fNormalizeAttributes;
+    
+    /** Parse noscript content. */
+    protected boolean fParseNoScriptContent;
 
     // properties
 
@@ -723,6 +732,7 @@
         fOverrideDoctype = manager.getFeature(OVERRIDE_DOCTYPE);
         fInsertDoctype = manager.getFeature(INSERT_DOCTYPE);
         fNormalizeAttributes = manager.getFeature(NORMALIZE_ATTRIBUTES);
+        fParseNoScriptContent = manager.getFeature(PARSE_NOSCRIPT_CONTENT);
 
         // get properties
         fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
@@ -771,6 +781,9 @@
         else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { 
             fIgnoreSpecifiedCharset = state; 
         }
+        else if (featureId.equals(PARSE_NOSCRIPT_CONTENT)) { 
+            fParseNoScriptContent = state; 
+        }
 
     } // setFeature(String,boolean)
 
@@ -1960,6 +1973,9 @@
                                 if ("script".equalsIgnoreCase(ename)) {
                                 	scanScriptContent();
                                 }
+                                else if (!fParseNoScriptContent && "noscript".equalsIgnoreCase(ename)) {
+                                	scanNoScriptContent();
+                                }
                                 else if (ename != null && !fSingleBoolean[0] 
                                     && HTMLElements.getElement(ename).isSpecial() 
                                     && (!ename.equalsIgnoreCase("TITLE") || isEnded(ename))) {
@@ -2022,6 +2038,47 @@
             return true;
         } // scan(boolean):boolean
 
+        /**
+         * Scans the content of &lt;noscript&gt; as plain text, without parsing any markup it contains.
+         * Used when feature {@link HTMLScanner#PARSE_NOSCRIPT_CONTENT} is set to false.
+         * @throws IOException
+         */
+        private void scanNoScriptContent() throws IOException {
+        	final XMLStringBuffer buffer = new XMLStringBuffer();
+        	
+            while (true) {
+                int c = read();
+                if (c == -1) {
+                    break;
+                }
+                if (c == '<') {
+                	final String next = nextContent(10) + " ";
+                	if (next.length() >= 10 && "/noscript".equalsIgnoreCase(next.substring(0, 9))
+            			&& ('>' == next.charAt(9) || Character.isWhitespace(next.charAt(9)))) {
+	                    fCurrentEntity.offset--;
+	                    fCurrentEntity.columnNumber--;
+	                    break;
+                	}
+            	}
+            	if (c == '\r' || c == '\n') {
+                    fCurrentEntity.offset--;
+                    fCurrentEntity.columnNumber--;
+                    int newlines = skipNewlines();
+                    for (int i = 0; i < newlines; i++) {
+                        buffer.append('\n');
+                    }
+                }
+                else {
+                    buffer.append((char)c);
+                }
+            }
+            if (buffer.length > 0 && fDocumentHandler != null) {
+                fEndLineNumber = fCurrentEntity.lineNumber;
+                fEndColumnNumber = fCurrentEntity.columnNumber;
+                fDocumentHandler.characters(buffer, locationAugs());
+            }
+        }
+        
         private void scanScriptContent() throws IOException {
 
         	final XMLStringBuffer buffer = new XMLStringBuffer();


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.