Work at SourceForge, help us to make it a better place! We have an immediate need for a Support Technician in our San Francisco or Denver office.

Close

#141 HTMLScanner class fails in Turkish locale

1.9.15
closed-fixed
scanner (58)
5
2012-10-05
2012-07-15
Uwe Schindler
No

HTMLScanner uses String.toLowerCase() and String.toUpperCase() without specifying a Locale (Locale.ENGLISH or better Locale.ROOT), so in the Turkish default locale, the uppercasing and lowercasing of element names breaks. See also http://blog.thetaphi.de/2012/07/default-locales-default-charsets-and.html about the problem.

This can be tested by setting the default locale using Locale.setDefault(new Locale("tr", "TR")) and then trying to parse an HTML document. <title> gets reported to SAX as <TİTLE> when uppercasing element names (the default).

The correct way to fix this is described in my blog: with Java 1.6, pass Locale.ROOT to all calls of String.toUpperCase or String.toLowerCase (throughout the codebase of NekoHTML). In previous Java versions, a workaround is using Locale.ENGLISH.

Discussion

  • Uwe Schindler
    Uwe Schindler
    2012-07-15

    The same happens with charset names. If the charset is reported as "iso-8859-1" (in lowercase), the following check will never evaluate to true in the Turkish locale, because the lowercase "i" is uppercased to "İ":

    ianaEncoding.toUpperCase().startsWith("ISO-8859")

     
  • RBRi
    RBRi
    2012-08-29

    Index: src/org/cyberneko/html/HTMLScanner.java

    --- src/org/cyberneko/html/HTMLScanner.java (revision 306)
    +++ src/org/cyberneko/html/HTMLScanner.java (working copy)
    @@ -25,6 +25,7 @@
    import java.io.UnsupportedEncodingException;
    import java.net.URL;
    import java.util.BitSet;
    +import java.util.Locale;
    import java.util.Stack;

    import org.apache.xerces.util.EncodingMap;
    @@ -884,7 +885,7 @@
    }
    }
    if (encodings[1] == null) {
    - encodings[1] = EncodingMap.getIANA2JavaMapping(encodings[0].toUpperCase());
    + encodings[1] = EncodingMap.getIANA2JavaMapping(encodings[0].toUpperCase(Locale.ENGLISH));
    if (encodings[1] == null) {
    encodings[1] = encodings[0];
    if (fReportErrors) {
    @@ -896,7 +897,7 @@
    fJavaEncoding = encodings[1];
    /* PATCH: Asgeir Asgeirsson */
    fIso8859Encoding = fIANAEncoding == null
    - || fIANAEncoding.toUpperCase().startsWith("ISO-8859")
    + || fIANAEncoding.toUpperCase(Locale.ENGLISH).startsWith("ISO-8859")
    || fIANAEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
    encoding = fIANAEncoding;
    reader = new InputStreamReader(fByteStream, fJavaEncoding);
    @@ -1074,8 +1075,8 @@
    /** Modifies the given name based on the specified mode. */
    protected static final String modifyName(String name, short mode) {
    switch (mode) {
    - case NAMES_UPPERCASE: return name.toUpperCase();
    - case NAMES_LOWERCASE: return name.toLowerCase();
    + case NAMES_UPPERCASE: return name.toUpperCase(Locale.ENGLISH);
    + case NAMES_LOWERCASE: return name.toLowerCase(Locale.ENGLISH);
    }
    return name;
    } // modifyName(String,short):String
    @@ -2055,7 +2056,7 @@
    fElementCount++;
    fSingleBoolean[0] = false;
    final String ename = scanStartElement(fSingleBoolean);
    - final String enameLC = ename == null ? null : ename.toLowerCase();
    + final String enameLC = ename == null ? null : ename.toLowerCase(Locale.ENGLISH);
    fBeginLineNumber = fCurrentEntity.getLineNumber();
    fBeginColumnNumber = fCurrentEntity.getColumnNumber();
    fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
    @@ -2584,7 +2585,7 @@
    }
    else {
    fAttributes.getName(aindex,fQName);
    - fQName.rawname = fQName.rawname.toLowerCase();
    + fQName.rawname = fQName.rawname.toLowerCase(Locale.ENGLISH);
    fAttributes.setName(aindex,fQName);
    aindex++;
    }
    @@ -2663,7 +2664,7 @@
    String content = getValue(fAttributes, "content");
    if (content != null) {
    content = removeSpaces(content);
    - int index1 = content.toLowerCase().indexOf("charset=");
    + int index1 = content.toLowerCase(Locale.ENGLISH).indexOf("charset=");
    if (index1 != -1 && !fIgnoreSpecifiedCharset) {
    final int index2 = content.indexOf(';', index1);
    final String charset = index2 != -1 ? content.substring(index1+8, index2) : content.substring(index1+8);
    @@ -2733,7 +2734,7 @@
    boolean encodingChanged = false;
    try {
    String ianaEncoding = charset;
    - String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding.toUpperCase());
    + String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding.toUpperCase(Locale.ENGLISH));
    if (DEBUG_CHARSET) {
    System.out.println("+++ ianaEncoding: "+ianaEncoding);
    System.out.println("+++ javaEncoding: "+javaEncoding);
    @@ -2754,7 +2755,7 @@
    // change the charset
    else {
    fIso8859Encoding = ianaEncoding == null
    - || ianaEncoding.toUpperCase().startsWith("ISO-8859")
    + || ianaEncoding.toUpperCase(Locale.ENGLISH).startsWith("ISO-8859")
    || ianaEncoding.equalsIgnoreCase(fDefaultIANAEncoding);
    fJavaEncoding = javaEncoding;
    fCurrentEntity.setStream(new InputStreamReader(fByteStream, javaEncoding));
    @@ -3079,7 +3080,7 @@
    private boolean isEnded(String ename) {
    String content = new String(fCurrentEntity.buffer, fCurrentEntity.offset,
    fCurrentEntity.length - fCurrentEntity.offset);
    - return content.toLowerCase().indexOf("</" + ename.toLowerCase() + ">") != -1;
    + return content.toLowerCase(Locale.ENGLISH).indexOf("</" + ename.toLowerCase(Locale.ENGLISH) + ">") != -1;
    }

    } // class ContentScanner
    Index: src/org/cyberneko/html/HTMLTagBalancer.java
    ===================================================================
    --- src/org/cyberneko/html/HTMLTagBalancer.java (revision 306)
    +++ src/org/cyberneko/html/HTMLTagBalancer.java (working copy)
    @@ -18,6 +18,7 @@

    import java.util.ArrayList;
    import java.util.List;
    +import java.util.Locale;

    import org.apache.xerces.util.XMLAttributesImpl;
    import org.apache.xerces.xni.Augmentations;
    @@ -1045,7 +1046,7 @@
    fErrorReporter.reportWarning("HTML2007", new Object[]{ename,iname});
    }
    if (fDocumentHandler != null) {
    - // PATCH: Marc-Andr� Morissette
    + // PATCH: Marc-Andr� Morissette
    callEndElement(info.qname, i < depth - 1 ? synthesizedAugs() : augs);
    }
    }
    @@ -1222,8 +1223,8 @@
    /** Modifies the given name based on the specified mode. */
    protected static final String modifyName(String name, short mode) {
    switch (mode) {
    - case NAMES_UPPERCASE: return name.toUpperCase();
    - case NAMES_LOWERCASE: return name.toLowerCase();
    + case NAMES_UPPERCASE: return name.toUpperCase(Locale.ENGLISH);
    + case NAMES_LOWERCASE: return name.toLowerCase(Locale.ENGLISH);
    }
    return name;
    } // modifyName(String,short):String
    Index: src/org/cyberneko/html/filters/ElementRemover.java
    ===================================================================
    --- src/org/cyberneko/html/filters/ElementRemover.java (revision 306)
    +++ src/org/cyberneko/html/filters/ElementRemover.java (working copy)
    @@ -17,6 +17,7 @@
    package org.cyberneko.html.filters;

    import java.util.Hashtable;
    +import java.util.Locale;

    import org.apache.xerces.xni.Augmentations;
    import org.apache.xerces.xni.NamespaceContext;
    @@ -136,12 +137,12 @@
    * see #removeElement
    */
    public void acceptElement(String element, String[] attributes) {
    - Object key = element.toLowerCase();
    + Object key = element.toLowerCase(Locale.ENGLISH);
    Object value = NULL;
    if (attributes != null) {
    String[] newarray = new String[attributes.length];
    for (int i = 0; i < attributes.length; i++) {
    - newarray[i] = attributes[i].toLowerCase();
    + newarray[i] = attributes[i].toLowerCase(Locale.ENGLISH);
    }
    value = attributes;
    }
    @@ -157,7 +158,7 @@
    * @param element The element to completely remove.
    */
    public void removeElement(String element) {
    - Object key = element.toLowerCase();
    + Object key = element.toLowerCase(Locale.ENGLISH);
    Object value = NULL;
    fRemovedElements.put(key, value);
    } // removeElement(String)
    @@ -306,26 +307,26 @@

    /** Returns true if the specified element is accepted. */
    protected boolean elementAccepted(String element) {
    - Object key = element.toLowerCase();
    + Object key = element.toLowerCase(Locale.ENGLISH);
    return fAcceptedElements.containsKey(key);
    } // elementAccepted(String):boolean

    /** Returns true if the specified element should be removed. */
    protected boolean elementRemoved(String element) {
    - Object key = element.toLowerCase();
    + Object key = element.toLowerCase(Locale.ENGLISH);
    return fRemovedElements.containsKey(key);
    } // elementRemoved(String):boolean

    /** Handles an open tag. */
    protected boolean handleOpenTag(QName element, XMLAttributes attributes) {
    if (elementAccepted(element.rawname)) {
    - Object key = element.rawname.toLowerCase();
    + Object key = element.rawname.toLowerCase(Locale.ENGLISH);
    Object value = fAcceptedElements.get(key);
    if (value != NULL) {
    String[] anames = (String[])value;
    int attributeCount = attributes.getLength();
    LOOP: for (int i = 0; i < attributeCount; i++) {
    - String aname = attributes.getQName(i).toLowerCase();
    + String aname = attributes.getQName(i).toLowerCase(Locale.ENGLISH);
    for (int j = 0; j < anames.length; j++) {
    if (anames[j].equals(aname)) {
    continue LOOP;
    Index: src/org/cyberneko/html/filters/Purifier.java
    ===================================================================
    --- src/org/cyberneko/html/filters/Purifier.java (revision 306)
    +++ src/org/cyberneko/html/filters/Purifier.java (working copy)
    @@ -16,6 +16,8 @@

    package org.cyberneko.html.filters;

    +import java.util.Locale;
    +
    import org.apache.xerces.util.XMLChar;
    import org.apache.xerces.util.XMLStringBuffer;
    import org.apache.xerces.xni.Augmentations;
    @@ -204,7 +206,7 @@
    standalone = null;
    }
    else {
    - standalone = standalone.toLowerCase();
    + standalone = standalone.toLowerCase(Locale.ENGLISH);
    }
    }
    super.xmlDecl(version,encoding,standalone,augs);
    @@ -464,7 +466,7 @@
    for (int i = 0; i < len; i++) {
    str.insert(0, '0');
    }
    - return str.toString().toUpperCase();
    + return str.toString().toUpperCase(Locale.ENGLISH);
    } // toHexString(int,int):String

    } // class Purifier
    Index: src/org/cyberneko/html/filters/Writer.java
    ===================================================================
    --- src/org/cyberneko/html/filters/Writer.java (revision 306)
    +++ src/org/cyberneko/html/filters/Writer.java (working copy)
    @@ -20,6 +20,7 @@
    import java.io.OutputStreamWriter;
    import java.io.PrintWriter;
    import java.io.UnsupportedEncodingException;
    +import java.util.Locale;

    import org.apache.xerces.xni.Augmentations;
    import org.apache.xerces.xni.NamespaceContext;
    @@ -333,11 +334,11 @@
    // modify META[@http-equiv='content-type']/@content value
    int contentIndex = -1;
    String originalContent = null;
    - if (element.rawname.toLowerCase().equals("meta")) {
    + if (element.rawname.toLowerCase(Locale.ENGLISH).equals("meta")) {
    String httpEquiv = null;
    int length = attributes.getLength();
    for (int i = 0; i < length; i++) {
    - String aname = attributes.getQName(i).toLowerCase();
    + String aname = attributes.getQName(i).toLowerCase(Locale.ENGLISH);
    if (aname.equals("http-equiv")) {
    httpEquiv = attributes.getValue(i);
    }
    @@ -345,12 +346,12 @@
    contentIndex = i;
    }
    }
    - if (httpEquiv != null && httpEquiv.toLowerCase().equals("content-type")) {
    + if (httpEquiv != null && httpEquiv.toLowerCase(Locale.ENGLISH).equals("content-type")) {
    fSeenHttpEquiv = true;
    String content = null;
    if (contentIndex != -1) {
    originalContent = attributes.getValue(contentIndex);
    - content = originalContent.toLowerCase();
    + content = originalContent.toLowerCase(Locale.ENGLISH);
    }
    if (content != null) {
    int charsetIndex = content.indexOf("charset=");

     
  • RBRi
    RBRi
    2012-08-29

    Index: src/org/cyberneko/html/filters/NamespaceBinder.java

    --- src/org/cyberneko/html/filters/NamespaceBinder.java (revision 306)
    +++ src/org/cyberneko/html/filters/NamespaceBinder.java (working copy)
    @@ -17,6 +17,7 @@
    package org.cyberneko.html.filters;

    import java.util.Enumeration;
    +import java.util.Locale;
    import java.util.Vector;

    import org.apache.xerces.xni.Augmentations;
    @@ -372,8 +373,8 @@
    /** Modifies the given name based on the specified mode. */
    protected static final String modifyName(String name, short mode) {
    switch (mode) {
    - case NAMES_UPPERCASE: return name.toUpperCase();
    - case NAMES_LOWERCASE: return name.toLowerCase();
    + case NAMES_UPPERCASE: return name.toUpperCase(Locale.ENGLISH);
    + case NAMES_LOWERCASE: return name.toLowerCase(Locale.ENGLISH);
    }
    return name;
    } // modifyName(String,short):String
    @@ -393,7 +394,7 @@
    for (int i = attrCount - 1; i >= 0; i--) {
    attrs.getName(i, fQName);
    String aname = fQName.rawname;
    - String ANAME = aname.toUpperCase();
    + String ANAME = aname.toUpperCase(Locale.ENGLISH);
    if (ANAME.startsWith("XMLNS:") || ANAME.equals("XMLNS")) {
    int anamelen = aname.length();

     
  • Marc Guillemot
    Marc Guillemot
    2012-09-28

    Now fixed. Thanks for reporting (and thanks for the patch Ronald).

     
  • Marc Guillemot
    Marc Guillemot
    2012-09-28

    • assigned_to: nobody --> mguillem
    • status: open --> closed-fixed
     
  • Uwe Schindler
    Uwe Schindler
    2012-09-28

    • status: closed-fixed --> open-fixed
     
  • Marc Guillemot
    Marc Guillemot
    2012-10-01

    Hi Uwe,

    the commit activity is not indicating a release now ;-) Next release will probably come shortly before next release of HtmlUnit, perhaps at the end of the month.

    Interesting to know that NekoHTML is used in Lucene's benchmark.

     
  • Marc Guillemot
    Marc Guillemot
    2012-10-05

    • status: open-fixed --> closed-fixed