From: <ad...@us...> - 2010-11-17 23:53:59
|
Revision: 1208
          http://jtidy.svn.sourceforge.net/jtidy/?rev=1208&view=rev
Author:   aditsu
Date:     2010-11-17 23:53:53 +0000 (Wed, 17 Nov 2010)

Log Message:
-----------
partial fix for test 445557 - modified encodingError to use decimal character values for invalid_char messages, and added special handling for characters between 127 and 160 in readChar

Modified Paths:
--------------
    branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/Report.java
    branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/StreamInJavaImpl.java

Modified: branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/Report.java
===================================================================
--- branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/Report.java	2010-11-17 23:46:35 UTC (rev 1207)
+++ branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/Report.java	2010-11-17 23:53:53 UTC (rev 1208)
@@ -480,21 +480,11 @@
                 break;
             case VENDOR_SPECIFIC_CHARS:
                 lexer.badChars |= BC_VENDOR_SPECIFIC_CHARS;
-                messageLexer(
-                    code.code(),
-                    lexer,
-                    Level.WARNING,
-                    "invalid_char",
-                    replaceMode, buf);
+                messageLexer(code.code(), lexer, Level.WARNING, "invalid_char", replaceMode, c);
                 break;
             case INVALID_SGML_CHARS:
                 lexer.badChars |= BC_INVALID_SGML_CHARS;
-                messageLexer(
-                    code.code(),
-                    lexer,
-                    Level.WARNING,
-                    "invalid_char",
-                    replaceMode, buf);
+                messageLexer(code.code(), lexer, Level.WARNING, "invalid_char", replaceMode, c);
                 break;
             case INVALID_UTF8:
                 lexer.badChars |= BC_INVALID_UTF8;

Modified: branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/StreamInJavaImpl.java
===================================================================
--- branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/StreamInJavaImpl.java	2010-11-17 23:46:35 UTC (rev 1207)
+++ branches/CodeUpdateAndJava5/src/main/java/org/w3c/tidy/StreamInJavaImpl.java	2010-11-17 23:53:53 UTC (rev 1208)
@@ -119,6 +119,8 @@
     private int tabsize;
 
     private int tabs;
+
+    private Lexer lexer;
 
     /**
      * Instantiates a new StreamInJavaImpl.
@@ -259,6 +261,40 @@
                 return c;
             }
         }
+
+        /* produced e.g. as a side-effect of smart quotes in Word */
+        /* but can't happen if using MACROMAN encoding */
+        if (127 < c && c < 160) {
+            int c1 = 0;
+            int replMode = Report.DISCARDED_CHAR;
+            final String enc = lexer.configuration.getInCharEncodingName();
+            final String repl = lexer.configuration.getReplacementCharEncoding();
+            boolean isVendorChar = ("WIN1252".equals(enc) || "MACROMAN".equals(enc));
+            boolean isWinChar = ("WIN1252".equals(enc) || "WIN1252".equals(repl));
+            boolean isMacChar = ("MACROMAN".equals(enc) || "MACROMAN".equals(repl));
+
+            /* set error position just before offending character */
+            lexer.lines = curline;
+            lexer.columns = curcol;
+
+            if (isWinChar) {
+                c1 = EncodingUtils.decodeWin1252(c);
+            }
+            else if (isMacChar) {
+                c1 = EncodingUtils.decodeMacRoman(c);
+            }
+            if (c1 != 0) {
+                replMode = Report.REPLACED_CHAR;
+            }
+
+            if (c1 == 0 && isVendorChar) {
+                lexer.report.encodingError(lexer, ErrorCode.VENDOR_SPECIFIC_CHARS, c, replMode);
+            }
+            else if (!isVendorChar) {
+                lexer.report.encodingError(lexer, ErrorCode.INVALID_SGML_CHARS, c, replMode);
+            }
+            c = c1;
+        }
 
         this.curcol++;
         return c;
@@ -333,9 +369,8 @@
     /**
      * @see org.w3c.tidy.StreamIn#setLexer(org.w3c.tidy.Lexer)
      */
-    public void setLexer(Lexer lexer)
-    {
-        // unused in the java implementation
+    public void setLexer(final Lexer lexer) {
+        this.lexer = lexer;
     }
 
 }
\ No newline at end of file

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |