From: <le...@us...> - 2008-11-05 02:39:32
|
Revision: 5542 http://jython.svn.sourceforge.net/jython/?rev=5542&view=rev Author: leosoto Date: 2008-11-05 02:39:25 +0000 (Wed, 05 Nov 2008) Log Message: ----------- Fixing mangling of multibyte characters before parsing PyUnicode input (as described on <http://www.nabble.com/Parsing-and-non-ASCII-Input-td19007415.html>). This doesn't completely solve the parsing problems with unicode input because org.python.antlr.GrammarActions#extractStrings is not aware of the 'current' input encoding yet. But it is a step forward. Refs: #1062 Modified Paths: -------------- trunk/jython/src/org/python/core/ParserFacade.java trunk/jython/src/org/python/core/Py.java trunk/jython/src/org/python/core/__builtin__.java Modified: trunk/jython/src/org/python/core/ParserFacade.java =================================================================== --- trunk/jython/src/org/python/core/ParserFacade.java 2008-10-31 22:06:00 UTC (rev 5541) +++ trunk/jython/src/org/python/core/ParserFacade.java 2008-11-05 02:39:25 UTC (rev 5542) @@ -7,7 +7,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.LineNumberReader; import java.io.Reader; +import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -32,14 +34,13 @@ import org.python.core.io.StreamIO; import org.python.core.io.TextIOInputStream; import org.python.core.io.UniversalIOWrapper; -import org.python.core.util.StringUtil; /** * Facade for the classes in the org.python.antlr package. */ public class ParserFacade { - + private static int MARK_LIMIT = 100000; private ParserFacade() {} @@ -70,7 +71,7 @@ reader = null; } } - + if (t instanceof ParseException) { ParseException e = (ParseException)t; PythonTree node = (PythonTree)e.node; @@ -90,76 +91,87 @@ else return Py.JavaError(t); } + /** + * Internal parser entry point. + * + * Users of this method should call fixParseError on any Throwable thrown + * from it, to translate ParserExceptions into PySyntaxErrors or + * PyIndentationErrors. + */ + private static modType parse(BufferedReader reader, + String kind, + String filename, + CompilerFlags cflags) throws Throwable { + reader.mark(MARK_LIMIT); // We need the ability to move back on the + // reader, for the benefit of fixParseError and + // validPartialSentence + if (kind.equals("eval")) { + CharStream cs = new NoCloseReaderStream(reader); + ExpressionParser e = new ExpressionParser(cs, filename); + return e.parse(); + } else if (kind.equals("single")) { + InteractiveParser i = new InteractiveParser(reader, filename); + return i.parse(); + } else if (kind.equals("exec")) { + CharStream cs = new NoCloseReaderStream(reader); + ModuleParser g = new ModuleParser(cs, filename); + return g.file_input(); + } else { + throw Py.ValueError("parse kind must be eval, exec, or single"); + } + } + public static modType parse(InputStream stream, String kind, String filename, CompilerFlags cflags) { - //FIXME: npe? - BufferedReader bufreader = null; - modType node = null; + BufferedReader bufReader = null; try { - if (kind.equals("eval")) { - bufreader = prepBufreader(stream, cflags, filename); - CharStream cs = new NoCloseReaderStream(bufreader); - ExpressionParser e = new ExpressionParser(cs, filename); - node = e.parse(); - } else if (kind.equals("single")) { - bufreader = prepBufreader(stream, cflags, filename); - InteractiveParser i = new InteractiveParser(bufreader, filename); - node = i.parse(); - } else if (kind.equals("exec")) { - bufreader = prepBufreader(stream, cflags, filename); - CharStream cs = new NoCloseReaderStream(bufreader); - ModuleParser g = new ModuleParser(cs, filename); - node = g.file_input(); - } else { - throw Py.ValueError("parse kind must be eval, exec, or single"); - } + // prepBufReader takes care of encoding detection and universal + // newlines: + bufReader = prepBufreader(stream, cflags, filename); + return parse(bufReader, kind, filename, cflags ); } catch (Throwable t) { - throw fixParseError(bufreader, t, filename); + throw fixParseError(bufReader, t, filename); } finally { - try { - if (bufreader != null) { - bufreader.close(); - } - } catch (IOException i) { - //XXX - } + close(bufReader); } - return node; } + public static modType parse(String string, + String kind, + String filename, + CompilerFlags cflags) { + BufferedReader bufReader = null; + try { + bufReader = prepBufReader(string); + return parse(bufReader, kind, filename, cflags); + } catch (Throwable t) { + throw fixParseError(bufReader, t, filename); + } finally { + close(bufReader); + } + } + public static modType partialParse(String string, String kind, String filename, CompilerFlags cflags, boolean stdprompt) { - ByteArrayInputStream istream = new ByteArrayInputStream( - StringUtil.toBytes(string)); - //FIXME: npe? - BufferedReader bufreader = null; - modType node = null; + // XXX: What's the idea of the stdprompt argument? + BufferedReader reader = null; try { - if (kind.equals("single")) { - bufreader = prepBufreader(istream, cflags, filename); - InteractiveParser i = new InteractiveParser(bufreader, filename); - node = i.parse(); - } else if (kind.equals("eval")) { - bufreader = prepBufreader(istream, cflags, filename); - CharStream cs = new NoCloseReaderStream(bufreader); - ExpressionParser e = new ExpressionParser(cs, filename); - node = e.parse(); - } else { - throw Py.ValueError("parse kind must be eval, exec, or single"); - } + reader = prepBufReader(string); + return parse(reader, kind, filename, cflags); } catch (Throwable t) { - PyException p = fixParseError(bufreader, t, filename); - if (validPartialSentence(bufreader, kind, filename)) { + PyException p = fixParseError(reader, t, filename); + if (validPartialSentence(reader, kind, filename)) { return null; } throw p; + } finally { + close(reader); } - return node; } private static boolean validPartialSentence(BufferedReader bufreader, String kind, String filename) { @@ -182,6 +194,7 @@ } } catch (Exception e) { + System.out.println(e); return lexer.eofWhileNested; } return true; @@ -202,8 +215,6 @@ } else if (cflags != null && cflags.encoding != null) { encoding = cflags.encoding; } - } else if (cflags.source_is_utf8) { - throw new ParseException("encoding declaration in Unicode string"); } // Enable universal newlines mode on the input @@ -229,13 +240,29 @@ throw Py.SystemError("Java couldn't find the ISO-8859-1 encoding"); } } - + BufferedReader bufreader = new BufferedReader(reader); - - bufreader.mark(MARK_LIMIT); return bufreader; } + private static BufferedReader prepBufReader(String string) throws IOException { + BufferedReader bufReader; + + // LineNumberReader takes care of universal newlines + bufReader = new LineNumberReader(new StringReader(string)); + + // If the input is a decoded string (implied from the String argument + // for prepBufReader), it can't have an encoding declaration. + bufReader.mark(MARK_LIMIT); + if (findEncoding(bufReader) != null) { + throw new ParseException("encoding declaration in Unicode string"); + } + bufReader.reset(); + + return bufReader; + } + + /** * Check for a BOM mark at the begginning of stream. If there is a BOM * mark, advance the stream passed it. If not, reset() to start at the @@ -262,12 +289,31 @@ } stream.reset(); return false; - } + } private static String readEncoding(InputStream stream) throws IOException { stream.mark(MARK_LIMIT); String encoding = null; BufferedReader br = new BufferedReader(new InputStreamReader(stream), 512); + encoding = findEncoding(br); + // XXX: reset() can still raise an IOException if a line exceeds our large mark + // limit + stream.reset(); + return encodingMap(encoding); + } + + /** + * Reads the first two lines of the reader, searching for an encoding + * declaration. + * + * Note that reseting the reader (if needed) is responsibility of the caller. + * + * @return The declared encoding, or null if no encoding declaration is + * found + */ + private static String findEncoding(BufferedReader br) + throws IOException { + String encoding = null; for (int i = 0; i < 2; i++) { String strLine = br.readLine(); if (strLine == null) { @@ -279,10 +325,7 @@ break; } } - // XXX: reset() can still raise an IOException if a line exceeds our large mark - // limit - stream.reset(); - return encodingMap(encoding); + return encoding; } private static String encodingMap(String encoding) { @@ -308,4 +351,14 @@ return null; } + private static void close(BufferedReader reader) { + try { + if (reader != null) { + reader.close(); + } + } catch (IOException i) { + // XXX: Log the error? + } + } + } Modified: trunk/jython/src/org/python/core/Py.java =================================================================== --- trunk/jython/src/org/python/core/Py.java 2008-10-31 22:06:00 UTC (rev 5541) +++ trunk/jython/src/org/python/core/Py.java 2008-11-05 02:39:25 UTC (rev 5542) @@ -16,6 +16,7 @@ import java.sql.Date; import java.sql.Time; import java.sql.Timestamp; +import java.util.Arrays; import java.util.Calendar; import java.util.HashSet; import java.util.Set; @@ -357,7 +358,7 @@ public static PyObject ImportWarning; public static void ImportWarning(String message) { - warning(ImportWarning, message); + warning(ImportWarning, message); } public static PyObject UnicodeWarning; @@ -1610,34 +1611,31 @@ } // w/o compiler-flags - public static PyObject compile(modType node, String filename) { - return compile(node, getName(), filename); - } - - public static PyObject compile(modType node, String name, - String filename) { - return compile(node, name, filename, true, false); - } - - public static PyObject compile(modType node, String name, - String filename, - boolean linenumbers, - boolean printResults) { - return compile_flags(node, name, filename, linenumbers, - printResults, null); - } - - public static PyObject compile(InputStream istream, String filename, - String kind) { + public static PyObject compile(InputStream istream, String filename, String kind) { return compile_flags(istream, filename, kind, null); } - // with compiler-flags - public static PyObject compile_flags(modType node, String name, - String filename, - boolean linenumbers, - boolean printResults, CompilerFlags cflags) { + /** + * Entry point for compiling modules. + * + * @param node Module node, coming from the parsing process + * @param name Internal name for the compiled code. Typically generated by + * calling {@link #getName()}. + * @param filename Source file name + * @param linenumbers True to track source line numbers on the generated + * code + * @param printResults True to call the sys.displayhook on the result of + * the code + * @param cflags Compiler flags + * @return Code object for the compiled module + */ + public static PyObject compile_flags(modType node, String name, String filename, + boolean linenumbers, boolean printResults, + CompilerFlags cflags) { try { + if (cflags != null && cflags.only_ast) { + return Py.java2py(node); + } ByteArrayOutputStream ostream = new ByteArrayOutputStream(); Module.compile(node, ostream, name, filename, linenumbers, printResults, false, cflags); @@ -1650,62 +1648,75 @@ } } + public static PyObject compile_flags(modType node, String filename, + String kind, CompilerFlags cflags) { + return Py.compile_flags(node, getName(), filename, true, + kind.equals("single"), cflags); + } + + /** + * Compiles python source code coming from a file or another external stream + */ public static PyObject compile_flags(InputStream istream, String filename, - String kind,CompilerFlags cflags) - { + String kind, CompilerFlags cflags) { modType node = ParserFacade.parse(istream, kind, filename, cflags); - if (cflags != null && cflags.only_ast) { - return Py.java2py(node); - } - - boolean printResults = false; - if (kind.equals("single")) { - printResults = true; - } - return Py.compile_flags(node, getName(), filename, true, printResults, cflags); + return Py.compile_flags(node, filename, kind, cflags); } - public static PyObject compile_flags(modType node, String filename, - String kind, CompilerFlags cflags) { - boolean printResults = false; - if (kind.equals("single")) { - printResults = true; + /** + * Compiles python source code coming from decoded Strings. + * + * DO NOT use this for PyString input. Use + * {@link #compile_flags(byte[], String, String, CompilerFlags)} instead. + */ + public static PyObject compile_flags(String data, String filename, + String kind, CompilerFlags cflags) { + if (data.contains("\0")) { + throw Py.TypeError("compile() expected string without null bytes"); } - return Py.compile_flags(node, getName(), filename, true, printResults, cflags); + if (cflags != null && cflags.dont_imply_dedent) { + data += "\n"; + } else { + data += "\n\n"; + } + modType node = ParserFacade.parse(data, kind, filename, cflags); + return Py.compile_flags(node, filename, kind, cflags); } - public static PyObject compile_flags(String data, - String filename, - String kind, - CompilerFlags cflags) { - - if (data.contains("\0")) { - throw Py.TypeError("compile() expected string without null bytes"); + /** + * Compiles python source code coming from bytestrings + */ + public static PyObject compile_flags(byte[] bytes, String filename, + String kind, CompilerFlags cflags) { + for(int i = 0; i < bytes.length; i++) { + if (bytes[i] == 0) { + throw Py.TypeError("compile() expected string without null bytes"); + } } - - byte[] bytes; + byte[] data; if (cflags != null && cflags.dont_imply_dedent) { - bytes = StringUtil.toBytes(data + "\n"); + data = new byte[bytes.length + 1]; + System.arraycopy(bytes, 0, data, 0, bytes.length); + data[data.length - 1] = '\n'; } else { - bytes = StringUtil.toBytes(data + "\n\n"); + data = new byte[bytes.length + 2]; + System.arraycopy(bytes, 0, data, 0, bytes.length); + data[data.length - 1] = data[data.length - 2] = '\n'; } - return Py.compile_flags(new ByteArrayInputStream(bytes), - filename, - kind, - cflags); + modType node = ParserFacade.parse(new ByteArrayInputStream(data), kind, + filename, cflags); + return Py.compile_flags(node, filename, kind, cflags); } - public static PyObject compile_command_flags(String string, - String filename, String kind, CompilerFlags cflags, boolean stdprompt) { + public static PyObject compile_command_flags(String string, String filename, + String kind, CompilerFlags cflags, boolean stdprompt) { modType node = ParserFacade.partialParse(string + "\n", kind, filename, - cflags, stdprompt); - + cflags, stdprompt); if (node == null) { return Py.None; } - return Py.compile_flags(node, Py.getName(), filename, true, true, - cflags); + return Py.compile_flags(node, Py.getName(), filename, true, true, cflags); } public static PyObject[] unpackSequence(PyObject obj, int length) { Modified: trunk/jython/src/org/python/core/__builtin__.java =================================================================== --- trunk/jython/src/org/python/core/__builtin__.java 2008-10-31 22:06:00 UTC (rev 5541) +++ trunk/jython/src/org/python/core/__builtin__.java 2008-11-05 02:39:25 UTC (rev 5542) @@ -1,6 +1,8 @@ // Copyright (c) Corporation for National Research Initiatives package org.python.core; +import java.io.ByteArrayInputStream; +import java.io.InputStream; import java.util.Iterator; import java.util.Map; @@ -281,7 +283,7 @@ if (args[0] instanceof PyUnicode) { flags += PyTableCode.PyCF_SOURCE_IS_UTF8; } - return __builtin__.compile(args[0].toString(), args[1].toString(), args[2].toString(), flags, dont_inherit); + return __builtin__.compile((PyString)args[0], args[1].toString(), args[2].toString(), flags, dont_inherit); case 29: return __builtin__.map(args); case 43: @@ -412,7 +414,7 @@ } public static PyObject apply(PyObject o) { - return o.__call__(); + return o.__call__(); } public static PyObject apply(PyObject o, PyObject args) { @@ -473,19 +475,29 @@ throw Py.TypeError("number coercion failed"); } - public static PyObject compile(String data, String filename, String kind) { - return Py.compile_flags(data, filename, kind, Py.getCompilerFlags()); + public static PyObject compile(PyString data, String filename, String kind) { + if (data instanceof PyUnicode) { + return Py.compile_flags(data.toString(), filename, kind, Py.getCompilerFlags()); + } else { + return Py.compile_flags(data.toBytes(), filename, kind, Py.getCompilerFlags()); + } } public static PyObject compile(modType node, String filename, String kind) { return Py.compile_flags(node, filename, kind, Py.getCompilerFlags()); } - public static PyObject compile(String data, String filename, String kind, int flags, boolean dont_inherit) { + public static PyObject compile(PyString data, String filename, String kind, int flags, boolean dont_inherit) { if ((flags & ~PyTableCode.CO_ALL_FEATURES) != 0) { throw Py.ValueError("compile(): unrecognised flags"); } - return Py.compile_flags(data, filename, kind, Py.getCompilerFlags(flags, dont_inherit)); + if (data instanceof PyUnicode) { + return Py.compile_flags(data.toString(), filename, kind, + Py.getCompilerFlags(flags, dont_inherit)); + } else { + return Py.compile_flags(data.toBytes(), filename, kind, + Py.getCompilerFlags(flags, dont_inherit)); + } } public static PyObject compile(modType node, String filename, String kind, int flags, boolean dont_inherit) { @@ -550,7 +562,7 @@ code = (PyCode) o; } else { if (o instanceof PyString) { - code = (PyCode)compile(o.toString(), "<string>", "eval"); + code = (PyCode)compile((PyString)o, "<string>", "eval"); } else { throw Py.TypeError("eval: argument 1 must be string or code object"); } @@ -1164,7 +1176,7 @@ } public static PyObject reload(PySystemState o) { - // reinitialize methods + // reinitialize methods o.reload(); return o; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |