From: <fwi...@us...> - 2009-01-08 19:22:54
|
Revision: 5883 http://jython.svn.sourceforge.net/jython/?rev=5883&view=rev Author: fwierzbicki Date: 2009-01-08 19:22:49 +0000 (Thu, 08 Jan 2009) Log Message: ----------- Very initial attempt to support Bytes type in parser. Modified Paths: -------------- branches/jy3k/grammar/Python.g branches/jy3k/src/org/python/antlr/GrammarActions.java branches/jy3k/src/org/python/core/PyUnicode.java Modified: branches/jy3k/grammar/Python.g =================================================================== --- branches/jy3k/grammar/Python.g 2009-01-08 16:48:06 UTC (rev 5882) +++ branches/jy3k/grammar/Python.g 2009-01-08 19:22:49 UTC (rev 5883) @@ -94,6 +94,7 @@ import org.python.antlr.ast.BoolOp; import org.python.antlr.ast.boolopType; import org.python.antlr.ast.Break; +import org.python.antlr.ast.Bytes; import org.python.antlr.ast.Call; import org.python.antlr.ast.ClassDef; import org.python.antlr.ast.cmpopType; @@ -1243,6 +1244,8 @@ -> ^(COMPLEX<Num>[$COMPLEX, actions.makeComplex($COMPLEX)]) | (S+=STRING)+ -> ^(STRING<Str>[actions.extractStringToken($S), actions.extractStrings($S, encoding)]) + | (B+=BYTES)+ + -> ^(BYTES<Bytes>[actions.extractStringToken($B), actions.extractBytes($B, encoding)]) ; //listmaker: test ( list_for | (',' test)* [','] ) @@ -1720,6 +1723,24 @@ /** Match various string types. Note that greedy=false implies ''' * should make us exit loop not continue. */ +BYTES + : ('b'|'B') + ( '\'\'\'' (options {greedy=false;}:TRIAPOS)* '\'\'\'' + | '"""' (options {greedy=false;}:TRIQUOTE)* '"""' + | '"' (ESC|~('\\'|'\n'|'"'))* '"' + | '\'' (ESC|~('\\'|'\n'|'\''))* '\'' + ) { + if (state.tokenStartLine != input.getLine()) { + state.tokenStartLine = input.getLine(); + state.tokenStartCharPositionInLine = -2; + } + } + ; + + +/** Match various string types. Note that greedy=false implies ''' + * should make us exit loop not continue. + */ STRING : ('r'|'R')? ( '\'\'\'' (options {greedy=false;}:TRIAPOS)* '\'\'\'' @@ -1735,7 +1756,7 @@ ; STRINGPART - : {partial}?=> ('r'|'R')? + : {partial}?=> ('r'|'b'|'R'|'B')? ( '\'\'\'' ~('\'\'\'')* | '"""' ~('"""')* ) Modified: branches/jy3k/src/org/python/antlr/GrammarActions.java =================================================================== --- branches/jy3k/src/org/python/antlr/GrammarActions.java 2009-01-08 16:48:06 UTC (rev 5882) +++ branches/jy3k/src/org/python/antlr/GrammarActions.java 2009-01-08 19:22:49 UTC (rev 5883) @@ -17,6 +17,7 @@ import org.python.antlr.ast.expr_contextType; import org.python.antlr.ast.operatorType; import org.python.antlr.ast.unaryopType; +import org.python.antlr.ast.Bytes; import org.python.antlr.ast.Context; import org.python.antlr.ast.keyword; import org.python.antlr.ast.Attribute; @@ -397,53 +398,67 @@ return Py.newInteger((int) l); } - class StringPair { - private String s; - private boolean unicode; + //FIXME: this is just a guess ATM + PyString extractBytes(List s, String encoding) { + Token last = null; + StringBuffer sb = new StringBuffer(); + Iterator iter = s.iterator(); + while (iter.hasNext()) { + last = (Token)iter.next(); + String sp = extractBytesPart(last, encoding); + sb.append(sp); + } + return new PyString(sb.toString()); + } - StringPair(String s, boolean unicode) { - this.s = s; - this.unicode = unicode; + //FIXME: this is just a guess ATM + String extractBytesPart(Token t, String encoding) { + String string = t.getText(); + char quoteChar = string.charAt(0); + int start = 0; + int end; + quoteChar = string.charAt(start); + boolean raw = false; + if (quoteChar == 'b' || quoteChar == 'B') { + raw = true; + start++; } - String getString() { - return s; + int quotes = 3; + if (string.length() - start == 2) { + quotes = 1; } - - boolean isUnicode() { - return unicode; + if (string.charAt(start) != string.charAt(start+1)) { + quotes = 1; } + + start = quotes + start; + end = string.length() - quotes; + if (encoding != null) { + string = new PyUnicode(string.substring(start, end)).encode(encoding); + } else { + string = string.substring(start, end); + } + return string; } + PyString extractStrings(List s, String encoding) { - boolean ustring = false; Token last = null; StringBuffer sb = new StringBuffer(); Iterator iter = s.iterator(); while (iter.hasNext()) { last = (Token)iter.next(); - StringPair sp = extractString(last, encoding); - if (sp.isUnicode()) { - ustring = true; - } - sb.append(sp.getString()); + String sp = extractString(last, encoding); + sb.append(sp); } - if (ustring) { - return new PyUnicode(sb.toString()); - } - return new PyString(sb.toString()); + return new PyUnicode(sb.toString()); } - StringPair extractString(Token t, String encoding) { + String extractString(Token t, String encoding) { String string = t.getText(); char quoteChar = string.charAt(0); int start = 0; int end; - boolean ustring = false; - - if (quoteChar == 'u' || quoteChar == 'U') { - ustring = true; - start++; - } quoteChar = string.charAt(start); boolean raw = false; if (quoteChar == 'r' || quoteChar == 'R') { @@ -463,22 +478,22 @@ // string is properly decoded according to the source encoding // XXX: No need to re-encode when the encoding is iso-8859-1, but ParserFacade // needs to normalize the encoding name - if (!ustring && encoding != null) { + if (encoding != null) { // str with a specified encoding: first re-encode back out string = new PyUnicode(string.substring(start, end)).encode(encoding); if (!raw) { // Handle escapes in non-raw strs string = PyString.decode_UnicodeEscape(string, 0, string.length(), "strict", - ustring); + false); } } else if (raw) { // Raw str without an encoding or raw unicode: simply passthru string = string.substring(start, end); } else { // Plain unicode: already decoded, just handle escapes - string = PyString.decode_UnicodeEscape(string, start, end, "strict", ustring); + string = PyString.decode_UnicodeEscape(string, start, end, "strict", true); } - return new StringPair(string, ustring); + return string; } Token extractStringToken(List s) { Modified: branches/jy3k/src/org/python/core/PyUnicode.java =================================================================== --- branches/jy3k/src/org/python/core/PyUnicode.java 2009-01-08 16:48:06 UTC (rev 5882) +++ branches/jy3k/src/org/python/core/PyUnicode.java 2009-01-08 19:22:49 UTC (rev 5883) @@ -246,7 +246,7 @@ @ExposedMethod(doc = BuiltinDocs.unicode___repr___doc) final PyString unicode___repr__() { - return new PyString("u" + encode_UnicodeEscape(string, true)); + return new PyString(encode_UnicodeEscape(string, true)); } @ExposedMethod(doc = BuiltinDocs.unicode___getitem___doc) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |