[Jython-checkins] SF.net SVN: jython:[5883] branches/jy3k

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 5883
          http://jython.svn.sourceforge.net/jython/?rev=5883&view=rev
Author:   fwierzbicki
Date:     2009-01-08 19:22:49 +0000 (Thu, 08 Jan 2009)

Log Message:
-----------
Very early, first attempt at supporting the Bytes type in the parser.

Modified Paths:
--------------
    branches/jy3k/grammar/Python.g
    branches/jy3k/src/org/python/antlr/GrammarActions.java
    branches/jy3k/src/org/python/core/PyUnicode.java

Modified: branches/jy3k/grammar/Python.g
===================================================================

--- branches/jy3k/grammar/Python.g	2009-01-08 16:48:06 UTC (rev 5882)
+++ branches/jy3k/grammar/Python.g	2009-01-08 19:22:49 UTC (rev 5883)
@@ -94,6 +94,7 @@
 import org.python.antlr.ast.BoolOp;
 import org.python.antlr.ast.boolopType;
 import org.python.antlr.ast.Break;
+import org.python.antlr.ast.Bytes;
 import org.python.antlr.ast.Call;
 import org.python.antlr.ast.ClassDef;
 import org.python.antlr.ast.cmpopType;
@@ -1243,6 +1244,8 @@
     -> ^(COMPLEX<Num>[$COMPLEX, actions.makeComplex($COMPLEX)])
      | (S+=STRING)+ 
     -> ^(STRING<Str>[actions.extractStringToken($S), actions.extractStrings($S, encoding)])
+     | (B+=BYTES)+ 
+    -> ^(BYTES<Bytes>[actions.extractStringToken($B), actions.extractBytes($B, encoding)])
      ;
 
 //listmaker: test ( list_for | (',' test)* [','] )
@@ -1720,6 +1723,24 @@
 /** Match various string types.  Note that greedy=false implies '''
  *  should make us exit loop not continue.
  */
+BYTES  // Bytes literal token: a b/B prefix followed by one of four quoted bodies.
+    :   ('b'|'B')  // prefix is mandatory here, unlike the optional ('r'|'R')? on STRING
+        (   '\'\'\'' (options {greedy=false;}:TRIAPOS)* '\'\'\''  // '''...''' body
+        |   '"""' (options {greedy=false;}:TRIQUOTE)* '"""'  // """...""" body
+        |   '"' (ESC|~('\\'|'\n'|'"'))* '"'  // "..." body: escapes, or anything but backslash, newline, quote
+        |   '\'' (ESC|~('\\'|'\n'|'\''))* '\''  // '...' body, same exclusions with the other quote
+        ) {
+           if (state.tokenStartLine != input.getLine()) {  // input is now on a different line than the token start
+               state.tokenStartLine = input.getLine();  // re-anchor the token to the current line
+               state.tokenStartCharPositionInLine = -2;  // NOTE(review): -2 looks like a sentinel column; meaning not visible here - confirm
+           }
+        }
+    ;
+
+
+/** Match various string types.  Note that greedy=false implies '''
+ *  should make us exit loop not continue.
+ */
 STRING
     :   ('r'|'R')?
         (   '\'\'\'' (options {greedy=false;}:TRIAPOS)* '\'\'\''
@@ -1735,7 +1756,7 @@
     ;
 
 STRINGPART
-    : {partial}?=> ('r'|'R')?
+    : {partial}?=> ('r'|'b'|'R'|'B')?
         (   '\'\'\'' ~('\'\'\'')*
         |   '"""' ~('"""')*
         )

Modified: branches/jy3k/src/org/python/antlr/GrammarActions.java
===================================================================
--- branches/jy3k/src/org/python/antlr/GrammarActions.java	2009-01-08 16:48:06 UTC (rev 5882)
+++ branches/jy3k/src/org/python/antlr/GrammarActions.java	2009-01-08 19:22:49 UTC (rev 5883)
@@ -17,6 +17,7 @@
 import org.python.antlr.ast.expr_contextType;
 import org.python.antlr.ast.operatorType;
 import org.python.antlr.ast.unaryopType;
+import org.python.antlr.ast.Bytes;
 import org.python.antlr.ast.Context;
 import org.python.antlr.ast.keyword;
 import org.python.antlr.ast.Attribute;
@@ -397,53 +398,67 @@
         return Py.newInteger((int) l);
     }
 
-    class StringPair {
-        private String s;
-        private boolean unicode;
+    //FIXME: this is just a guess ATM. Joins the text extractBytesPart returns for every token in s into one PyString; encoding is passed straight through.
+    PyString extractBytes(List s, String encoding) {
+        Token last = null;
+        StringBuffer sb = new StringBuffer();
+        Iterator iter = s.iterator();
+        while (iter.hasNext()) {
+            last = (Token)iter.next();  // s is a raw List; every element is cast to Token
+            String sp = extractBytesPart(last, encoding);  // per-token text with prefix and quotes removed
+            sb.append(sp);
+        }
+        return new PyString(sb.toString());  // result is a PyString, not a PyUnicode
+    }
 
-        StringPair(String s, boolean unicode) {
-            this.s = s;
-            this.unicode = unicode;
+    //FIXME: this is just a guess ATM. Strips an optional b/B prefix and the surrounding quotes from one token's text; escape sequences are not decoded here.
+    String extractBytesPart(Token t, String encoding) {
+        String string = t.getText();
+        char quoteChar = string.charAt(0);
+        int start = 0;
+        int end;
+        quoteChar = string.charAt(start);  // same character as above, since start is still 0
+        boolean raw = false;
+        if (quoteChar == 'b' || quoteChar == 'B') {
+            raw = true;  // NOTE(review): raw is set for the b/B prefix but never read in this method
+            start++;
         }
-        String getString() {
-            return s;
+        int quotes = 3;  // assume a triple-quoted literal until shown otherwise
+        if (string.length() - start == 2) {  // only two characters after the prefix: an empty single-quoted literal
+            quotes = 1;
         }
-        
-        boolean isUnicode() {
-            return unicode;
+        if (string.charAt(start) != string.charAt(start+1)) {  // first two characters after the prefix differ: single-quoted
+            quotes = 1;
         }
+
+        start = quotes + start;  // index of the first character of the body
+        end = string.length() - quotes;  // index just past the last character of the body
+        if (encoding != null) {
+            string = new PyUnicode(string.substring(start, end)).encode(encoding);  // body re-encoded with the given encoding
+        } else {
+            string = string.substring(start, end);
+        }
+        return string;
     }
 
+
     PyString extractStrings(List s, String encoding) {
-        boolean ustring = false;
         Token last = null;
         StringBuffer sb = new StringBuffer();
         Iterator iter = s.iterator();
         while (iter.hasNext()) {
             last = (Token)iter.next();
-            StringPair sp = extractString(last, encoding);
-            if (sp.isUnicode()) {
-                ustring = true;
-            }
-            sb.append(sp.getString());
+            String sp = extractString(last, encoding);  // extractString now returns the text itself, not a StringPair
+            sb.append(sp);  // the text of every token in s is concatenated
         }
-        if (ustring) {
-            return new PyUnicode(sb.toString());
-        }
-        return new PyString(sb.toString());
+        return new PyUnicode(sb.toString());  // always PyUnicode now; the per-token unicode flag is gone
     }
 
-    StringPair extractString(Token t, String encoding) {
+    String extractString(Token t, String encoding) {
         String string = t.getText();
         char quoteChar = string.charAt(0);
         int start = 0;
         int end;
-        boolean ustring = false;
-
-        if (quoteChar == 'u' || quoteChar == 'U') {
-            ustring = true;
-            start++;
-        }
         quoteChar = string.charAt(start);
         boolean raw = false;
         if (quoteChar == 'r' || quoteChar == 'R') {
@@ -463,22 +478,22 @@
         // string is properly decoded according to the source encoding
         // XXX: No need to re-encode when the encoding is iso-8859-1, but ParserFacade
         // needs to normalize the encoding name
-        if (!ustring && encoding != null) {
+        if (encoding != null) {
             // str with a specified encoding: first re-encode back out
             string = new PyUnicode(string.substring(start, end)).encode(encoding);
             if (!raw) {
                 // Handle escapes in non-raw strs
                 string = PyString.decode_UnicodeEscape(string, 0, string.length(), "strict",
-                                                       ustring);
+                                                       false);
             }
         } else if (raw) {
             // Raw str without an encoding or raw unicode: simply passthru
             string = string.substring(start, end);
         } else {
             // Plain unicode: already decoded, just handle escapes
-            string = PyString.decode_UnicodeEscape(string, start, end, "strict", ustring);
+            string = PyString.decode_UnicodeEscape(string, start, end, "strict", true);
         }
-        return new StringPair(string, ustring);
+        return string;
     }
 
     Token extractStringToken(List s) {

Modified: branches/jy3k/src/org/python/core/PyUnicode.java
===================================================================
--- branches/jy3k/src/org/python/core/PyUnicode.java	2009-01-08 16:48:06 UTC (rev 5882)
+++ branches/jy3k/src/org/python/core/PyUnicode.java	2009-01-08 19:22:49 UTC (rev 5883)
@@ -246,7 +246,7 @@
 
     @ExposedMethod(doc = BuiltinDocs.unicode___repr___doc)
     final PyString unicode___repr__() {
-        return new PyString("u" + encode_UnicodeEscape(string, true));
+        return new PyString(encode_UnicodeEscape(string, true));  // escaped text only; the former "u" prefix is no longer prepended
     }
 
     @ExposedMethod(doc = BuiltinDocs.unicode___getitem___doc)


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.