From: <fwi...@us...> - 2008-07-23 01:58:04
|
Revision: 4988 http://jython.svn.sourceforge.net/jython/?rev=4988&view=rev Author: fwierzbicki Date: 2008-07-23 01:58:01 +0000 (Wed, 23 Jul 2008) Log Message: ----------- Made Str offsets better match CPython. PythonTree now extends BaseTree (too much of CommonTree was getting changed anyway). astdump.py repaired so that it is a useful testing ground for ast comparisons. Modified Paths: -------------- branches/asm/ast/astdump.py branches/asm/grammar/Python.g branches/asm/src/org/python/antlr/PythonTree.java Modified: branches/asm/ast/astdump.py =================================================================== --- branches/asm/ast/astdump.py 2008-07-23 01:21:32 UTC (rev 4987) +++ branches/asm/ast/astdump.py 2008-07-23 01:58:01 UTC (rev 4988) @@ -29,9 +29,11 @@ pyfiles = [code_path] for pyfile in pyfiles: - print "%s to %s" % (pyfile, output_dir) import pprint - fh = open(makepath(os.path.join(output_dir, pyfile)), 'w') + path = pyfile.split(os.path.sep) + print "%s to %s: %s" % (pyfile, output_dir, os.path.join(output_dir, *path)) + fh = open(makepath(os.path.join(output_dir, *path)), 'w') + print fh pprint.pprint(astview.tree(pyfile), fh) if __name__ == '__main__': Modified: branches/asm/grammar/Python.g =================================================================== --- branches/asm/grammar/Python.g 2008-07-23 01:21:32 UTC (rev 4987) +++ branches/asm/grammar/Python.g 2008-07-23 01:58:01 UTC (rev 4988) @@ -393,8 +393,7 @@ } Token extractStringToken(List s) { - //XXX: really we want the *last* one. - return (Token)s.get(0); + return (Token)s.get(s.size() - 1); } @@ -956,7 +955,7 @@ | LONGINT -> ^(NumTok<Num>[$LONGINT, makeInt($LONGINT)]) | FLOAT -> ^(NumTok<Num>[$FLOAT, makeFloat($FLOAT)]) | COMPLEX -> ^(NumTok<Num>[$COMPLEX, makeComplex($COMPLEX)]) - | (S+=STRING)+ {debug("S+: " + $S);} + | (S+=STRING)+ -> ^(StrTok<Str>[extractStringToken($S), extractStrings($S)]) ; @@ -1299,7 +1298,12 @@ | '"""' (options {greedy=false;}:TRIQUOTE)* '"""' | '"' (ESC|~('\\'|'\n'|'"'))* '"' | '\'' (ESC|~('\\'|'\n'|'\''))* '\'' - ) + ) { + if (state.tokenStartLine != input.getLine()) { + state.tokenStartLine = input.getLine(); + state.tokenStartCharPositionInLine = -2; + } + } ; /** the two '"'? cause a warning -- is there a way to avoid that? */ Modified: branches/asm/src/org/python/antlr/PythonTree.java =================================================================== --- branches/asm/src/org/python/antlr/PythonTree.java 2008-07-23 01:21:32 UTC (rev 4987) +++ branches/asm/src/org/python/antlr/PythonTree.java 2008-07-23 01:58:01 UTC (rev 4988) @@ -1,7 +1,7 @@ package org.python.antlr; import org.antlr.runtime.tree.BaseTree; -import org.antlr.runtime.tree.CommonTree; +import org.antlr.runtime.tree.Tree; import org.antlr.runtime.CommonToken; import org.antlr.runtime.Token; @@ -10,14 +10,27 @@ import org.python.antlr.ast.VisitorIF; -public class PythonTree extends CommonTree implements AST { +public class PythonTree extends BaseTree implements AST { public boolean from_future_checked = false; private int charStartIndex = -1; private int charStopIndex = -1; + /** A single token is the payload */ + public Token token; + + /** What token indexes bracket all tokens associated with this node + * and below? + */ + protected int startIndex=-1, stopIndex=-1; + + /** Who is the parent node of this node; if null, implies node is root */ + public PythonTree parent; + + /** What index is this node in the child list? Range: 0..n-1 */ + public int childIndex = -1; + public PythonTree(int ttype, Token t) { - super(); CommonToken c = new CommonToken(ttype, t.getText()); c.setLine(t.getLine()); c.setTokenIndex(t.getTokenIndex()); @@ -28,16 +41,109 @@ token = c; } - public PythonTree(Token token) { - super(token); + public PythonTree(Token t) { + this.token = t; } public PythonTree(PythonTree node) { - super(node); + super(node); + token = node.token; + startIndex = node.startIndex; + stopIndex = node.stopIndex; charStartIndex = node.getCharStartIndex(); charStopIndex = node.getCharStopIndex(); } + + public Token getToken() { + return token; + } + public Tree dupNode() { + return new PythonTree(this); + } + + public boolean isNil() { + return token==null; + } + + public int getType() { + if (token==null) { + return Token.INVALID_TOKEN_TYPE; + } + return token.getType(); + } + + public String getText() { + if (token==null) { + return null; + } + return token.getText(); + } + + public int getLine() { + if (token==null || token.getLine()==0) { + if ( getChildCount()>0 ) { + return getChild(0).getLine(); + } + return 0; + } + return token.getLine(); + } + + public int getCharPositionInLine() { + if (token==null || token.getCharPositionInLine()==-1) { + if (getChildCount()>0) { + return getChild(0).getCharPositionInLine(); + } + return 0; + } else if (token != null && token.getCharPositionInLine() == -2) { + //XXX: yucky fix because CPython's ast uses -1 as a real value + // for char pos in certain circumstances (for example, the + // char pos of multi-line strings. I would just use -1, + // but ANTLR is using -1 in special ways also. + return -1; + } + return token.getCharPositionInLine(); + } + + public int getTokenStartIndex() { + if ( startIndex==-1 && token!=null ) { + return token.getTokenIndex(); + } + return startIndex; + } + + public void setTokenStartIndex(int index) { + startIndex = index; + } + + public int getTokenStopIndex() { + if ( stopIndex==-1 && token!=null ) { + return token.getTokenIndex(); + } + return stopIndex; + } + + public void setTokenStopIndex(int index) { + stopIndex = index; + } + + public int getChildIndex() { + return childIndex; + } + + public Tree getParent() { + return parent; + } + + public void setParent(Tree t) { + this.parent = (PythonTree)t; + } + + public void setChildIndex(int index) { + this.childIndex = index; + } + public int getCharStartIndex() { if (charStartIndex == -1 && token != null) { if (token instanceof CommonToken) { @@ -54,13 +160,23 @@ charStartIndex = index; } + /* + * Adding one to stopIndex from Tokens. ANTLR defines the char position as + * being the array index of the actual characters. Most tools these days + * define document offsets as the positions between the characters. If you + * imagine drawing little boxes around each character and think of the + * numbers as pointing to either the left or right side of a character's + * box, then 0 is before the first character - and in a Document of 10 + * characters, position 10 is after the last character. + */ public int getCharStopIndex() { + if (charStopIndex == -1 && token != null) { if (token instanceof CommonToken) { - return ((CommonToken)token).getStopIndex(); + return ((CommonToken)token).getStopIndex() + 1; } if (token instanceof ImaginaryToken) { - return ((ImaginaryToken)token).getStopIndex(); + return ((ImaginaryToken)token).getStopIndex() + 1; } } return charStopIndex; @@ -74,6 +190,13 @@ if (isNil()) { return "None"; } + if ( getType()==Token.INVALID_TOKEN_TYPE ) { + return "<errornode>"; + } + if ( token==null ) { + return null; + } + return token.getText() + "(" + this.getLine() + "," + this.getCharPositionInLine() + ")"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |