From: <ti...@co...> - 2009-02-01 11:14:11
Author: tismer
Date: Sun Feb 1 12:15:34 2009
New Revision: 61490

Added:
   psyco/v2/dist/test/test_tokenize_mod.py   (contents, props changed)
   psyco/v2/dist/test/tokenize_mod.py   (contents, props changed)
Log:
THE CRASHER!

This test program exposes a deep problem in psyco: something is wrong
with compact objects, or with the promotion engine. The problem came up
with the generator implementation, which is based upon compact objects.
To prove that the generator implementation is not at fault and that the
error existed long before, we wrote tokenize_mod.py, an almost
equivalent version of the tokenize module, structured the way the
generators internally use compact objects. It crashes reliably on top
of the unmodified, original psyco.

This is an extremely hard-to-crack bug, and I'm going to circumvent it
for the first v2 release by rewriting the generator implementation.
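To make that pattern concrete, here is a minimal sketch (with
illustrative names, not code from the psyco tree) of how a generator
maps onto the compact-object style used by GenToken below: the class
runs eagerly and collects each would-be "yield" into a result list.

    import psyco

    def squares(n):                    # generator style
        for i in range(n):
            yield i, i * i

    class Squares(psyco.compact):      # compact-object style, as GenToken
        def __init__(self, n):
            self.n = n
            self.result = []
        def push(self, *args):         # stands in for "yield"
            self.result.append(args)
        def generate(self):            # runs to completion eagerly
            for i in range(self.n):
                self.push(i, i * i)
        def __iter__(self):
            del self.result[:]         # reset, as GenToken.__iter__ does
            self.generate()
            return iter(self.result)

    # list(squares(3)) == list(Squares(3)) == [(0, 0), (1, 1), (2, 4)]

The eager variant gives up the laziness of a real generator, but it
exercises compact objects the same way the generator implementation
does internally, which is what makes it a faithful reproducer.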
Added: psyco/v2/dist/test/test_tokenize_mod.py
==============================================================================
--- (empty file)
+++ psyco/v2/dist/test/test_tokenize_mod.py	Sun Feb 1 12:15:34 2009
@@ -0,0 +1,20 @@
+# this file is reduced to the bare minimum to reproduce the crash
+import os, glob, random
+from cStringIO import StringIO
+from test.test_support import findfile
+
+from tokenize_mod import GenToken, tokenize_obj
+
+def test_main():
+    # This displays the tokenization of tokenize_tests.py to stdout, and
+    # regrtest.py checks that this equals the expected output (in the
+    # test/output/ directory).
+    f = open(findfile('tokenize_tests' + os.extsep + 'txt'))
+    # original: tokenize(f.readline)
+    import psyco
+    psyco.bind(GenToken.generate_tokens)
+    tokenize_obj(f.readline)
+    f.close()
+
+if __name__ == "__main__":
+    test_main()

Added: psyco/v2/dist/test/tokenize_mod.py
==============================================================================
--- (empty file)
+++ psyco/v2/dist/test/tokenize_mod.py	Sun Feb 1 12:15:34 2009
@@ -0,0 +1,288 @@
+"""Tokenization help for Python programs.
+
+generate_tokens(readline) is a generator that breaks a stream of
+text into Python tokens.  It accepts a readline-like method which is called
+repeatedly to get the next line of input (or "" for EOF).  It generates
+5-tuples with these members:
+
+    the token type (see token.py)
+    the token (a string)
+    the starting (row, column) indices of the token (a 2-tuple of ints)
+    the ending (row, column) indices of the token (a 2-tuple of ints)
+    the original line (string)
+
+It is designed to match the working of the Python tokenizer exactly, except
+that it produces COMMENT tokens for comments and gives type OP for all
+operators
+
+Older entry points
+    tokenize_loop(readline, tokeneater)
+    tokenize(readline, tokeneater=printtoken)
+are the same, except instead of generating tokens, tokeneater is a callback
+function to which the 5 fields described above are passed as 5 arguments,
+each time a new token is found."""
+
+__author__ = 'Ka-Ping Yee <pi...@lf...>'
+__credits__ = \
+    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
+
+import string, re
+from token import *
+
+import token
+__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
+           "generate_tokens", "NL", "untokenize"]
+del x
+del token
+
+COMMENT = N_TOKENS
+tok_name[COMMENT] = 'COMMENT'
+NL = N_TOKENS + 1
+tok_name[NL] = 'NL'
+N_TOKENS += 2
+
+def group(*choices): return '(' + '|'.join(choices) + ')'
+def any(*choices): return group(*choices) + '*'
+def maybe(*choices): return group(*choices) + '?'
+
+Whitespace = r'[ \f\t]*'
+Comment = r'#[^\r\n]*'
+Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
+Name = r'[a-zA-Z_]\w*'
+
+Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
+Octnumber = r'0[0-7]*[lL]?'
+Decnumber = r'[1-9]\d*[lL]?'
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
+Exponent = r'[eE][-+]?\d+'
+Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
+Expfloat = r'\d+' + Exponent
+Floatnumber = group(Pointfloat, Expfloat)
+Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
+Number = group(Imagnumber, Floatnumber, Intnumber)
+
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
+Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
+# Single-line ' or " string.
+String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
+                 r"//=?",
+                 r"[+\-*/%&|^=<>]=?",
+                 r"~")
+
+Bracket = '[][(){}]'
+Special = group(r'\r?\n', r'[:;.,`@]')
+Funny = group(Operator, Bracket, Special)
+
+PlainToken = group(Number, Funny, String, Name)
+Token = Ignore + PlainToken
+
+# First (or only) line of ' or " string.
+ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+                group("'", r'\\\r?\n'),
+                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+                group('"', r'\\\r?\n'))
+PseudoExtras = group(r'\\\r?\n', Comment, Triple)
+PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
+
+tokenprog, pseudoprog, single3prog, double3prog = map(
+    re.compile, (Token, PseudoToken, Single3, Double3))
+endprogs = {"'": re.compile(Single), '"': re.compile(Double),
+            "'''": single3prog, '"""': double3prog,
+            "r'''": single3prog, 'r"""': double3prog,
+            "u'''": single3prog, 'u"""': double3prog,
+            "ur'''": single3prog, 'ur"""': double3prog,
+            "R'''": single3prog, 'R"""': double3prog,
+            "U'''": single3prog, 'U"""': double3prog,
+            "uR'''": single3prog, 'uR"""': double3prog,
+            "Ur'''": single3prog, 'Ur"""': double3prog,
+            "UR'''": single3prog, 'UR"""': double3prog,
+            'r': None, 'R': None, 'u': None, 'U': None}
+
+triple_quoted = {}
+for t in ("'''", '"""',
+          "r'''", 'r"""', "R'''", 'R"""',
+          "u'''", 'u"""', "U'''", 'U"""',
+          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
+          "uR'''", 'uR"""', "UR'''", 'UR"""'):
+    triple_quoted[t] = t
+single_quoted = {}
+for t in ("'", '"',
+          "r'", 'r"', "R'", 'R"',
+          "u'", 'u"', "U'", 'U"',
+          "ur'", 'ur"', "Ur'", 'Ur"',
+          "uR'", 'uR"', "UR'", 'UR"' ):
+    single_quoted[t] = t
+
+tabsize = 8
+
+class TokenError(Exception): pass
+
+class StopTokenizing(Exception): pass
+
+def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
+    print "%d,%d-%d,%d:\t%s\t%s" % \
+        (srow, scol, erow, ecol, tok_name[type], repr(token))
+
+def tokenize_loop_obj(readline, tokeneater):
+    genobj = GenToken(readline)
+    for token_info in genobj:
+        tokeneater(*token_info)
+
+def tokenize_obj(readline, tokeneater=printtoken):
+    try:
+        tokenize_loop_obj(readline, tokeneater)
+    except StopTokenizing:
+        pass
+
+import psyco
+
+class GenToken(psyco.compact):
+
+    def __init__ (self, readline):
+        self.readline = readline
+        self.result = []
+
+    def __iter__ (self):
+        del self.result[:]
+        self.generate_tokens()
+        return self.result.__iter__()
+
+    def push (self, *args):
+        self.result.append (args)
+
+    def generate_tokens(self):
+        self.lnum = self.parenlev = self.continued = 0
+        self.namechars, self.numchars = string.ascii_letters + '_', '0123456789'
+        self.contstr, self.needcont = '', 0
+        self.contline = None
+        self.indents = [0]
+
+        while 1:                                   # loop over lines in stream
+            try:
+                self.line = self.readline()
+            except StopIteration:
+                self.line = ''
+            self.lnum = self.lnum + 1
+            self.pos, self.max = 0, len(self.line)
+
+            if self.contstr:                       # self.continued string
+                if not self.line:
+                    raise TokenError, ("EOF in multi-line string", self.strstart)
+                self.endmatch = self.endprog.match(self.line)
+                if self.endmatch:
+                    self.pos = self.end = self.endmatch.end(0)
+                    self.push (STRING, self.contstr + self.line[:self.end],
+                               self.strstart, (self.lnum, self.end), self.contline + self.line)
+                    self.contstr, self.needcont = '', 0
+                    self.contline = None
+                elif self.needcont and self.line[-2:] != '\\\n' and self.line[-3:] != '\\\r\n':
+                    self.push (ERRORTOKEN, self.contstr + self.line,
+                               self.strstart, (self.lnum, len(self.line)), self.contline)
+                    self.contstr = ''
+                    self.contline = None
+                    continue
+                else:
+                    self.contstr = self.contstr + self.line
+                    self.contline = self.contline + self.line
+                    continue
+
+            elif self.parenlev == 0 and not self.continued:  # new statement
+                if not self.line: break
+                self.column = 0
+                while self.pos < self.max:         # measure leading whitespace
+                    if self.line[self.pos] == ' ': self.column = self.column + 1
+                    elif self.line[self.pos] == '\t': self.column = (self.column/tabsize + 1)*tabsize
+                    elif self.line[self.pos] == '\f': self.column = 0
+                    else: break
+                    self.pos = self.pos + 1
+                if self.pos == self.max: break
+
+                if self.line[self.pos] in '#\r\n':  # skip comments or blank lines
+                    self.push ((NL, COMMENT)[self.line[self.pos] == '#'], self.line[self.pos:],
+                               (self.lnum, self.pos), (self.lnum, len(self.line)), self.line)
+                    continue
+
+                if self.column > self.indents[-1]:  # count self.indents or dedents
+                    self.indents.append(self.column)
+                    self.push (INDENT, self.line[:self.pos], (self.lnum, 0), (self.lnum, self.pos), self.line)
+                while self.column < self.indents[-1]:
+                    if self.column not in self.indents:
+                        raise IndentationError(
+                            "unindent does not match any outer indentation level",
+                            ("<tokenize>", self.lnum, self.pos, self.line))
+                    self.indents = self.indents[:-1]
+                    self.push (DEDENT, '', (self.lnum, self.pos), (self.lnum, self.pos), self.line)
+
+            else:                                  # self.continued statement
+                if not self.line:
+                    raise TokenError, ("EOF in multi-line statement", (self.lnum, 0))
+                self.continued = 0
+
+            while self.pos < self.max:
+                self.pseudomatch = pseudoprog.match(self.line, self.pos)
+                if self.pseudomatch:               # scan for tokens
+                    self.start, self.end = self.pseudomatch.span(1)
+                    self.spos, self.epos, self.pos = (self.lnum, self.start), (self.lnum, self.end), self.end
+                    self.token, self.initial = self.line[self.start:self.end], self.line[self.start]
+
+                    if self.initial in self.numchars or \
+                       (self.initial == '.' and self.token != '.'):  # ordinary number
+                        self.push (NUMBER, self.token, self.spos, self.epos, self.line)
+                    elif self.initial in '\r\n':
+                        self.push (self.parenlev > 0 and NL or NEWLINE,
+                                   self.token, self.spos, self.epos, self.line)
+                    elif self.initial == '#':
+                        self.push (COMMENT, self.token, self.spos, self.epos, self.line)
+                    elif self.token in triple_quoted:
+                        self.endprog = endprogs[self.token]
+                        self.endmatch = self.endprog.match(self.line, self.pos)
+                        if self.endmatch:                    # all on one line
+                            self.pos = self.endmatch.end(0)
+                            self.token = self.line[self.start:self.pos]
+                            self.push (STRING, self.token, self.spos, (self.lnum, self.pos), self.line)
+                        else:
+                            self.strstart = (self.lnum, self.start)  # multiple lines
+                            self.contstr = self.line[self.start:]
+                            self.contline = self.line
+                            break
+                    elif self.initial in single_quoted or \
+                         self.token[:2] in single_quoted or \
+                         self.token[:3] in single_quoted:
+                        if self.token[-1] == '\n':           # self.continued string
+                            self.strstart = (self.lnum, self.start)
+                            self.endprog = (endprogs[self.initial] or endprogs[self.token[1]] or
+                                            endprogs[self.token[2]])
+                            self.contstr, self.needcont = self.line[self.start:], 1
+                            self.contline = self.line
+                            break
+                        else:                                # ordinary string
+                            self.push (STRING, self.token, self.spos, self.epos, self.line)
+                    elif self.initial in self.namechars:     # ordinary name
+                        self.push (NAME, self.token, self.spos, self.epos, self.line)
+                    elif self.initial == '\\':               # self.continued stmt
+                        self.continued = 1
+                    else:
+                        if self.initial in '([{': self.parenlev = self.parenlev + 1
+                        elif self.initial in ')]}': self.parenlev = self.parenlev - 1
+                        self.push (OP, self.token, self.spos, self.epos, self.line)
+                else:
+                    self.push (ERRORTOKEN, self.line[self.pos],
+                               (self.lnum, self.pos), (self.lnum, self.pos+1), self.line)
+                    self.pos = self.pos + 1
+
+        for self.indent in self.indents[1:]:       # pop remaining indent levels
+            self.push (DEDENT, '', (self.lnum, 0), (self.lnum, 0), '')
+        self.push (ENDMARKER, '', (self.lnum, 0), (self.lnum, 0), '')
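For reference, the reproducer can also be driven by hand, outside of
regrtest; a minimal stand-alone session might look like the following
sketch (the input file name is illustrative; any Python source file
will do):

    # hypothetical stand-alone run of the reproducer, not part of the commit
    from tokenize_mod import GenToken, tokenize_obj
    import psyco

    psyco.bind(GenToken.generate_tokens)  # compile the method that triggers the bug
    f = open('tokenize_mod.py')           # any Python source as input
    tokenize_obj(f.readline)              # printtoken writes one line per token
    f.close()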