From: <nr...@us...> - 2008-06-06 20:48:50
|
Revision: 4548 http://jython.svn.sourceforge.net/jython/?rev=4548&view=rev Author: nriley Date: 2008-06-06 13:48:42 -0700 (Fri, 06 Jun 2008) Log Message: ----------- sync sre opcodes with Python 2.4; implement SRE_OP_GROUPREF_EXISTS Modified Paths: -------------- branches/asm/Lib/sre_compile.py branches/asm/src/org/python/modules/sre/SRE_STATE.java Removed Paths: ------------- branches/asm/Lib/test/test_re.py Modified: branches/asm/Lib/sre_compile.py =================================================================== --- branches/asm/Lib/sre_compile.py 2008-06-06 18:12:44 UTC (rev 4547) +++ branches/asm/Lib/sre_compile.py 2008-06-06 20:48:42 UTC (rev 4548) @@ -1,495 +1,526 @@ -# -# Secret Labs' Regular Expression Engine -# -# convert template to internal format -# -# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. -# - -"""Internal support module for sre""" - -import _sre, sys - -from sre_constants import * - -assert _sre.MAGIC == MAGIC, "SRE module mismatch" - -if _sre.CODESIZE == 2: - MAXCODE = 65535 -else: - MAXCODE = 0xFFFFFFFFL - -def _compile(code, pattern, flags): - # internal: compile a (sub)pattern - emit = code.append - for op, av in pattern: - if op in (LITERAL, NOT_LITERAL): - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - emit(_sre.getlower(av, flags)) - else: - emit(OPCODES[op]) - emit(av) - elif op is IN: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - def fixup(literal, flags=flags): - return _sre.getlower(literal, flags) - else: - emit(OPCODES[op]) - fixup = lambda x: x - skip = len(code); emit(0) - _compile_charset(av, flags, code, fixup) - code[skip] = len(code) - skip - elif op is ANY: - if flags & SRE_FLAG_DOTALL: - emit(OPCODES[ANY_ALL]) - else: - emit(OPCODES[ANY]) - elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): - if flags & SRE_FLAG_TEMPLATE: - raise error, "internal: unsupported template operator" - emit(OPCODES[REPEAT]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif _simple(av) and op != REPEAT: - if op == MAX_REPEAT: - emit(OPCODES[REPEAT_ONE]) - else: - emit(OPCODES[MIN_REPEAT_ONE]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - else: - emit(OPCODES[REPEAT]) - skip = len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - code[skip] = len(code) - skip - if op == MAX_REPEAT: - emit(OPCODES[MAX_UNTIL]) - else: - emit(OPCODES[MIN_UNTIL]) - elif op is SUBPATTERN: - if av[0]: - emit(OPCODES[MARK]) - emit((av[0]-1)*2) - # _compile_info(code, av[1], flags) - _compile(code, av[1], flags) - if av[0]: - emit(OPCODES[MARK]) - emit((av[0]-1)*2+1) - elif op in (SUCCESS, FAILURE): - emit(OPCODES[op]) - elif op in (ASSERT, ASSERT_NOT): - emit(OPCODES[op]) - skip = len(code); emit(0) - if av[0] >= 0: - emit(0) # look ahead - else: - lo, hi = av[1].getwidth() - if lo != hi: - raise error, "look-behind requires fixed-width pattern" - emit(lo) # look behind - _compile(code, av[1], flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif op is CALL: - emit(OPCODES[op]) - skip = len(code); emit(0) - _compile(code, av, flags) - emit(OPCODES[SUCCESS]) - code[skip] = len(code) - skip - elif op is AT: - emit(OPCODES[op]) - if flags & SRE_FLAG_MULTILINE: - av = AT_MULTILINE.get(av, av) - if flags & SRE_FLAG_LOCALE: - av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: - av = AT_UNICODE.get(av, av) - emit(ATCODES[av]) - elif op is BRANCH: - emit(OPCODES[op]) - tail = [] - for av in av[1]: - skip = len(code); emit(0) - # _compile_info(code, av, flags) - _compile(code, av, flags) - emit(OPCODES[JUMP]) - tail.append(len(code)); emit(0) - code[skip] = len(code) - skip - emit(0) # end of branch - for tail in tail: - code[tail] = len(code) - tail - elif op is CATEGORY: - emit(OPCODES[op]) - if flags & SRE_FLAG_LOCALE: - av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: - av = CH_UNICODE[av] - emit(CHCODES[av]) - elif op is GROUPREF: - if flags & SRE_FLAG_IGNORECASE: - emit(OPCODES[OP_IGNORE[op]]) - else: - emit(OPCODES[op]) - emit(av-1) - else: - raise ValueError, ("unsupported operand type", op) - -def _compile_charset(charset, flags, code, fixup=None): - # compile charset subprogram - emit = code.append - if fixup is None: - fixup = lambda x: x - for op, av in _optimize_charset(charset, fixup): - emit(OPCODES[op]) - if op is NEGATE: - pass - elif op is LITERAL: - emit(fixup(av)) - elif op is RANGE: - emit(fixup(av[0])) - emit(fixup(av[1])) - elif op is CHARSET: - code.extend(av) - elif op is BIGCHARSET: - code.extend(av) - elif op is CATEGORY: - if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) - elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) - else: - raise error, "internal: unsupported set operator" - emit(OPCODES[FAILURE]) - -def _optimize_charset(charset, fixup): - # internal: optimize character set - out = [] - charmap = [0]*256 - try: - for op, av in charset: - if op is NEGATE: - out.append((op, av)) - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could append to charmap tail - return charset # cannot compress - except IndexError: - # character set contains unicode characters - return _optimize_unicode(charset, fixup) - # compress character map - i = p = n = 0 - runs = [] - for c in charmap: - if c: - if n == 0: - p = i - n = n + 1 - elif n: - runs.append((p, n)) - n = 0 - i = i + 1 - if n: - runs.append((p, n)) - if len(runs) <= 2: - # use literal/range - for p, n in runs: - if n == 1: - out.append((LITERAL, p)) - else: - out.append((RANGE, (p, p+n-1))) - if len(out) < len(charset): - return out - else: - # use bitmap - data = _mk_bitmap(charmap) - out.append((CHARSET, data)) - return out - return charset - -def _mk_bitmap(bits): - data = [] - if _sre.CODESIZE == 2: - start = (1, 0) - else: - start = (1L, 0L) - m, v = start - for c in bits: - if c: - v = v + m - m = m << 1 - if m > MAXCODE: - data.append(v) - m, v = start - return data - -# To represent a big charset, first a bitmap of all characters in the -# set is constructed. Then, this bitmap is sliced into chunks of 256 -# characters, duplicate chunks are eliminitated, and each chunk is -# given a number. In the compiled expression, the charset is -# represented by a 16-bit word sequence, consisting of one word for -# the number of different chunks, a sequence of 256 bytes (128 words) -# of chunk numbers indexed by their original chunk position, and a -# sequence of chunks (16 words each). - -# Compression is normally good: in a typical charset, large ranges of -# Unicode will be either completely excluded (e.g. if only cyrillic -# letters are to be matched), or completely included (e.g. if large -# subranges of Kanji match). These ranges will be represented by -# chunks of all one-bits or all zero-bits. - -# Matching can be also done efficiently: the more significant byte of -# the Unicode character is an index into the chunk number, and the -# less significant byte is a bit index in the chunk (just like the -# CHARSET matching). - -# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets -# of the basic multilingual plane; an efficient representation -# for all of UTF-16 has not yet been developed. This means, -# in particular, that negated charsets cannot be represented as -# bigcharsets. - -def _optimize_unicode(charset, fixup): - try: - import array - except ImportError: - return charset - charmap = [0]*65536 - negate = 0 - try: - for op, av in charset: - if op is NEGATE: - negate = 1 - elif op is LITERAL: - charmap[fixup(av)] = 1 - elif op is RANGE: - for i in range(fixup(av[0]), fixup(av[1])+1): - charmap[i] = 1 - elif op is CATEGORY: - # XXX: could expand category - return charset # cannot compress - except IndexError: - # non-BMP characters - return charset - if negate: - if sys.maxunicode != 65535: - # XXX: negation does not work with big charsets - return charset - for i in range(65536): - charmap[i] = not charmap[i] - comps = {} - mapping = [0]*256 - block = 0 - data = [] - for i in range(256): - chunk = tuple(charmap[i*256:(i+1)*256]) - new = comps.setdefault(chunk, block) - mapping[i] = new - if new == block: - block = block + 1 - data = data + _mk_bitmap(chunk) - header = [block] - if _sre.CODESIZE == 2: - code = 'H' - else: - code = 'I' - # Convert block indices to byte array of 256 bytes - mapping = array.array('b', mapping).tostring() - # Convert byte array to word array - mapping = array.array(code, mapping) - - # remove this assertion, because we need to represent 2 byte - # characters with a 32 bit integer, due to the lack of support for - # unsigned ints (or using char for this purpose) in PyArray.java; - # this is just much easier - - # assert mapping.itemsize == _sre.CODESIZE - header = header + mapping.tolist() - data[0:0] = header - return [(BIGCHARSET, data)] - -def _simple(av): - # check if av is a "simple" operator - lo, hi = av[2].getwidth() - if lo == 0 and hi == MAXREPEAT: - raise error, "nothing to repeat" - return lo == hi == 1 and av[2][0][0] != SUBPATTERN - -def _compile_info(code, pattern, flags): - # internal: compile an info block. in the current version, - # this contains min/max pattern width, and an optional literal - # prefix or a character map - lo, hi = pattern.getwidth() - if lo == 0: - return # not worth it - # look for a literal prefix - prefix = [] - prefix_skip = 0 - charset = [] # not used - if not (flags & SRE_FLAG_IGNORECASE): - # look for literal prefix - for op, av in pattern.data: - if op is LITERAL: - if len(prefix) == prefix_skip: - prefix_skip = prefix_skip + 1 - prefix.append(av) - elif op is SUBPATTERN and len(av[1]) == 1: - op, av = av[1][0] - if op is LITERAL: - prefix.append(av) - else: - break - else: - break - # if no prefix, look for charset prefix - if not prefix and pattern.data: - op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] - if op is LITERAL: - charset.append((op, av)) - elif op is BRANCH: - c = [] - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - c.append((op, av)) - else: - break - else: - charset = c - elif op is BRANCH: - c = [] - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - c.append((op, av)) - else: - break - else: - charset = c - elif op is IN: - charset = av -## if prefix: -## print "*** PREFIX", prefix, prefix_skip -## if charset: -## print "*** CHARSET", charset - # add an info block - emit = code.append - emit(OPCODES[INFO]) - skip = len(code); emit(0) - # literal flag - mask = 0 - if prefix: - mask = SRE_INFO_PREFIX - if len(prefix) == prefix_skip == len(pattern.data): - mask = mask + SRE_INFO_LITERAL - elif charset: - mask = mask + SRE_INFO_CHARSET - emit(mask) - # pattern length - if lo < MAXCODE: - emit(lo) - else: - emit(MAXCODE) - prefix = prefix[:MAXCODE] - if hi < MAXCODE: - emit(hi) - else: - emit(0) - # add literal prefix - if prefix: - emit(len(prefix)) # length - emit(prefix_skip) # skip - code.extend(prefix) - # generate overlap table - table = [-1] + ([0]*len(prefix)) - for i in range(len(prefix)): - table[i+1] = table[i]+1 - while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: - table[i+1] = table[table[i+1]-1]+1 - code.extend(table[1:]) # don't store first entry - elif charset: - _compile_charset(charset, flags, code) - code[skip] = len(code) - skip - -try: - unicode -except NameError: - STRING_TYPES = (type(""),) -else: - STRING_TYPES = (type(""), type(unicode(""))) - -def isstring(obj): - for tp in STRING_TYPES: - if isinstance(obj, tp): - return 1 - return 0 - -def _code(p, flags): - - flags = p.pattern.flags | flags - code = [] - - # compile info block - _compile_info(code, p, flags) - - # compile the pattern - _compile(code, p.data, flags) - - code.append(OPCODES[SUCCESS]) - - return code - -def compile(p, flags=0): - # internal: convert pattern list to internal format - - if isstring(p): - import sre_parse - pattern = p - p = sre_parse.parse(p, flags) - else: - pattern = None - - code = _code(p, flags) - - # print code - - # XXX: <fl> get rid of this limitation! - assert p.pattern.groups <= 100,\ - "sorry, but this version only supports 100 named groups" - - # map in either direction - groupindex = p.pattern.groupdict - indexgroup = [None] * p.pattern.groups - for k, i in groupindex.items(): - indexgroup[i] = k - - return _sre.compile( - pattern, flags, code, - p.pattern.groups-1, - groupindex, indexgroup - ) +# +# Secret Labs' Regular Expression Engine +# +# convert template to internal format +# +# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. +# +# See the sre.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +import _sre, sys + +from sre_constants import * + +assert _sre.MAGIC == MAGIC, "SRE module mismatch" + +if _sre.CODESIZE == 2: + MAXCODE = 65535 +else: + MAXCODE = 0xFFFFFFFFL + +def _identityfunction(x): + return x + +def _compile(code, pattern, flags): + # internal: compile a (sub)pattern + emit = code.append + _len = len + LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1} + REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1} + SUCCESS_CODES = {SUCCESS:1, FAILURE:1} + ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1} + for op, av in pattern: + if op in LITERAL_CODES: + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + emit(_sre.getlower(av, flags)) + else: + emit(OPCODES[op]) + emit(av) + elif op is IN: + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + def fixup(literal, flags=flags): + return _sre.getlower(literal, flags) + else: + emit(OPCODES[op]) + fixup = _identityfunction + skip = _len(code); emit(0) + _compile_charset(av, flags, code, fixup) + code[skip] = _len(code) - skip + elif op is ANY: + if flags & SRE_FLAG_DOTALL: + emit(OPCODES[ANY_ALL]) + else: + emit(OPCODES[ANY]) + elif op in REPEATING_CODES: + if flags & SRE_FLAG_TEMPLATE: + raise error, "internal: unsupported template operator" + emit(OPCODES[REPEAT]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = _len(code) - skip + elif _simple(av) and op is not REPEAT: + if op is MAX_REPEAT: + emit(OPCODES[REPEAT_ONE]) + else: + emit(OPCODES[MIN_REPEAT_ONE]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(OPCODES[SUCCESS]) + code[skip] = _len(code) - skip + else: + emit(OPCODES[REPEAT]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = _len(code) - skip + if op is MAX_REPEAT: + emit(OPCODES[MAX_UNTIL]) + else: + emit(OPCODES[MIN_UNTIL]) + elif op is SUBPATTERN: + if av[0]: + emit(OPCODES[MARK]) + emit((av[0]-1)*2) + # _compile_info(code, av[1], flags) + _compile(code, av[1], flags) + if av[0]: + emit(OPCODES[MARK]) + emit((av[0]-1)*2+1) + elif op in SUCCESS_CODES: + emit(OPCODES[op]) + elif op in ASSERT_CODES: + emit(OPCODES[op]) + skip = _len(code); emit(0) + if av[0] >= 0: + emit(0) # look ahead + else: + lo, hi = av[1].getwidth() + if lo != hi: + raise error, "look-behind requires fixed-width pattern" + emit(lo) # look behind + _compile(code, av[1], flags) + emit(OPCODES[SUCCESS]) + code[skip] = _len(code) - skip + elif op is CALL: + emit(OPCODES[op]) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(OPCODES[SUCCESS]) + code[skip] = _len(code) - skip + elif op is AT: + emit(OPCODES[op]) + if flags & SRE_FLAG_MULTILINE: + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & SRE_FLAG_UNICODE: + av = AT_UNICODE.get(av, av) + emit(ATCODES[av]) + elif op is BRANCH: + emit(OPCODES[op]) + tail = [] + tailappend = tail.append + for av in av[1]: + skip = _len(code); emit(0) + # _compile_info(code, av, flags) + _compile(code, av, flags) + emit(OPCODES[JUMP]) + tailappend(_len(code)); emit(0) + code[skip] = _len(code) - skip + emit(0) # end of branch + for tail in tail: + code[tail] = _len(code) - tail + elif op is CATEGORY: + emit(OPCODES[op]) + if flags & SRE_FLAG_LOCALE: + av = CH_LOCALE[av] + elif flags & SRE_FLAG_UNICODE: + av = CH_UNICODE[av] + emit(CHCODES[av]) + elif op is GROUPREF: + if flags & SRE_FLAG_IGNORECASE: + emit(OPCODES[OP_IGNORE[op]]) + else: + emit(OPCODES[op]) + emit(av-1) + elif op is GROUPREF_EXISTS: + emit(OPCODES[op]) + emit(av[0]-1) + skipyes = _len(code); emit(0) + _compile(code, av[1], flags) + if av[2]: + emit(OPCODES[JUMP]) + skipno = _len(code); emit(0) + code[skipyes] = _len(code) - skipyes + 1 + _compile(code, av[2], flags) + code[skipno] = _len(code) - skipno + else: + code[skipyes] = _len(code) - skipyes + 1 + else: + raise ValueError, ("unsupported operand type", op) + +def _compile_charset(charset, flags, code, fixup=None): + # compile charset subprogram + emit = code.append + if fixup is None: + fixup = _identityfunction + for op, av in _optimize_charset(charset, fixup): + emit(OPCODES[op]) + if op is NEGATE: + pass + elif op is LITERAL: + emit(fixup(av)) + elif op is RANGE: + emit(fixup(av[0])) + emit(fixup(av[1])) + elif op is CHARSET: + code.extend(av) + elif op is BIGCHARSET: + code.extend(av) + elif op is CATEGORY: + if flags & SRE_FLAG_LOCALE: + emit(CHCODES[CH_LOCALE[av]]) + elif flags & SRE_FLAG_UNICODE: + emit(CHCODES[CH_UNICODE[av]]) + else: + emit(CHCODES[av]) + else: + raise error, "internal: unsupported set operator" + emit(OPCODES[FAILURE]) + +def _optimize_charset(charset, fixup): + # internal: optimize character set + out = [] + outappend = out.append + charmap = [0]*256 + try: + for op, av in charset: + if op is NEGATE: + outappend((op, av)) + elif op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in range(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is CATEGORY: + # XXX: could append to charmap tail + return charset # cannot compress + except IndexError: + # character set contains unicode characters + return _optimize_unicode(charset, fixup) + # compress character map + i = p = n = 0 + runs = [] + runsappend = runs.append + for c in charmap: + if c: + if n == 0: + p = i + n = n + 1 + elif n: + runsappend((p, n)) + n = 0 + i = i + 1 + if n: + runsappend((p, n)) + if len(runs) <= 2: + # use literal/range + for p, n in runs: + if n == 1: + outappend((LITERAL, p)) + else: + outappend((RANGE, (p, p+n-1))) + if len(out) < len(charset): + return out + else: + # use bitmap + data = _mk_bitmap(charmap) + outappend((CHARSET, data)) + return out + return charset + +def _mk_bitmap(bits): + data = [] + dataappend = data.append + if _sre.CODESIZE == 2: + start = (1, 0) + else: + start = (1L, 0L) + m, v = start + for c in bits: + if c: + v = v + m + m = m + m + if m > MAXCODE: + dataappend(v) + m, v = start + return data + +# To represent a big charset, first a bitmap of all characters in the +# set is constructed. Then, this bitmap is sliced into chunks of 256 +# characters, duplicate chunks are eliminitated, and each chunk is +# given a number. In the compiled expression, the charset is +# represented by a 16-bit word sequence, consisting of one word for +# the number of different chunks, a sequence of 256 bytes (128 words) +# of chunk numbers indexed by their original chunk position, and a +# sequence of chunks (16 words each). + +# Compression is normally good: in a typical charset, large ranges of +# Unicode will be either completely excluded (e.g. if only cyrillic +# letters are to be matched), or completely included (e.g. if large +# subranges of Kanji match). These ranges will be represented by +# chunks of all one-bits or all zero-bits. + +# Matching can be also done efficiently: the more significant byte of +# the Unicode character is an index into the chunk number, and the +# less significant byte is a bit index in the chunk (just like the +# CHARSET matching). + +# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets +# of the basic multilingual plane; an efficient representation +# for all of UTF-16 has not yet been developed. This means, +# in particular, that negated charsets cannot be represented as +# bigcharsets. + +def _optimize_unicode(charset, fixup): + try: + import array + except ImportError: + return charset + charmap = [0]*65536 + negate = 0 + try: + for op, av in charset: + if op is NEGATE: + negate = 1 + elif op is LITERAL: + charmap[fixup(av)] = 1 + elif op is RANGE: + for i in xrange(fixup(av[0]), fixup(av[1])+1): + charmap[i] = 1 + elif op is CATEGORY: + # XXX: could expand category + return charset # cannot compress + except IndexError: + # non-BMP characters + return charset + if negate: + if sys.maxunicode != 65535: + # XXX: negation does not work with big charsets + return charset + for i in xrange(65536): + charmap[i] = not charmap[i] + comps = {} + mapping = [0]*256 + block = 0 + data = [] + for i in xrange(256): + chunk = tuple(charmap[i*256:(i+1)*256]) + new = comps.setdefault(chunk, block) + mapping[i] = new + if new == block: + block = block + 1 + data = data + _mk_bitmap(chunk) + header = [block] + if _sre.CODESIZE == 2: + code = 'H' + else: + code = 'I' + # Convert block indices to byte array of 256 bytes + mapping = array.array('b', mapping).tostring() + # Convert byte array to word array + mapping = array.array(code, mapping) + + # remove this assertion, because we need to represent 2 byte + # characters with a 32 bit integer, due to the lack of support for + # unsigned ints (or using char for this purpose) in PyArray.java; + # this is just much easier + + # assert mapping.itemsize == _sre.CODESIZE + header = header + mapping.tolist() + data[0:0] = header + return [(BIGCHARSET, data)] + +def _simple(av): + # check if av is a "simple" operator + lo, hi = av[2].getwidth() + if lo == 0 and hi == MAXREPEAT: + raise error, "nothing to repeat" + return lo == hi == 1 and av[2][0][0] != SUBPATTERN + +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width, and an optional literal + # prefix or a character map + lo, hi = pattern.getwidth() + if lo == 0: + return # not worth it + # look for a literal prefix + prefix = [] + prefixappend = prefix.append + prefix_skip = 0 + charset = [] # not used + charsetappend = charset.append + if not (flags & SRE_FLAG_IGNORECASE): + # look for literal prefix + for op, av in pattern.data: + if op is LITERAL: + if len(prefix) == prefix_skip: + prefix_skip = prefix_skip + 1 + prefixappend(av) + elif op is SUBPATTERN and len(av[1]) == 1: + op, av = av[1][0] + if op is LITERAL: + prefixappend(av) + else: + break + else: + break + # if no prefix, look for charset prefix + if not prefix and pattern.data: + op, av = pattern.data[0] + if op is SUBPATTERN and av[1]: + op, av = av[1][0] + if op is LITERAL: + charsetappend((op, av)) + elif op is BRANCH: + c = [] + cappend = c.append + for p in av[1]: + if not p: + break + op, av = p[0] + if op is LITERAL: + cappend((op, av)) + else: + break + else: + charset = c + elif op is BRANCH: + c = [] + cappend = c.append + for p in av[1]: + if not p: + break + op, av = p[0] + if op is LITERAL: + cappend((op, av)) + else: + break + else: + charset = c + elif op is IN: + charset = av +## if prefix: +## print "*** PREFIX", prefix, prefix_skip +## if charset: +## print "*** CHARSET", charset + # add an info block + emit = code.append + emit(OPCODES[INFO]) + skip = len(code); emit(0) + # literal flag + mask = 0 + if prefix: + mask = SRE_INFO_PREFIX + if len(prefix) == prefix_skip == len(pattern.data): + mask = mask + SRE_INFO_LITERAL + elif charset: + mask = mask + SRE_INFO_CHARSET + emit(mask) + # pattern length + if lo < MAXCODE: + emit(lo) + else: + emit(MAXCODE) + prefix = prefix[:MAXCODE] + if hi < MAXCODE: + emit(hi) + else: + emit(0) + # add literal prefix + if prefix: + emit(len(prefix)) # length + emit(prefix_skip) # skip + code.extend(prefix) + # generate overlap table + table = [-1] + ([0]*len(prefix)) + for i in xrange(len(prefix)): + table[i+1] = table[i]+1 + while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: + table[i+1] = table[table[i+1]-1]+1 + code.extend(table[1:]) # don't store first entry + elif charset: + _compile_charset(charset, flags, code) + code[skip] = len(code) - skip + +try: + unicode +except NameError: + STRING_TYPES = (type(""),) +else: + STRING_TYPES = (type(""), type(unicode(""))) + +def isstring(obj): + for tp in STRING_TYPES: + if isinstance(obj, tp): + return 1 + return 0 + +def _code(p, flags): + + flags = p.pattern.flags | flags + code = [] + + # compile info block + _compile_info(code, p, flags) + + # compile the pattern + _compile(code, p.data, flags) + + code.append(OPCODES[SUCCESS]) + + return code + +def compile(p, flags=0): + # internal: convert pattern list to internal format + + if isstring(p): + import sre_parse + pattern = p + p = sre_parse.parse(p, flags) + else: + pattern = None + + code = _code(p, flags) + + # print code + + # XXX: <fl> get rid of this limitation! + if p.pattern.groups > 100: + raise AssertionError( + "sorry, but this version only supports 100 named groups" + ) + + # map in either direction + groupindex = p.pattern.groupdict + indexgroup = [None] * p.pattern.groups + for k, i in groupindex.items(): + indexgroup[i] = k + + return _sre.compile( + pattern, flags, code, + p.pattern.groups-1, + groupindex, indexgroup + ) Deleted: branches/asm/Lib/test/test_re.py =================================================================== --- branches/asm/Lib/test/test_re.py 2008-06-06 18:12:44 UTC (rev 4547) +++ branches/asm/Lib/test/test_re.py 2008-06-06 20:48:42 UTC (rev 4548) @@ -1,304 +0,0 @@ -#!/usr/local/bin/python -# -*- mode: python -*- -# $Id$ - -import sys -sys.path=['.']+sys.path - -from test_support import * -import re -import sys, os, string #, traceback - -print_test('re (test_re.py)', 1) - -# Misc tests from Tim Peters' re.doc - -print_test('basic functions', 2) - -print_test('search', 3) -assert re.search('x*', 'axx').span(0) == (0, 0) -assert re.search('x*', 'axx').span() == (0, 0) -assert re.search('x+', 'axx').span(0) == (1, 3) -assert re.search('x+', 'axx').span() == (1, 3) -assert re.search('x', 'aaa') == None - -print_test('match') -assert re.match('a*', 'xxx').span(0) == (0, 0) -assert re.match('a*', 'xxx').span() == (0, 0) -assert re.match('x*', 'xxxa').span(0) == (0, 3) -assert re.match('x*', 'xxxa').span() == (0, 3) -assert re.match('a+', 'xxx') == None - - -print_test('sub') -assert re.sub("(?i)b+", "x", "bbbb BBBB") == 'x x' - -def bump_num(matchobj): - int_value = int(matchobj.group(0)) - return str(int_value + 1) - -assert re.sub(r'\d+', bump_num, '08.2 -2 23x99y') == '9.3 -3 24x100y' -assert re.sub(r'\d+', bump_num, '08.2 -2 23x99y', 3) == '9.3 -3 23x99y' - -assert re.sub('.', lambda m: r"\n", 'x') == '\\n' -assert re.sub('.', r"\n", 'x') == '\n' - -s = r"\1\1" -assert re.sub('(.)', s, 'x') == 'xx' -assert re.sub('(.)', re.escape(s), 'x') == s -assert re.sub('(.)', lambda m: s, 'x') == s - -assert re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx') == 'xxxx' -assert re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx') == 'xxxx' -assert re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx') == 'xxxx' -assert re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx') == 'xxxx' - -assert re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a') == '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D' -assert re.sub('a', '\t\n\v\r\f\a', 'a') == '\t\n\v\r\f\a' -assert re.sub('a', '\t\n\v\r\f\a', 'a') == (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)) - -assert re.sub('^\s*', 'X', 'test') == 'Xtest' - -assert re.sub('a', 'b', 'aaaaa') == 'bbbbb' -assert re.sub('a', 'b', 'aaaaa', 1) == 'baaaa' - - -print_test('symbolic references', 4) - -try: - re.sub('(?P<a>x)', '\g<a', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)', '\g<', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)', '\g', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)', '\g<a a>', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)', '\g<1a1>', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)', '\g<ab>', 'xx') -except (IndexError, re.error), reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)|(?P<b>y)', '\g<b>', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - -try: - re.sub('(?P<a>x)|(?P<b>y)', '\\2', 'xx') -except re.error, reason: - pass -else: - raise TestFailed, "symbolic reference" - - -print_test('subn', 3) - -assert re.subn("(?i)b+", "x", "bbbb BBBB") == ('x x', 2) -assert re.subn("b+", "x", "bbbb BBBB") == ('x BBBB', 1) -assert re.subn("b+", "x", "xyz") == ('xyz', 0) -assert re.subn("b*", "x", "xyz") == ('xxxyxzx', 4) -assert re.subn("b*", "x", "xyz", 2) == ('xxxyz', 2) - -print_test('split') - -assert re.split(":", ":a:b::c") == ['', 'a', 'b', '', 'c'] -assert re.split(":*", ":a:b::c") == ['', 'a', 'b', 'c'] -assert re.split("(:*)", ":a:b::c") == ['', ':', 'a', ':', 'b', '::', 'c'] -assert re.split("(?::*)", ":a:b::c") == ['', 'a', 'b', 'c'] -assert re.split("(:)*", ":a:b::c") == ['', ':', 'a', ':', 'b', ':', 'c'] -assert re.split("([b:]+)", ":a:b::c") == ['', ':', 'a', ':b::', 'c'] -assert re.split("(b)|(:+)", ":a:b::c") == \ - ['', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c'] -assert re.split("(?:b)|(?::+)", ":a:b::c") == ['', 'a', '', '', 'c'] - -assert re.split(":", ":a:b::c", 2) == ['', 'a', 'b::c'] -assert re.split(':', 'a:b:c:d', 2) == ['a', 'b', 'c:d'] - -assert re.split("(:)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c'] -assert re.split("(:*)", ":a:b::c", 2) == ['', ':', 'a', ':', 'b::c'] - -print_test('match.groups', 2) - - -# No groups at all -m = re.match('a', 'a') ; assert m.groups() == () -# A single group -m = re.match('(a)', 'a') ; assert m.groups() == ('a',) - -pat = re.compile('((a)|(b))(c)?') -assert pat.match('a').groups() == ('a', 'a', None, None) -assert pat.match('b').groups() == ('b', None, 'b', None) -assert pat.match('ac').groups() == ('a', 'a', None, 'c') -assert pat.match('bc').groups() == ('b', None, 'b', 'c') - - - -print_test('match.group', 2) - -# A single group -m = re.match('(a)', 'a') -assert m.group(0) == 'a' ; assert m.group(0) == 'a' -assert m.group(1) == 'a' ; assert m.group(1, 1) == ('a', 'a') - -pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') -assert pat.match('a').group(1, 2, 3) == ('a', None, None) -assert pat.match('b').group('a1', 'b2', 'c3') == (None, 'b', None) -assert pat.match('ac').group(1, 'b2', 3) == ('a', None, 'c') - -print_test('escape', 2) - -p="" -for i in range(0, 256): - p = p + chr(i) - assert re.match(re.escape(chr(i)), chr(i)) != None - assert re.match(re.escape(chr(i)), chr(i)).span() == (0,1) - -pat=re.compile( re.escape(p) ) -assert pat.match(p) != None -assert pat.match(p).span() == (0,256) - - -#if verbose: -# print 'Pickling a RegexObject instance' - -#import pickle -#pat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') -#s = pickle.dumps(pat) -#pat = pickle.loads(s) - -print_test('flags', 2) - -assert re.I == re.IGNORECASE -assert re.L == re.LOCALE -assert re.M == re.MULTILINE -assert re.S == re.DOTALL -assert re.X == re.VERBOSE - -for flags in [re.I, re.M, re.X, re.S, re.L]: - r = re.compile('^pattern$', flags) - -from re_tests import * - -print_test("symbolic groups in regex's", 2) -print_test("perl5-compatible regex's", 2) - -for t in tests: - sys.stdout.flush() - pattern=s=outcome=repl=expected=None - if len(t)==5: - pattern, s, outcome, repl, expected = t - elif len(t)==3: - pattern, s, outcome = t - else: - raise ValueError, ('Test tuples should have 3 or 5 fields',t) - - try: - obj=re.compile(pattern) - except re.error: - if outcome==SYNTAX_ERROR: pass # Expected a syntax error - else: - print '=== Syntax error:', t - except KeyboardInterrupt: raise KeyboardInterrupt - except: - print '*** Unexpected error ***', t - #if verbose: - # traceback.print_exc(file=sys.stdout) - else: - try: - result=obj.search(s) - except (re.error), msg: - print '=== Unexpected exception', t, repr(msg) - if outcome==SYNTAX_ERROR: - # This should have been a syntax error; forget it. - pass - elif outcome==FAIL: - if result is None: pass # No match, as expected - else: print '=== Succeeded incorrectly', t - elif outcome==SUCCEED: - if result is not None: - # Matched, as expected, so now we compute the - # result string and compare it to our expected result. - start, end = result.span(0) - vardict={'found': result.group(0), - 'groups': result.group(), - 'flags': result.re.flags} - for i in range(1, 100): - try: - gi = result.group(i) - # Special hack because else the string concat fails: - if gi is None: - gi = "None" - except IndexError: - gi = "Error" - vardict['g%d' % i] = gi - for i in result.re.groupindex.keys(): - try: - gi = result.group(i) - if gi is None: - gi = "None" - except IndexError: - gi = "Error" - vardict[i] = gi - repl=eval(repl, vardict) - if repl!=expected: - print '=== grouping error', t, - print repr(repl)+' should be '+repr(expected) - else: - print '=== Failed incorrectly', t - continue - - # Try the match with the search area limited to the extent - # of the match and see if it still succeeds. \B will - # break (because it won't match at the end or start of a - # string), so we'll ignore patterns that feature it. - - if pattern[:2]!='\\B' and pattern[-2:]!='\\B': - obj=re.compile(pattern) - result=obj.search(s, result.start(0), result.end(0)+1) - if result==None: - print '=== Failed on range-limited match', t - - # Try the match with IGNORECASE enabled, and check that it - # still succeeds. - obj=re.compile(pattern, re.IGNORECASE) - result=obj.search(s) - if result==None: - print '=== Fails on case-insensitive match', t - - # Try the match with LOCALE enabled, and check that it - # still succeeds. - obj=re.compile(pattern, re.LOCALE) - result=obj.search(s) - if result==None: - print '=== Fails on locale-sensitive match', t Modified: branches/asm/src/org/python/modules/sre/SRE_STATE.java =================================================================== --- branches/asm/src/org/python/modules/sre/SRE_STATE.java 2008-06-06 18:12:44 UTC (rev 4547) +++ branches/asm/src/org/python/modules/sre/SRE_STATE.java 2008-06-06 20:48:42 UTC (rev 4548) @@ -21,7 +21,7 @@ public class SRE_STATE { /* - * Generated from Python-2.3.5 like 'python headerToJava.py < Modules/sre_constants.h' + * Generated from Python-2.4.5 like 'python headerToJava.py < Modules/sre_constants.h' * where headerToJava.py contains the following code import sys for line in sys.stdin: @@ -45,24 +45,25 @@ public static final int SRE_OP_CHARSET = 10; public static final int SRE_OP_BIGCHARSET = 11; public static final int SRE_OP_GROUPREF = 12; - public static final int SRE_OP_GROUPREF_IGNORE = 13; - public static final int SRE_OP_IN = 14; - public static final int SRE_OP_IN_IGNORE = 15; - public static final int SRE_OP_INFO = 16; - public static final int SRE_OP_JUMP = 17; - public static final int SRE_OP_LITERAL = 18; - public static final int SRE_OP_LITERAL_IGNORE = 19; - public static final int SRE_OP_MARK = 20; - public static final int SRE_OP_MAX_UNTIL = 21; - public static final int SRE_OP_MIN_UNTIL = 22; - public static final int SRE_OP_NOT_LITERAL = 23; - public static final int SRE_OP_NOT_LITERAL_IGNORE = 24; - public static final int SRE_OP_NEGATE = 25; - public static final int SRE_OP_RANGE = 26; - public static final int SRE_OP_REPEAT = 27; - public static final int SRE_OP_REPEAT_ONE = 28; - public static final int SRE_OP_SUBPATTERN = 29; - public static final int SRE_OP_MIN_REPEAT_ONE = 30; + public static final int SRE_OP_GROUPREF_EXISTS = 13; + public static final int SRE_OP_GROUPREF_IGNORE = 14; + public static final int SRE_OP_IN = 15; + public static final int SRE_OP_IN_IGNORE = 16; + public static final int SRE_OP_INFO = 17; + public static final int SRE_OP_JUMP = 18; + public static final int SRE_OP_LITERAL = 19; + public static final int SRE_OP_LITERAL_IGNORE = 20; + public static final int SRE_OP_MARK = 21; + public static final int SRE_OP_MAX_UNTIL = 22; + public static final int SRE_OP_MIN_UNTIL = 23; + public static final int SRE_OP_NOT_LITERAL = 24; + public static final int SRE_OP_NOT_LITERAL_IGNORE = 25; + public static final int SRE_OP_NEGATE = 26; + public static final int SRE_OP_RANGE = 27; + public static final int SRE_OP_REPEAT = 28; + public static final int SRE_OP_REPEAT_ONE = 29; + public static final int SRE_OP_SUBPATTERN = 30; + public static final int SRE_OP_MIN_REPEAT_ONE = 31; public static final int SRE_AT_BEGINNING = 0; public static final int SRE_AT_BEGINNING_LINE = 1; public static final int SRE_AT_BEGINNING_STRING = 2; @@ -241,12 +242,12 @@ return false; } - private void mark_fini() { + private void mark_fini() { // XXX => data_stack_dealloc in 2.4 mark_stack = null; mark_stack_size = mark_stack_base = 0; } - private int mark_save(int lo, int hi) { + private int mark_save(int lo, int hi) { // XXX => data_stack_grow in 2.4 if (hi <= lo) return mark_stack_base; @@ -652,6 +653,18 @@ } pidx++; break; + + case SRE_OP_GROUPREF_EXISTS: + i = pattern[pidx]; + //TRACE(pidx, ptr, "GROUPREF_EXISTS " + i); + p = mark[i+i]; + e = mark[i+i+1]; + if (p == -1 || e == -1 || e < p) { + pidx += pattern[pidx + 1]; + break; + } + pidx += 2; + break; case SRE_OP_LITERAL_IGNORE: //TRACE(pidx, ptr, "LITERAL_IGNORE " + (int) pattern[pidx]); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |