Thread: [javascriptlint-commit] SF.net SVN: javascriptlint:[340] trunk/jsengine/tokenizer
Status: Beta
Brought to you by:
matthiasmiller
From: <mat...@us...> - 2013-10-09 21:35:02
|
Revision: 340 http://sourceforge.net/p/javascriptlint/code/340 Author: matthiasmiller Date: 2013-10-09 21:34:59 +0000 (Wed, 09 Oct 2013) Log Message: ----------- Move tokens into separate module Modified Paths: -------------- trunk/jsengine/tokenizer/__init__.py Added Paths: ----------- trunk/jsengine/tokenizer/tok.py Modified: trunk/jsengine/tokenizer/__init__.py =================================================================== --- trunk/jsengine/tokenizer/__init__.py 2013-10-09 20:33:12 UTC (rev 339) +++ trunk/jsengine/tokenizer/__init__.py 2013-10-09 21:34:59 UTC (rev 340) @@ -1,5 +1,6 @@ # vim: sw=4 ts=4 et from jsengine import JSSyntaxError +import tok _WHITESPACE = u'\u0020\t\u000B\u000C\u00A0\uFFFF' _LINETERMINATOR = u'\u000A\u000D\u2028\u2029' @@ -10,131 +11,9 @@ u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + \ u'$_' -_ALL_TOKENS = [] +_KEYWORDS = tok.getkeywords() +_PUNCTUATOR_TREE = tok.get_punctuator_tree() -class _Token(object): - def __init__(self, category, literal): - self._category = category - self._literal = literal - _ALL_TOKENS.append(self) - - def __repr__(self): - return '_Token(%r, %r)' % (self._category, self._literal) - - @property - def category(self): - return self._category - - @property - def literal(self): - return self._literal - -class _Tokens(object): - # Symbols - ASSIGN_ULSHIFT = _Token('sym', '<<<=') - ASSIGN_URSHIFT = _Token('sym', '>>>=') - EQ_STRICT = _Token('sym', '===') - NE_STRICT = _Token('sym', '!==') - URSHIFT = _Token('sym', '>>>') - ASSIGN_LSHIFT = _Token('sym', '<<=') - ASSIGN_RSHIFT = _Token('sym', '>>=') - LE = _Token('sym', '<=') - GE = _Token('sym', '>=') - EQ = _Token('sym', '==') - NE = _Token('sym', '!=') - INC = _Token('sym', '++') - DEC = _Token('sym', '--') - LSHIFT = _Token('sym', '<<') - RSHIFT = _Token('sym', '>>') - LOGICAL_AND = _Token('sym', '&&') - LOGICAL_OR = _Token('sym', '||') - ASSIGN_ADD = _Token('sym', '+=') - ASSIGN_SUB = _Token('sym', '-=') - ASSIGN_MUL = _Token('sym', '*=') - ASSIGN_MOD = _Token('sym', '%=') - ASSIGN_BIT_AND = _Token('sym', '&=') - ASSIGN_BIT_OR = _Token('sym', '|=') - ASSIGN_BIT_XOR = _Token('sym', '^=') - ASSIGN_DIV = _Token('sym', '/=') - LBRACE = _Token('sym', '{') - RBRACE = _Token('sym', '}') - LPAREN = _Token('sym', '(') - RPAREN = _Token('sym', ')') - LBRACKET = _Token('sym', '[') - RBRACKET = _Token('sym', ']') - DOT = _Token('sym', '.') - SEMI = _Token('sym', ';') - COMMA = _Token('sym', ',') - LT = _Token('sym', '<') - GT = _Token('sym', '>') - ADD = _Token('sym', '+') - SUB = _Token('sym', '-') - MUL = _Token('sym', '*') - MOD = _Token('sym', '%') - BIT_OR = _Token('sym', '|') - BIT_AND = _Token('sym', '&') - BIT_XOR = _Token('sym', '^') - LOGICAL_NOT = _Token('sym', '!') - BIT_NOT = _Token('sym', '~') - QUESTION = _Token('sym', '?') - COLON = _Token('sym', ':') - ASSIGN = _Token('sym', '=') - DIV = _Token('sym', '/') - - # Keywords - BREAK = _Token('kw', 'break') - CASE = _Token('kw', 'case') - CATCH = _Token('kw', 'catch') - CONTINUE = _Token('kw', 'continue') - DEFAULT = _Token('kw', 'default') - DELETE = _Token('kw', 'delete') - DO = _Token('kw', 'do') - ELSE = _Token('kw', 'else') - FALSE = _Token('kw', 'false') - FINALLY = _Token('kw', 'finally') - FOR = _Token('kw', 'for') - FUNCTION = _Token('kw', 'function') - IF = _Token('kw', 'if') - IN = _Token('kw', 'in') - INSTANCEOF = _Token('kw', 'instanceof') - NEW = _Token('kw', 'new') - NULL = _Token('kw', 'null') - RETURN = _Token('kw', 'return') - SWITCH = _Token('kw', 'switch') - THIS = _Token('kw', 'this') - THROW = _Token('kw', 'throw') - TRUE = _Token('kw', 'true') - TYPEOF = _Token('kw', 'typeof') - TRY = _Token('kw', 'try') - VAR = _Token('kw', 'var') - VOID = _Token('kw', 'void') - WHILE = _Token('kw', 'while') - WITH = _Token('kw', 'with') - - # Other tokens - C_COMMENT = _Token('other', '/*') - CPP_COMMENT = _Token('other', '//') - HTML_COMMENT = _Token('other', '<!--') - ERROR = _Token('other', 'err') - EOF = _Token('other', 'eof') - EOL = _Token('other', 'eol') - NAME = _Token('other', '(name)') - NUMBER = _Token('other', '(num)') - OPERATOR = _Token('other', '(op)') - REGEXP = _Token('other', '(re)') - SPACE = _Token('other', '(sp)') - STRING = _Token('other', '(str)') - -tok = _Tokens() -_KEYWORDS = dict((t.literal, t) for t in _ALL_TOKENS if t.category == 'kw') -_PUNCTUATOR_TREE = {} -for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'): - d = _PUNCTUATOR_TREE - for c in punctuator.literal: - d = d.setdefault(c, {}) - assert not None in d, punctuator.literal - d[None] = punctuator - class Token: def __init__(self, tok, atom=None): self.tok = tok @@ -282,7 +161,7 @@ peek.set_offset(start_offset, end_offset) self._peeked.append(peek) - assert isinstance(peek.tok, _Token), repr(peek.tok) + assert isinstance(peek.tok, tok.TokenType), repr(peek.tok) if peek.tok not in (tok.EOL, tok.SPACE, tok.C_COMMENT, tok.CPP_COMMENT, tok.HTML_COMMENT): Copied: trunk/jsengine/tokenizer/tok.py (from rev 339, trunk/jsengine/tokenizer/__init__.py) =================================================================== --- trunk/jsengine/tokenizer/tok.py (rev 0) +++ trunk/jsengine/tokenizer/tok.py 2013-10-09 21:34:59 UTC (rev 340) @@ -0,0 +1,128 @@ +# vim: sw=4 ts=4 et +_ALL_TOKENS = [] + +class TokenType(object): + def __init__(self, category, literal): + self._category = category + self._literal = literal + _ALL_TOKENS.append(self) + + def __repr__(self): + return 'TokenType(%r, %r)' % (self._category, self._literal) + + @property + def category(self): + return self._category + + @property + def literal(self): + return self._literal + +# Symbols +ASSIGN_ULSHIFT = TokenType('sym', '<<<=') +ASSIGN_URSHIFT = TokenType('sym', '>>>=') +EQ_STRICT = TokenType('sym', '===') +NE_STRICT = TokenType('sym', '!==') +URSHIFT = TokenType('sym', '>>>') +ASSIGN_LSHIFT = TokenType('sym', '<<=') +ASSIGN_RSHIFT = TokenType('sym', '>>=') +LE = TokenType('sym', '<=') +GE = TokenType('sym', '>=') +EQ = TokenType('sym', '==') +NE = TokenType('sym', '!=') +INC = TokenType('sym', '++') +DEC = TokenType('sym', '--') +LSHIFT = TokenType('sym', '<<') +RSHIFT = TokenType('sym', '>>') +LOGICAL_AND = TokenType('sym', '&&') +LOGICAL_OR = TokenType('sym', '||') +ASSIGN_ADD = TokenType('sym', '+=') +ASSIGN_SUB = TokenType('sym', '-=') +ASSIGN_MUL = TokenType('sym', '*=') +ASSIGN_MOD = TokenType('sym', '%=') +ASSIGN_BIT_AND = TokenType('sym', '&=') +ASSIGN_BIT_OR = TokenType('sym', '|=') +ASSIGN_BIT_XOR = TokenType('sym', '^=') +ASSIGN_DIV = TokenType('sym', '/=') +LBRACE = TokenType('sym', '{') +RBRACE = TokenType('sym', '}') +LPAREN = TokenType('sym', '(') +RPAREN = TokenType('sym', ')') +LBRACKET = TokenType('sym', '[') +RBRACKET = TokenType('sym', ']') +DOT = TokenType('sym', '.') +SEMI = TokenType('sym', ';') +COMMA = TokenType('sym', ',') +LT = TokenType('sym', '<') +GT = TokenType('sym', '>') +ADD = TokenType('sym', '+') +SUB = TokenType('sym', '-') +MUL = TokenType('sym', '*') +MOD = TokenType('sym', '%') +BIT_OR = TokenType('sym', '|') +BIT_AND = TokenType('sym', '&') +BIT_XOR = TokenType('sym', '^') +LOGICAL_NOT = TokenType('sym', '!') +BIT_NOT = TokenType('sym', '~') +QUESTION = TokenType('sym', '?') +COLON = TokenType('sym', ':') +ASSIGN = TokenType('sym', '=') +DIV = TokenType('sym', '/') + +# Keywords +BREAK = TokenType('kw', 'break') +CASE = TokenType('kw', 'case') +CATCH = TokenType('kw', 'catch') +CONTINUE = TokenType('kw', 'continue') +DEFAULT = TokenType('kw', 'default') +DELETE = TokenType('kw', 'delete') +DO = TokenType('kw', 'do') +ELSE = TokenType('kw', 'else') +FALSE = TokenType('kw', 'false') +FINALLY = TokenType('kw', 'finally') +FOR = TokenType('kw', 'for') +FUNCTION = TokenType('kw', 'function') +IF = TokenType('kw', 'if') +IN = TokenType('kw', 'in') +INSTANCEOF = TokenType('kw', 'instanceof') +NEW = TokenType('kw', 'new') +NULL = TokenType('kw', 'null') +RETURN = TokenType('kw', 'return') +SWITCH = TokenType('kw', 'switch') +THIS = TokenType('kw', 'this') +THROW = TokenType('kw', 'throw') +TRUE = TokenType('kw', 'true') +TYPEOF = TokenType('kw', 'typeof') +TRY = TokenType('kw', 'try') +VAR = TokenType('kw', 'var') +VOID = TokenType('kw', 'void') +WHILE = TokenType('kw', 'while') +WITH = TokenType('kw', 'with') + +# Other tokens +C_COMMENT = TokenType('other', '/*') +CPP_COMMENT = TokenType('other', '//') +HTML_COMMENT = TokenType('other', '<!--') +ERROR = TokenType('other', 'err') +EOF = TokenType('other', 'eof') +EOL = TokenType('other', 'eol') +NAME = TokenType('other', '(name)') +NUMBER = TokenType('other', '(num)') +OPERATOR = TokenType('other', '(op)') +REGEXP = TokenType('other', '(re)') +SPACE = TokenType('other', '(sp)') +STRING = TokenType('other', '(str)') + +def getkeywords(): + return dict((t.literal, t) for t in _ALL_TOKENS if t.category == 'kw') + +def get_punctuator_tree(): + tree = {} + for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'): + leaf = tree + for c in punctuator.literal: + leaf = leaf.setdefault(c, {}) + assert not None in leaf, punctuator.literal + leaf[None] = punctuator + return tree + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mat...@us...> - 2013-10-09 22:41:50
|
Revision: 342 http://sourceforge.net/p/javascriptlint/code/342 Author: matthiasmiller Date: 2013-10-09 22:41:46 +0000 (Wed, 09 Oct 2013) Log Message: ----------- Distinguish between characters and strings. Modified Paths: -------------- trunk/jsengine/tokenizer/__init__.py trunk/jsengine/tokenizer/tok.py Modified: trunk/jsengine/tokenizer/__init__.py =================================================================== --- trunk/jsengine/tokenizer/__init__.py 2013-10-09 21:51:03 UTC (rev 341) +++ trunk/jsengine/tokenizer/__init__.py 2013-10-09 22:41:46 UTC (rev 342) @@ -14,6 +14,14 @@ _KEYWORDS = tok.getkeywords() _PUNCTUATOR_TREE = tok.get_punctuator_tree() +def _str_has_chr(s, c): + assert len(c) <= 1 + return c in s + +def _chr_to_str(c): + assert len(c) <= 1 + return c + class Token: def __init__(self, tok, atom=None): self.tok = tok @@ -51,23 +59,35 @@ return self._offset >= len(self._content) def readchr(self): - if self._offset < len(self._content): + c = self.peekchr() + if not c: + raise JSSyntaxError(self.get_offset(), 'unexpected_eof') + self._offset += 1 + return c + + def readchrif(self, expect): + if self.peekchr() == expect: self._offset += 1 - return self._content[self._offset - 1] - raise JSSyntaxError(self.get_offset()-1, 'unexpected_eof') + return expect + return '' - def readchrif(self, seq): - s = self.peekchrif(seq) + def readchrin(self, seq): + s = self.peekchrin(seq) if s: - assert len(s) == 1 self._offset += 1 return s - def peekchrif(self, seq): - if self._offset < len(self._content) and \ - self._content[self._offset] in seq: + def peekchr(self): + if self._offset < len(self._content): return self._content[self._offset] + return '' + def peekchrin(self, seq): + c = self.peekchr() + if c and _str_has_chr(seq, c): + return c + return '' + def readtextif(self, text): """ Returns the string if found. Otherwise returns None. """ @@ -182,7 +202,7 @@ # TODO: Validate and save while True: - c = stream.readchrif(_IDENT) + c = stream.readchrin(_IDENT) if not c: break @@ -194,15 +214,17 @@ if stream.eof(): return Token(tok.EOF) + stream.watch_reads() + c = stream.readchr() # WHITESPACE - if c in _WHITESPACE or c in _LINETERMINATOR: - linebreak = c in _LINETERMINATOR + if _str_has_chr(_WHITESPACE, c) or _str_has_chr(_LINETERMINATOR, c): + linebreak = _str_has_chr(_LINETERMINATOR, c) while True: - if stream.readchrif(_LINETERMINATOR): + if stream.readchrin(_LINETERMINATOR): linebreak = True - elif stream.readchrif(_WHITESPACE): + elif stream.readchrin(_WHITESPACE): pass else: break @@ -213,24 +235,24 @@ # COMMENTS if c == '/': - if stream.peekchrif("/"): - while not stream.eof() and not stream.peekchrif(_LINETERMINATOR): + if stream.peekchr() == '/': + while not stream.eof() and not stream.peekchrin(_LINETERMINATOR): stream.readchr() return Token(tok.CPP_COMMENT) - if stream.peekchrif("*"): + if stream.peekchr() == '*': linebreak = False while True: if stream.eof(): return Token(tok.ERROR, atom='unterminated_comment') c = stream.readchr() - if c in _LINETERMINATOR: + if _str_has_chr(_LINETERMINATOR, c): linebreak = True elif c == '*' and stream.readchrif('/'): return Token(tok.C_COMMENT) return Token(tok.EOF) elif c == '<': if stream.readtextif('!--'): - while not stream.eof() and not stream.peekchrif(_LINETERMINATOR): + while not stream.eof() and not stream.peekchrin(_LINETERMINATOR): stream.readchr() return Token(tok.HTML_COMMENT) @@ -245,66 +267,64 @@ c = stream.readchr() elif c == quote: return Token(tok.STRING, atom=s) - s += c + s += _chr_to_str(c) # NUMBERS - if c in _DIGITS or (c == '.' and stream.peekchrif(_DIGITS)): + if _str_has_chr(_DIGITS, c) or (c == '.' and stream.peekchrin(_DIGITS)): s = c # TODO - stream.watch_reads() - if c == '0' and stream.readchrif('xX'): + if c == '0' and stream.readchrin('xX'): # Hex - while stream.readchrif(_HEX_DIGITS): + while stream.readchrin(_HEX_DIGITS): pass - elif c == '0' and stream.readchrif(_DIGITS): + elif c == '0' and stream.readchrin(_DIGITS): # Octal - while stream.readchrif(_DIGITS): + while stream.readchrin(_DIGITS): pass else: # Decimal if c != '.': - while stream.readchrif(_DIGITS): + while stream.readchrin(_DIGITS): pass stream.readchrif('.') - while stream.readchrif(_DIGITS): + while stream.readchrin(_DIGITS): pass - if stream.readchrif('eE'): - stream.readchrif('+-') - if not stream.readchrif(_DIGITS): + if stream.readchrin('eE'): + stream.readchrin('+-') + if not stream.readchrin(_DIGITS): raise JSSyntaxError(stream.get_offset(), 'syntax_error') - while stream.readchrif(_DIGITS): + while stream.readchrin(_DIGITS): pass - if stream.peekchrif(_IDENT): + if stream.peekchrin(_IDENT): return Token(tok.ERROR) - atom = s + stream.get_watched_reads() + atom = stream.get_watched_reads() return Token(tok.NUMBER, atom=atom) if c in _PUNCTUATOR_TREE: d = _PUNCTUATOR_TREE[c] while True: - c = stream.readchrif(list(d.keys())) + c = stream.readchrin(u''.join(d.keys())) if c: d = d[c] else: break try: - return Token(d[None]) + return Token(d['']) except KeyError: print('oops') raise JSSyntaxError(stream.get_offset(), 'syntax_error') - if c in _IDENT: - s = '' - while c: - s += c - c = stream.readchrif(_IDENT + _DIGITS) - if s in _KEYWORDS: - return Token(_KEYWORDS[s], atom=s) - elif s: - return Token(tok.NAME, atom=s) + if _str_has_chr(_IDENT, c): + while stream.readchrin(_IDENT + _DIGITS): + pass + atom = stream.get_watched_reads() + if atom in _KEYWORDS: + return Token(_KEYWORDS[atom], atom=atom) + return Token(tok.NAME, atom=atom) + raise JSSyntaxError(stream.get_offset(), 'unexpected_char', - { 'char': c }) + { 'char': _chr_to_str(c) }) Modified: trunk/jsengine/tokenizer/tok.py =================================================================== --- trunk/jsengine/tokenizer/tok.py 2013-10-09 21:51:03 UTC (rev 341) +++ trunk/jsengine/tokenizer/tok.py 2013-10-09 22:41:46 UTC (rev 342) @@ -123,6 +123,6 @@ for c in punctuator.literal: leaf = leaf.setdefault(c, {}) assert not None in leaf, punctuator.literal - leaf[None] = punctuator + leaf[''] = punctuator return tree This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mat...@us...> - 2013-10-10 14:07:44
|
Revision: 343 http://sourceforge.net/p/javascriptlint/code/343 Author: matthiasmiller Date: 2013-10-10 14:07:41 +0000 (Thu, 10 Oct 2013) Log Message: ----------- Fully distinguish between chars and strings. Modified Paths: -------------- trunk/jsengine/tokenizer/__init__.py trunk/jsengine/tokenizer/tok.py Modified: trunk/jsengine/tokenizer/__init__.py =================================================================== --- trunk/jsengine/tokenizer/__init__.py 2013-10-09 22:41:46 UTC (rev 342) +++ trunk/jsengine/tokenizer/__init__.py 2013-10-10 14:07:41 UTC (rev 343) @@ -14,14 +14,43 @@ _KEYWORDS = tok.getkeywords() _PUNCTUATOR_TREE = tok.get_punctuator_tree() -def _str_has_chr(s, c): - assert len(c) <= 1 - return c in s +class _Char(object): + def __init__(self, u): + assert isinstance(u, int) or u is None, u + self._u = u -def _chr_to_str(c): - assert len(c) <= 1 - return c + @classmethod + def fromstr(cls, s, i): + return _Char(ord(s[i])) + @classmethod + def ord(cls, s): + return _Char(ord(s)) + + @property + def uval(self): + return self._u + + def tostr(self): + if self._u is None: + return unicode() + return unichr(self._u) + + def instr(self, s): + if self._u is None: + return False + return s.find(unichr(self._u)) != -1 + + def __hash__(self): + return hash(self._u) + + def __eq__(self, other): + assert isinstance(other, _Char), other + return self._u == other._u + + def __nonzero__(self): + return not self._u is None + class Token: def __init__(self, tok, atom=None): self.tok = tok @@ -69,7 +98,7 @@ if self.peekchr() == expect: self._offset += 1 return expect - return '' + return _Char(None) def readchrin(self, seq): s = self.peekchrin(seq) @@ -79,14 +108,14 @@ def peekchr(self): if self._offset < len(self._content): - return self._content[self._offset] - return '' + return _Char.fromstr(self._content, self._offset) + return _Char(None) def peekchrin(self, seq): c = self.peekchr() - if c and _str_has_chr(seq, c): + if c and c.instr(seq): return c - return '' + return _Char(None) def readtextif(self, text): """ Returns the string if found. Otherwise returns None. @@ -184,20 +213,20 @@ stream = self._stream while True: c = stream.readchr() - if c == '\\': + if c == _Char.ord('\\'): c = stream.readchr() - if c == '\n': + if c == _Char.ord('\n'): return Token(tok.ERROR) - elif c == '[': + elif c == _Char.ord('['): while True: c = stream.readchr() - if c == '\n': + if c == _Char.ord('\n'): return Token(tok.ERROR) - elif c == ']': + elif c == _Char.ord(']'): break - elif c == '\n': + elif c == _Char.ord('\n'): return Token(tok.ERROR) - elif c == '/': + elif c == _Char.ord('/'): break # TODO: Validate and save @@ -219,8 +248,8 @@ c = stream.readchr() # WHITESPACE - if _str_has_chr(_WHITESPACE, c) or _str_has_chr(_LINETERMINATOR, c): - linebreak = _str_has_chr(_LINETERMINATOR, c) + if c.instr(_WHITESPACE) or c.instr(_LINETERMINATOR): + linebreak = c.instr(_LINETERMINATOR) while True: if stream.readchrin(_LINETERMINATOR): linebreak = True @@ -234,49 +263,49 @@ return Token(tok.SPACE) # COMMENTS - if c == '/': - if stream.peekchr() == '/': + if c == _Char.ord('/'): + if stream.peekchr() == _Char.ord('/'): while not stream.eof() and not stream.peekchrin(_LINETERMINATOR): stream.readchr() return Token(tok.CPP_COMMENT) - if stream.peekchr() == '*': + if stream.peekchr() == _Char.ord('*'): linebreak = False while True: if stream.eof(): return Token(tok.ERROR, atom='unterminated_comment') c = stream.readchr() - if _str_has_chr(_LINETERMINATOR, c): + if c.instr(_LINETERMINATOR): linebreak = True - elif c == '*' and stream.readchrif('/'): + elif c == _Char.ord('*') and stream.readchrif(_Char.ord('/')): return Token(tok.C_COMMENT) return Token(tok.EOF) - elif c == '<': + elif c == _Char.ord('<'): if stream.readtextif('!--'): while not stream.eof() and not stream.peekchrin(_LINETERMINATOR): stream.readchr() return Token(tok.HTML_COMMENT) # STRING LITERALS - if c == '"' or c == "'": + if c == _Char.ord('"') or c == _Char.ord("'"): # TODO: Decode s = '' quote = c while True: c = stream.readchr() - if c == '\\': + if c == _Char.ord('\\'): c = stream.readchr() elif c == quote: return Token(tok.STRING, atom=s) - s += _chr_to_str(c) + s += c.tostr() # NUMBERS - if _str_has_chr(_DIGITS, c) or (c == '.' and stream.peekchrin(_DIGITS)): + if c.instr(_DIGITS) or (c == _Char.ord('.') and stream.peekchrin(_DIGITS)): s = c # TODO - if c == '0' and stream.readchrin('xX'): + if c == _Char.ord('0') and stream.readchrin('xX'): # Hex while stream.readchrin(_HEX_DIGITS): pass - elif c == '0' and stream.readchrin(_DIGITS): + elif c == _Char.ord('0') and stream.readchrin(_DIGITS): # Octal while stream.readchrin(_DIGITS): pass @@ -285,7 +314,7 @@ if c != '.': while stream.readchrin(_DIGITS): pass - stream.readchrif('.') + stream.readchrif(_Char.ord('.')) while stream.readchrin(_DIGITS): pass @@ -303,21 +332,22 @@ atom = stream.get_watched_reads() return Token(tok.NUMBER, atom=atom) - if c in _PUNCTUATOR_TREE: - d = _PUNCTUATOR_TREE[c] + if c.uval in _PUNCTUATOR_TREE: + d = _PUNCTUATOR_TREE[c.uval] while True: - c = stream.readchrin(u''.join(d.keys())) - if c: - d = d[c] + c = stream.peekchr() + if c and c.uval in d: + stream.readchr() + d = d[c.uval] else: break try: - return Token(d['']) + return Token(d[-1]) except KeyError: print('oops') raise JSSyntaxError(stream.get_offset(), 'syntax_error') - if _str_has_chr(_IDENT, c): + if c.instr(_IDENT): while stream.readchrin(_IDENT + _DIGITS): pass @@ -327,4 +357,4 @@ return Token(tok.NAME, atom=atom) raise JSSyntaxError(stream.get_offset(), 'unexpected_char', - { 'char': _chr_to_str(c) }) + { 'char': c.tostr() }) Modified: trunk/jsengine/tokenizer/tok.py =================================================================== --- trunk/jsengine/tokenizer/tok.py 2013-10-09 22:41:46 UTC (rev 342) +++ trunk/jsengine/tokenizer/tok.py 2013-10-10 14:07:41 UTC (rev 343) @@ -121,8 +121,8 @@ for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'): leaf = tree for c in punctuator.literal: - leaf = leaf.setdefault(c, {}) + leaf = leaf.setdefault(ord(c), {}) assert not None in leaf, punctuator.literal - leaf[''] = punctuator + leaf[-1] = punctuator return tree This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <mat...@us...> - 2013-10-10 15:28:35
|
Revision: 346 http://sourceforge.net/p/javascriptlint/code/346 Author: matthiasmiller Date: 2013-10-10 15:28:32 +0000 (Thu, 10 Oct 2013) Log Message: ----------- Refactor punctuator/keyword handling. Modified Paths: -------------- trunk/jsengine/tokenizer/__init__.py trunk/jsengine/tokenizer/tok.py Modified: trunk/jsengine/tokenizer/__init__.py =================================================================== --- trunk/jsengine/tokenizer/__init__.py 2013-10-10 15:25:47 UTC (rev 345) +++ trunk/jsengine/tokenizer/__init__.py 2013-10-10 15:28:32 UTC (rev 346) @@ -5,15 +5,11 @@ _WHITESPACE = u'\u0020\t\u000B\u000C\u00A0\uFFFF' _LINETERMINATOR = u'\u000A\u000D\u2028\u2029' _DIGITS = u'0123456789' -_DOT_DIGITS = [u'.%s' % digit for digit in _DIGITS] _HEX_DIGITS = _DIGITS + u'abcdefABCDEF' _IDENT = u'abcdefghijklmnopqrstuvwxyz' + \ u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + \ u'$_' -_KEYWORDS = tok.getkeywords() -_PUNCTUATOR_TREE = tok.get_punctuator_tree() - class _Char(object): def __init__(self, u): assert isinstance(u, int) or u is None, u @@ -179,7 +175,7 @@ def expect_identifiername(self): encountered = self.advance() - if encountered.tok in list(_KEYWORDS.values()): + if tok.keywords.has(encountered.tok) != -1: encountered.tok = tok.NAME if encountered.tok != tok.NAME: raise JSSyntaxError(encountered.start_offset, 'syntax_error') @@ -332,29 +328,28 @@ atom = stream.get_watched_reads() return Token(tok.NUMBER, atom=atom) - if c.uval in _PUNCTUATOR_TREE: - d = _PUNCTUATOR_TREE[c.uval] + if tok.punctuators.hasprefix(c.tostr()): + s = c.tostr() while True: c = stream.peekchr() - if c and c.uval in d: + if c and tok.punctuators.hasprefix(s + c.tostr()): + s += c.tostr() stream.readchr() - d = d[c.uval] else: break - try: - return Token(d[-1]) - except KeyError: - print('oops') + d = tok.punctuators.get(s) + if not d: raise JSSyntaxError(stream.get_offset(), 'syntax_error') - + return Token(d) if c.instr(_IDENT): while stream.readchrin(_IDENT + _DIGITS): pass atom = stream.get_watched_reads() - if atom in _KEYWORDS: - return Token(_KEYWORDS[atom], atom=atom) - return Token(tok.NAME, atom=atom) + tt = tok.keywords.get(atom, tok.NAME) + t = Token(tt) + t.atom = atom + return t raise JSSyntaxError(stream.get_offset(), 'unexpected_char', { 'char': c.tostr() }) Modified: trunk/jsengine/tokenizer/tok.py =================================================================== --- trunk/jsengine/tokenizer/tok.py 2013-10-10 15:25:47 UTC (rev 345) +++ trunk/jsengine/tokenizer/tok.py 2013-10-10 15:28:32 UTC (rev 346) @@ -10,12 +10,10 @@ def __repr__(self): return 'TokenType(%r, %r)' % (self._category, self._literal) - @property - def category(self): + def getcategory(self): return self._category - @property - def literal(self): + def getliteral(self): return self._literal # Symbols @@ -113,16 +111,44 @@ SPACE = TokenType('other', '(sp)') STRING = TokenType('other', '(str)') -def getkeywords(): - return dict((t.literal, t) for t in _ALL_TOKENS if t.category == 'kw') +# Freeze the list of keywords +_ALL_TOKENS = tuple(_ALL_TOKENS) -def get_punctuator_tree(): - tree = {} - for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'): - leaf = tree - for c in punctuator.literal: - leaf = leaf.setdefault(ord(c), {}) - assert not None in leaf, punctuator.literal - leaf[-1] = punctuator - return tree +class _Keywords(object): + def __init__(self): + self._d = {} + for tt in _ALL_TOKENS: + if tt.getcategory() == 'kw': + self._d[tt.getliteral()] = tt + def get(self, literal, default): + return self._d.get(literal, default) + + def has(self, tok): + for iter in self._d.values(): + if iter == tok: + return True + return False + +keywords = _Keywords() + +class _Punctuators(object): + def __init__(self): + self._prefixes = {} + self._punctuators = {} + + for t in _ALL_TOKENS: + if t.getcategory() == 'sym': + literal = t.getliteral() + for i in range(len(literal)): + prefix = literal[:i+1] + self._prefixes[prefix] = True + self._punctuators[literal] = t + + def hasprefix(self, prefix): + return self._prefixes.get(prefix, False) + + def get(self, literal): + return self._punctuators.get(literal) + +punctuators = _Punctuators() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |