Thread: [javascriptlint-commit] SF.net SVN: javascriptlint:[340] trunk/jsengine/tokenizer
Status: Beta
Brought to you by:
matthiasmiller
|
From: <mat...@us...> - 2013-10-09 21:35:02
|
Revision: 340
http://sourceforge.net/p/javascriptlint/code/340
Author: matthiasmiller
Date: 2013-10-09 21:34:59 +0000 (Wed, 09 Oct 2013)
Log Message:
-----------
Move tokens into separate module
Modified Paths:
--------------
trunk/jsengine/tokenizer/__init__.py
Added Paths:
-----------
trunk/jsengine/tokenizer/tok.py
Modified: trunk/jsengine/tokenizer/__init__.py
===================================================================
--- trunk/jsengine/tokenizer/__init__.py 2013-10-09 20:33:12 UTC (rev 339)
+++ trunk/jsengine/tokenizer/__init__.py 2013-10-09 21:34:59 UTC (rev 340)
@@ -1,5 +1,6 @@
# vim: sw=4 ts=4 et
from jsengine import JSSyntaxError
+import tok
_WHITESPACE = u'\u0020\t\u000B\u000C\u00A0\uFFFF'
_LINETERMINATOR = u'\u000A\u000D\u2028\u2029'
@@ -10,131 +11,9 @@
u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + \
u'$_'
-_ALL_TOKENS = []
+_KEYWORDS = tok.getkeywords()
+_PUNCTUATOR_TREE = tok.get_punctuator_tree()
-class _Token(object):
- def __init__(self, category, literal):
- self._category = category
- self._literal = literal
- _ALL_TOKENS.append(self)
-
- def __repr__(self):
- return '_Token(%r, %r)' % (self._category, self._literal)
-
- @property
- def category(self):
- return self._category
-
- @property
- def literal(self):
- return self._literal
-
-class _Tokens(object):
- # Symbols
- ASSIGN_ULSHIFT = _Token('sym', '<<<=')
- ASSIGN_URSHIFT = _Token('sym', '>>>=')
- EQ_STRICT = _Token('sym', '===')
- NE_STRICT = _Token('sym', '!==')
- URSHIFT = _Token('sym', '>>>')
- ASSIGN_LSHIFT = _Token('sym', '<<=')
- ASSIGN_RSHIFT = _Token('sym', '>>=')
- LE = _Token('sym', '<=')
- GE = _Token('sym', '>=')
- EQ = _Token('sym', '==')
- NE = _Token('sym', '!=')
- INC = _Token('sym', '++')
- DEC = _Token('sym', '--')
- LSHIFT = _Token('sym', '<<')
- RSHIFT = _Token('sym', '>>')
- LOGICAL_AND = _Token('sym', '&&')
- LOGICAL_OR = _Token('sym', '||')
- ASSIGN_ADD = _Token('sym', '+=')
- ASSIGN_SUB = _Token('sym', '-=')
- ASSIGN_MUL = _Token('sym', '*=')
- ASSIGN_MOD = _Token('sym', '%=')
- ASSIGN_BIT_AND = _Token('sym', '&=')
- ASSIGN_BIT_OR = _Token('sym', '|=')
- ASSIGN_BIT_XOR = _Token('sym', '^=')
- ASSIGN_DIV = _Token('sym', '/=')
- LBRACE = _Token('sym', '{')
- RBRACE = _Token('sym', '}')
- LPAREN = _Token('sym', '(')
- RPAREN = _Token('sym', ')')
- LBRACKET = _Token('sym', '[')
- RBRACKET = _Token('sym', ']')
- DOT = _Token('sym', '.')
- SEMI = _Token('sym', ';')
- COMMA = _Token('sym', ',')
- LT = _Token('sym', '<')
- GT = _Token('sym', '>')
- ADD = _Token('sym', '+')
- SUB = _Token('sym', '-')
- MUL = _Token('sym', '*')
- MOD = _Token('sym', '%')
- BIT_OR = _Token('sym', '|')
- BIT_AND = _Token('sym', '&')
- BIT_XOR = _Token('sym', '^')
- LOGICAL_NOT = _Token('sym', '!')
- BIT_NOT = _Token('sym', '~')
- QUESTION = _Token('sym', '?')
- COLON = _Token('sym', ':')
- ASSIGN = _Token('sym', '=')
- DIV = _Token('sym', '/')
-
- # Keywords
- BREAK = _Token('kw', 'break')
- CASE = _Token('kw', 'case')
- CATCH = _Token('kw', 'catch')
- CONTINUE = _Token('kw', 'continue')
- DEFAULT = _Token('kw', 'default')
- DELETE = _Token('kw', 'delete')
- DO = _Token('kw', 'do')
- ELSE = _Token('kw', 'else')
- FALSE = _Token('kw', 'false')
- FINALLY = _Token('kw', 'finally')
- FOR = _Token('kw', 'for')
- FUNCTION = _Token('kw', 'function')
- IF = _Token('kw', 'if')
- IN = _Token('kw', 'in')
- INSTANCEOF = _Token('kw', 'instanceof')
- NEW = _Token('kw', 'new')
- NULL = _Token('kw', 'null')
- RETURN = _Token('kw', 'return')
- SWITCH = _Token('kw', 'switch')
- THIS = _Token('kw', 'this')
- THROW = _Token('kw', 'throw')
- TRUE = _Token('kw', 'true')
- TYPEOF = _Token('kw', 'typeof')
- TRY = _Token('kw', 'try')
- VAR = _Token('kw', 'var')
- VOID = _Token('kw', 'void')
- WHILE = _Token('kw', 'while')
- WITH = _Token('kw', 'with')
-
- # Other tokens
- C_COMMENT = _Token('other', '/*')
- CPP_COMMENT = _Token('other', '//')
- HTML_COMMENT = _Token('other', '<!--')
- ERROR = _Token('other', 'err')
- EOF = _Token('other', 'eof')
- EOL = _Token('other', 'eol')
- NAME = _Token('other', '(name)')
- NUMBER = _Token('other', '(num)')
- OPERATOR = _Token('other', '(op)')
- REGEXP = _Token('other', '(re)')
- SPACE = _Token('other', '(sp)')
- STRING = _Token('other', '(str)')
-
-tok = _Tokens()
-_KEYWORDS = dict((t.literal, t) for t in _ALL_TOKENS if t.category == 'kw')
-_PUNCTUATOR_TREE = {}
-for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'):
- d = _PUNCTUATOR_TREE
- for c in punctuator.literal:
- d = d.setdefault(c, {})
- assert not None in d, punctuator.literal
- d[None] = punctuator
-
class Token:
def __init__(self, tok, atom=None):
self.tok = tok
@@ -282,7 +161,7 @@
peek.set_offset(start_offset, end_offset)
self._peeked.append(peek)
- assert isinstance(peek.tok, _Token), repr(peek.tok)
+ assert isinstance(peek.tok, tok.TokenType), repr(peek.tok)
if peek.tok not in (tok.EOL, tok.SPACE,
tok.C_COMMENT, tok.CPP_COMMENT,
tok.HTML_COMMENT):
Copied: trunk/jsengine/tokenizer/tok.py (from rev 339, trunk/jsengine/tokenizer/__init__.py)
===================================================================
--- trunk/jsengine/tokenizer/tok.py (rev 0)
+++ trunk/jsengine/tokenizer/tok.py 2013-10-09 21:34:59 UTC (rev 340)
@@ -0,0 +1,128 @@
+# vim: sw=4 ts=4 et
+_ALL_TOKENS = []
+
+class TokenType(object):
+ def __init__(self, category, literal):
+ self._category = category
+ self._literal = literal
+ _ALL_TOKENS.append(self)
+
+ def __repr__(self):
+ return 'TokenType(%r, %r)' % (self._category, self._literal)
+
+ @property
+ def category(self):
+ return self._category
+
+ @property
+ def literal(self):
+ return self._literal
+
+# Symbols
+ASSIGN_ULSHIFT = TokenType('sym', '<<<=')
+ASSIGN_URSHIFT = TokenType('sym', '>>>=')
+EQ_STRICT = TokenType('sym', '===')
+NE_STRICT = TokenType('sym', '!==')
+URSHIFT = TokenType('sym', '>>>')
+ASSIGN_LSHIFT = TokenType('sym', '<<=')
+ASSIGN_RSHIFT = TokenType('sym', '>>=')
+LE = TokenType('sym', '<=')
+GE = TokenType('sym', '>=')
+EQ = TokenType('sym', '==')
+NE = TokenType('sym', '!=')
+INC = TokenType('sym', '++')
+DEC = TokenType('sym', '--')
+LSHIFT = TokenType('sym', '<<')
+RSHIFT = TokenType('sym', '>>')
+LOGICAL_AND = TokenType('sym', '&&')
+LOGICAL_OR = TokenType('sym', '||')
+ASSIGN_ADD = TokenType('sym', '+=')
+ASSIGN_SUB = TokenType('sym', '-=')
+ASSIGN_MUL = TokenType('sym', '*=')
+ASSIGN_MOD = TokenType('sym', '%=')
+ASSIGN_BIT_AND = TokenType('sym', '&=')
+ASSIGN_BIT_OR = TokenType('sym', '|=')
+ASSIGN_BIT_XOR = TokenType('sym', '^=')
+ASSIGN_DIV = TokenType('sym', '/=')
+LBRACE = TokenType('sym', '{')
+RBRACE = TokenType('sym', '}')
+LPAREN = TokenType('sym', '(')
+RPAREN = TokenType('sym', ')')
+LBRACKET = TokenType('sym', '[')
+RBRACKET = TokenType('sym', ']')
+DOT = TokenType('sym', '.')
+SEMI = TokenType('sym', ';')
+COMMA = TokenType('sym', ',')
+LT = TokenType('sym', '<')
+GT = TokenType('sym', '>')
+ADD = TokenType('sym', '+')
+SUB = TokenType('sym', '-')
+MUL = TokenType('sym', '*')
+MOD = TokenType('sym', '%')
+BIT_OR = TokenType('sym', '|')
+BIT_AND = TokenType('sym', '&')
+BIT_XOR = TokenType('sym', '^')
+LOGICAL_NOT = TokenType('sym', '!')
+BIT_NOT = TokenType('sym', '~')
+QUESTION = TokenType('sym', '?')
+COLON = TokenType('sym', ':')
+ASSIGN = TokenType('sym', '=')
+DIV = TokenType('sym', '/')
+
+# Keywords
+BREAK = TokenType('kw', 'break')
+CASE = TokenType('kw', 'case')
+CATCH = TokenType('kw', 'catch')
+CONTINUE = TokenType('kw', 'continue')
+DEFAULT = TokenType('kw', 'default')
+DELETE = TokenType('kw', 'delete')
+DO = TokenType('kw', 'do')
+ELSE = TokenType('kw', 'else')
+FALSE = TokenType('kw', 'false')
+FINALLY = TokenType('kw', 'finally')
+FOR = TokenType('kw', 'for')
+FUNCTION = TokenType('kw', 'function')
+IF = TokenType('kw', 'if')
+IN = TokenType('kw', 'in')
+INSTANCEOF = TokenType('kw', 'instanceof')
+NEW = TokenType('kw', 'new')
+NULL = TokenType('kw', 'null')
+RETURN = TokenType('kw', 'return')
+SWITCH = TokenType('kw', 'switch')
+THIS = TokenType('kw', 'this')
+THROW = TokenType('kw', 'throw')
+TRUE = TokenType('kw', 'true')
+TYPEOF = TokenType('kw', 'typeof')
+TRY = TokenType('kw', 'try')
+VAR = TokenType('kw', 'var')
+VOID = TokenType('kw', 'void')
+WHILE = TokenType('kw', 'while')
+WITH = TokenType('kw', 'with')
+
+# Other tokens
+C_COMMENT = TokenType('other', '/*')
+CPP_COMMENT = TokenType('other', '//')
+HTML_COMMENT = TokenType('other', '<!--')
+ERROR = TokenType('other', 'err')
+EOF = TokenType('other', 'eof')
+EOL = TokenType('other', 'eol')
+NAME = TokenType('other', '(name)')
+NUMBER = TokenType('other', '(num)')
+OPERATOR = TokenType('other', '(op)')
+REGEXP = TokenType('other', '(re)')
+SPACE = TokenType('other', '(sp)')
+STRING = TokenType('other', '(str)')
+
+def getkeywords():
+ return dict((t.literal, t) for t in _ALL_TOKENS if t.category == 'kw')
+
+def get_punctuator_tree():
+ tree = {}
+ for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'):
+ leaf = tree
+ for c in punctuator.literal:
+ leaf = leaf.setdefault(c, {})
+ assert not None in leaf, punctuator.literal
+ leaf[None] = punctuator
+ return tree
+
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <mat...@us...> - 2013-10-09 22:41:50
|
Revision: 342
http://sourceforge.net/p/javascriptlint/code/342
Author: matthiasmiller
Date: 2013-10-09 22:41:46 +0000 (Wed, 09 Oct 2013)
Log Message:
-----------
Distinguish between characters and strings.
Modified Paths:
--------------
trunk/jsengine/tokenizer/__init__.py
trunk/jsengine/tokenizer/tok.py
Modified: trunk/jsengine/tokenizer/__init__.py
===================================================================
--- trunk/jsengine/tokenizer/__init__.py 2013-10-09 21:51:03 UTC (rev 341)
+++ trunk/jsengine/tokenizer/__init__.py 2013-10-09 22:41:46 UTC (rev 342)
@@ -14,6 +14,14 @@
_KEYWORDS = tok.getkeywords()
_PUNCTUATOR_TREE = tok.get_punctuator_tree()
+def _str_has_chr(s, c):
+ assert len(c) <= 1
+ return c in s
+
+def _chr_to_str(c):
+ assert len(c) <= 1
+ return c
+
class Token:
def __init__(self, tok, atom=None):
self.tok = tok
@@ -51,23 +59,35 @@
return self._offset >= len(self._content)
def readchr(self):
- if self._offset < len(self._content):
+ c = self.peekchr()
+ if not c:
+ raise JSSyntaxError(self.get_offset(), 'unexpected_eof')
+ self._offset += 1
+ return c
+
+ def readchrif(self, expect):
+ if self.peekchr() == expect:
self._offset += 1
- return self._content[self._offset - 1]
- raise JSSyntaxError(self.get_offset()-1, 'unexpected_eof')
+ return expect
+ return ''
- def readchrif(self, seq):
- s = self.peekchrif(seq)
+ def readchrin(self, seq):
+ s = self.peekchrin(seq)
if s:
- assert len(s) == 1
self._offset += 1
return s
- def peekchrif(self, seq):
- if self._offset < len(self._content) and \
- self._content[self._offset] in seq:
+ def peekchr(self):
+ if self._offset < len(self._content):
return self._content[self._offset]
+ return ''
+ def peekchrin(self, seq):
+ c = self.peekchr()
+ if c and _str_has_chr(seq, c):
+ return c
+ return ''
+
def readtextif(self, text):
""" Returns the string if found. Otherwise returns None.
"""
@@ -182,7 +202,7 @@
# TODO: Validate and save
while True:
- c = stream.readchrif(_IDENT)
+ c = stream.readchrin(_IDENT)
if not c:
break
@@ -194,15 +214,17 @@
if stream.eof():
return Token(tok.EOF)
+ stream.watch_reads()
+
c = stream.readchr()
# WHITESPACE
- if c in _WHITESPACE or c in _LINETERMINATOR:
- linebreak = c in _LINETERMINATOR
+ if _str_has_chr(_WHITESPACE, c) or _str_has_chr(_LINETERMINATOR, c):
+ linebreak = _str_has_chr(_LINETERMINATOR, c)
while True:
- if stream.readchrif(_LINETERMINATOR):
+ if stream.readchrin(_LINETERMINATOR):
linebreak = True
- elif stream.readchrif(_WHITESPACE):
+ elif stream.readchrin(_WHITESPACE):
pass
else:
break
@@ -213,24 +235,24 @@
# COMMENTS
if c == '/':
- if stream.peekchrif("/"):
- while not stream.eof() and not stream.peekchrif(_LINETERMINATOR):
+ if stream.peekchr() == '/':
+ while not stream.eof() and not stream.peekchrin(_LINETERMINATOR):
stream.readchr()
return Token(tok.CPP_COMMENT)
- if stream.peekchrif("*"):
+ if stream.peekchr() == '*':
linebreak = False
while True:
if stream.eof():
return Token(tok.ERROR, atom='unterminated_comment')
c = stream.readchr()
- if c in _LINETERMINATOR:
+ if _str_has_chr(_LINETERMINATOR, c):
linebreak = True
elif c == '*' and stream.readchrif('/'):
return Token(tok.C_COMMENT)
return Token(tok.EOF)
elif c == '<':
if stream.readtextif('!--'):
- while not stream.eof() and not stream.peekchrif(_LINETERMINATOR):
+ while not stream.eof() and not stream.peekchrin(_LINETERMINATOR):
stream.readchr()
return Token(tok.HTML_COMMENT)
@@ -245,66 +267,64 @@
c = stream.readchr()
elif c == quote:
return Token(tok.STRING, atom=s)
- s += c
+ s += _chr_to_str(c)
# NUMBERS
- if c in _DIGITS or (c == '.' and stream.peekchrif(_DIGITS)):
+ if _str_has_chr(_DIGITS, c) or (c == '.' and stream.peekchrin(_DIGITS)):
s = c # TODO
- stream.watch_reads()
- if c == '0' and stream.readchrif('xX'):
+ if c == '0' and stream.readchrin('xX'):
# Hex
- while stream.readchrif(_HEX_DIGITS):
+ while stream.readchrin(_HEX_DIGITS):
pass
- elif c == '0' and stream.readchrif(_DIGITS):
+ elif c == '0' and stream.readchrin(_DIGITS):
# Octal
- while stream.readchrif(_DIGITS):
+ while stream.readchrin(_DIGITS):
pass
else:
# Decimal
if c != '.':
- while stream.readchrif(_DIGITS):
+ while stream.readchrin(_DIGITS):
pass
stream.readchrif('.')
- while stream.readchrif(_DIGITS):
+ while stream.readchrin(_DIGITS):
pass
- if stream.readchrif('eE'):
- stream.readchrif('+-')
- if not stream.readchrif(_DIGITS):
+ if stream.readchrin('eE'):
+ stream.readchrin('+-')
+ if not stream.readchrin(_DIGITS):
raise JSSyntaxError(stream.get_offset(), 'syntax_error')
- while stream.readchrif(_DIGITS):
+ while stream.readchrin(_DIGITS):
pass
- if stream.peekchrif(_IDENT):
+ if stream.peekchrin(_IDENT):
return Token(tok.ERROR)
- atom = s + stream.get_watched_reads()
+ atom = stream.get_watched_reads()
return Token(tok.NUMBER, atom=atom)
if c in _PUNCTUATOR_TREE:
d = _PUNCTUATOR_TREE[c]
while True:
- c = stream.readchrif(list(d.keys()))
+ c = stream.readchrin(u''.join(d.keys()))
if c:
d = d[c]
else:
break
try:
- return Token(d[None])
+ return Token(d[''])
except KeyError:
print('oops')
raise JSSyntaxError(stream.get_offset(), 'syntax_error')
- if c in _IDENT:
- s = ''
- while c:
- s += c
- c = stream.readchrif(_IDENT + _DIGITS)
- if s in _KEYWORDS:
- return Token(_KEYWORDS[s], atom=s)
- elif s:
- return Token(tok.NAME, atom=s)
+ if _str_has_chr(_IDENT, c):
+ while stream.readchrin(_IDENT + _DIGITS):
+ pass
+ atom = stream.get_watched_reads()
+ if atom in _KEYWORDS:
+ return Token(_KEYWORDS[atom], atom=atom)
+ return Token(tok.NAME, atom=atom)
+
raise JSSyntaxError(stream.get_offset(), 'unexpected_char',
- { 'char': c })
+ { 'char': _chr_to_str(c) })
Modified: trunk/jsengine/tokenizer/tok.py
===================================================================
--- trunk/jsengine/tokenizer/tok.py 2013-10-09 21:51:03 UTC (rev 341)
+++ trunk/jsengine/tokenizer/tok.py 2013-10-09 22:41:46 UTC (rev 342)
@@ -123,6 +123,6 @@
for c in punctuator.literal:
leaf = leaf.setdefault(c, {})
assert not None in leaf, punctuator.literal
- leaf[None] = punctuator
+ leaf[''] = punctuator
return tree
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <mat...@us...> - 2013-10-10 14:07:44
|
Revision: 343
http://sourceforge.net/p/javascriptlint/code/343
Author: matthiasmiller
Date: 2013-10-10 14:07:41 +0000 (Thu, 10 Oct 2013)
Log Message:
-----------
Fully distinguish between chars and strings.
Modified Paths:
--------------
trunk/jsengine/tokenizer/__init__.py
trunk/jsengine/tokenizer/tok.py
Modified: trunk/jsengine/tokenizer/__init__.py
===================================================================
--- trunk/jsengine/tokenizer/__init__.py 2013-10-09 22:41:46 UTC (rev 342)
+++ trunk/jsengine/tokenizer/__init__.py 2013-10-10 14:07:41 UTC (rev 343)
@@ -14,14 +14,43 @@
_KEYWORDS = tok.getkeywords()
_PUNCTUATOR_TREE = tok.get_punctuator_tree()
-def _str_has_chr(s, c):
- assert len(c) <= 1
- return c in s
+class _Char(object):
+ def __init__(self, u):
+ assert isinstance(u, int) or u is None, u
+ self._u = u
-def _chr_to_str(c):
- assert len(c) <= 1
- return c
+ @classmethod
+ def fromstr(cls, s, i):
+ return _Char(ord(s[i]))
+ @classmethod
+ def ord(cls, s):
+ return _Char(ord(s))
+
+ @property
+ def uval(self):
+ return self._u
+
+ def tostr(self):
+ if self._u is None:
+ return unicode()
+ return unichr(self._u)
+
+ def instr(self, s):
+ if self._u is None:
+ return False
+ return s.find(unichr(self._u)) != -1
+
+ def __hash__(self):
+ return hash(self._u)
+
+ def __eq__(self, other):
+ assert isinstance(other, _Char), other
+ return self._u == other._u
+
+ def __nonzero__(self):
+ return not self._u is None
+
class Token:
def __init__(self, tok, atom=None):
self.tok = tok
@@ -69,7 +98,7 @@
if self.peekchr() == expect:
self._offset += 1
return expect
- return ''
+ return _Char(None)
def readchrin(self, seq):
s = self.peekchrin(seq)
@@ -79,14 +108,14 @@
def peekchr(self):
if self._offset < len(self._content):
- return self._content[self._offset]
- return ''
+ return _Char.fromstr(self._content, self._offset)
+ return _Char(None)
def peekchrin(self, seq):
c = self.peekchr()
- if c and _str_has_chr(seq, c):
+ if c and c.instr(seq):
return c
- return ''
+ return _Char(None)
def readtextif(self, text):
""" Returns the string if found. Otherwise returns None.
@@ -184,20 +213,20 @@
stream = self._stream
while True:
c = stream.readchr()
- if c == '\\':
+ if c == _Char.ord('\\'):
c = stream.readchr()
- if c == '\n':
+ if c == _Char.ord('\n'):
return Token(tok.ERROR)
- elif c == '[':
+ elif c == _Char.ord('['):
while True:
c = stream.readchr()
- if c == '\n':
+ if c == _Char.ord('\n'):
return Token(tok.ERROR)
- elif c == ']':
+ elif c == _Char.ord(']'):
break
- elif c == '\n':
+ elif c == _Char.ord('\n'):
return Token(tok.ERROR)
- elif c == '/':
+ elif c == _Char.ord('/'):
break
# TODO: Validate and save
@@ -219,8 +248,8 @@
c = stream.readchr()
# WHITESPACE
- if _str_has_chr(_WHITESPACE, c) or _str_has_chr(_LINETERMINATOR, c):
- linebreak = _str_has_chr(_LINETERMINATOR, c)
+ if c.instr(_WHITESPACE) or c.instr(_LINETERMINATOR):
+ linebreak = c.instr(_LINETERMINATOR)
while True:
if stream.readchrin(_LINETERMINATOR):
linebreak = True
@@ -234,49 +263,49 @@
return Token(tok.SPACE)
# COMMENTS
- if c == '/':
- if stream.peekchr() == '/':
+ if c == _Char.ord('/'):
+ if stream.peekchr() == _Char.ord('/'):
while not stream.eof() and not stream.peekchrin(_LINETERMINATOR):
stream.readchr()
return Token(tok.CPP_COMMENT)
- if stream.peekchr() == '*':
+ if stream.peekchr() == _Char.ord('*'):
linebreak = False
while True:
if stream.eof():
return Token(tok.ERROR, atom='unterminated_comment')
c = stream.readchr()
- if _str_has_chr(_LINETERMINATOR, c):
+ if c.instr(_LINETERMINATOR):
linebreak = True
- elif c == '*' and stream.readchrif('/'):
+ elif c == _Char.ord('*') and stream.readchrif(_Char.ord('/')):
return Token(tok.C_COMMENT)
return Token(tok.EOF)
- elif c == '<':
+ elif c == _Char.ord('<'):
if stream.readtextif('!--'):
while not stream.eof() and not stream.peekchrin(_LINETERMINATOR):
stream.readchr()
return Token(tok.HTML_COMMENT)
# STRING LITERALS
- if c == '"' or c == "'":
+ if c == _Char.ord('"') or c == _Char.ord("'"):
# TODO: Decode
s = ''
quote = c
while True:
c = stream.readchr()
- if c == '\\':
+ if c == _Char.ord('\\'):
c = stream.readchr()
elif c == quote:
return Token(tok.STRING, atom=s)
- s += _chr_to_str(c)
+ s += c.tostr()
# NUMBERS
- if _str_has_chr(_DIGITS, c) or (c == '.' and stream.peekchrin(_DIGITS)):
+ if c.instr(_DIGITS) or (c == _Char.ord('.') and stream.peekchrin(_DIGITS)):
s = c # TODO
- if c == '0' and stream.readchrin('xX'):
+ if c == _Char.ord('0') and stream.readchrin('xX'):
# Hex
while stream.readchrin(_HEX_DIGITS):
pass
- elif c == '0' and stream.readchrin(_DIGITS):
+ elif c == _Char.ord('0') and stream.readchrin(_DIGITS):
# Octal
while stream.readchrin(_DIGITS):
pass
@@ -285,7 +314,7 @@
if c != '.':
while stream.readchrin(_DIGITS):
pass
- stream.readchrif('.')
+ stream.readchrif(_Char.ord('.'))
while stream.readchrin(_DIGITS):
pass
@@ -303,21 +332,22 @@
atom = stream.get_watched_reads()
return Token(tok.NUMBER, atom=atom)
- if c in _PUNCTUATOR_TREE:
- d = _PUNCTUATOR_TREE[c]
+ if c.uval in _PUNCTUATOR_TREE:
+ d = _PUNCTUATOR_TREE[c.uval]
while True:
- c = stream.readchrin(u''.join(d.keys()))
- if c:
- d = d[c]
+ c = stream.peekchr()
+ if c and c.uval in d:
+ stream.readchr()
+ d = d[c.uval]
else:
break
try:
- return Token(d[''])
+ return Token(d[-1])
except KeyError:
print('oops')
raise JSSyntaxError(stream.get_offset(), 'syntax_error')
- if _str_has_chr(_IDENT, c):
+ if c.instr(_IDENT):
while stream.readchrin(_IDENT + _DIGITS):
pass
@@ -327,4 +357,4 @@
return Token(tok.NAME, atom=atom)
raise JSSyntaxError(stream.get_offset(), 'unexpected_char',
- { 'char': _chr_to_str(c) })
+ { 'char': c.tostr() })
Modified: trunk/jsengine/tokenizer/tok.py
===================================================================
--- trunk/jsengine/tokenizer/tok.py 2013-10-09 22:41:46 UTC (rev 342)
+++ trunk/jsengine/tokenizer/tok.py 2013-10-10 14:07:41 UTC (rev 343)
@@ -121,8 +121,8 @@
for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'):
leaf = tree
for c in punctuator.literal:
- leaf = leaf.setdefault(c, {})
+ leaf = leaf.setdefault(ord(c), {})
assert not None in leaf, punctuator.literal
- leaf[''] = punctuator
+ leaf[-1] = punctuator
return tree
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <mat...@us...> - 2013-10-10 15:28:35
|
Revision: 346
http://sourceforge.net/p/javascriptlint/code/346
Author: matthiasmiller
Date: 2013-10-10 15:28:32 +0000 (Thu, 10 Oct 2013)
Log Message:
-----------
Refactor punctuator/keyword handling.
Modified Paths:
--------------
trunk/jsengine/tokenizer/__init__.py
trunk/jsengine/tokenizer/tok.py
Modified: trunk/jsengine/tokenizer/__init__.py
===================================================================
--- trunk/jsengine/tokenizer/__init__.py 2013-10-10 15:25:47 UTC (rev 345)
+++ trunk/jsengine/tokenizer/__init__.py 2013-10-10 15:28:32 UTC (rev 346)
@@ -5,15 +5,11 @@
_WHITESPACE = u'\u0020\t\u000B\u000C\u00A0\uFFFF'
_LINETERMINATOR = u'\u000A\u000D\u2028\u2029'
_DIGITS = u'0123456789'
-_DOT_DIGITS = [u'.%s' % digit for digit in _DIGITS]
_HEX_DIGITS = _DIGITS + u'abcdefABCDEF'
_IDENT = u'abcdefghijklmnopqrstuvwxyz' + \
u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + \
u'$_'
-_KEYWORDS = tok.getkeywords()
-_PUNCTUATOR_TREE = tok.get_punctuator_tree()
-
class _Char(object):
def __init__(self, u):
assert isinstance(u, int) or u is None, u
@@ -179,7 +175,7 @@
def expect_identifiername(self):
encountered = self.advance()
- if encountered.tok in list(_KEYWORDS.values()):
+ if tok.keywords.has(encountered.tok) != -1:
encountered.tok = tok.NAME
if encountered.tok != tok.NAME:
raise JSSyntaxError(encountered.start_offset, 'syntax_error')
@@ -332,29 +328,28 @@
atom = stream.get_watched_reads()
return Token(tok.NUMBER, atom=atom)
- if c.uval in _PUNCTUATOR_TREE:
- d = _PUNCTUATOR_TREE[c.uval]
+ if tok.punctuators.hasprefix(c.tostr()):
+ s = c.tostr()
while True:
c = stream.peekchr()
- if c and c.uval in d:
+ if c and tok.punctuators.hasprefix(s + c.tostr()):
+ s += c.tostr()
stream.readchr()
- d = d[c.uval]
else:
break
- try:
- return Token(d[-1])
- except KeyError:
- print('oops')
+ d = tok.punctuators.get(s)
+ if not d:
raise JSSyntaxError(stream.get_offset(), 'syntax_error')
-
+ return Token(d)
if c.instr(_IDENT):
while stream.readchrin(_IDENT + _DIGITS):
pass
atom = stream.get_watched_reads()
- if atom in _KEYWORDS:
- return Token(_KEYWORDS[atom], atom=atom)
- return Token(tok.NAME, atom=atom)
+ tt = tok.keywords.get(atom, tok.NAME)
+ t = Token(tt)
+ t.atom = atom
+ return t
raise JSSyntaxError(stream.get_offset(), 'unexpected_char',
{ 'char': c.tostr() })
Modified: trunk/jsengine/tokenizer/tok.py
===================================================================
--- trunk/jsengine/tokenizer/tok.py 2013-10-10 15:25:47 UTC (rev 345)
+++ trunk/jsengine/tokenizer/tok.py 2013-10-10 15:28:32 UTC (rev 346)
@@ -10,12 +10,10 @@
def __repr__(self):
return 'TokenType(%r, %r)' % (self._category, self._literal)
- @property
- def category(self):
+ def getcategory(self):
return self._category
- @property
- def literal(self):
+ def getliteral(self):
return self._literal
# Symbols
@@ -113,16 +111,44 @@
SPACE = TokenType('other', '(sp)')
STRING = TokenType('other', '(str)')
-def getkeywords():
- return dict((t.literal, t) for t in _ALL_TOKENS if t.category == 'kw')
+# Freeze the list of all token types (symbols, keywords and others)
+_ALL_TOKENS = tuple(_ALL_TOKENS)
-def get_punctuator_tree():
- tree = {}
- for punctuator in (t for t in _ALL_TOKENS if t.category == 'sym'):
- leaf = tree
- for c in punctuator.literal:
- leaf = leaf.setdefault(ord(c), {})
- assert not None in leaf, punctuator.literal
- leaf[-1] = punctuator
- return tree
+class _Keywords(object):
+ def __init__(self):
+ self._d = {}
+ for tt in _ALL_TOKENS:
+ if tt.getcategory() == 'kw':
+ self._d[tt.getliteral()] = tt
+ def get(self, literal, default):
+ return self._d.get(literal, default)
+
+ def has(self, tok):
+ for iter in self._d.values():
+ if iter == tok:
+ return True
+ return False
+
+keywords = _Keywords()
+
+class _Punctuators(object):
+ def __init__(self):
+ self._prefixes = {}
+ self._punctuators = {}
+
+ for t in _ALL_TOKENS:
+ if t.getcategory() == 'sym':
+ literal = t.getliteral()
+ for i in range(len(literal)):
+ prefix = literal[:i+1]
+ self._prefixes[prefix] = True
+ self._punctuators[literal] = t
+
+ def hasprefix(self, prefix):
+ return self._prefixes.get(prefix, False)
+
+ def get(self, literal):
+ return self._punctuators.get(literal)
+
+punctuators = _Punctuators()
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|