[cedet-semantic] [PATCH] Fix Python parsing for triple-quoted strings
Brought to you by:
zappo
From: Dale S. <da...@co...> - 2011-09-06 00:51:21
|
Hi, I've got the below patch against HEAD for wisent-python.el to improve parsing Python's triple-quoted strings. I've been using this for quite a while and it seems to improve problems I was having with Python parsing (as witnessed by Semantic + which-func-mode). Can/should this patch be included in Semantic? Do I need to sign papers with the FSF to contribute this patch nowadays? (If yes to both questions, pointer on which papers to sign with the FSF would be appreciated.) Thanks, Dale commit 03785415590ff97f11d33355e5f412053db4781f Author: Dale Sedivec <dal...@co...> Date: Fri Mar 18 16:12:00 2011 -0500 wisent-python-forward-line and -string work despite hairy strings. These functions broke on some cases of triple-quoted strings; see str_test_1 and str_test_2. This parser now depends less (if at all) on python.el applying syntax properties (which Semantic doesn't use) via font-lock-syntactic-keywords (which Semantic may not enable). diff --git a/semantic/semantic-utest.el b/semantic/semantic-utest.el index 52d8510..9e3c835 100644 --- a/semantic/semantic-utest.el +++ b/semantic/semantic-utest.el @@ -257,6 +257,26 @@ if x: x = 2 y = 3 r, s, t = 1, 2, '3' + +# Test string corner cases. Note that triple-quoted strings used +# to depend on font-lock to apply syntax properties to them. +# Code in the Python lexer that depended on scan-sexps and the +# like has been replaced with more manual methods to work around +# this problem. +def str_test_1(): + '''This might trip up wisent-python-forward-string: \\''' ''' + +def str_test_2(): + ('''Internal apostrophe in PAREN_BLOCK doesn't end this + string literal If you're using forward-sexp to skip this + parenthetical expression you'll fail here. 
Note there's an + odd number of apostrophes.''') + +def str_test_3(): + \"don't\" \"trip\" \"on\" \"adjacent\" \"strings\" + +def str_test_4(): + pass " @@ -408,6 +428,12 @@ r, s, t = 1, 2, '3' ("x" variable nil nil nil) ("y" variable nil nil nil) ("r, s, t" code nil nil nil) ;; TODO should be multiple variable tags + + ;; String tests + ("str_test_1" function nil nil nil) + ("str_test_2" function nil nil nil) + ("str_test_3" function nil nil nil) + ("str_test_4" function nil nil nil) ) "List of expected tag names for Python.") diff --git a/semantic/wisent/wisent-python.el b/semantic/wisent/wisent-python.el index 7d0853e..9718976 100644 --- a/semantic/wisent/wisent-python.el +++ b/semantic/wisent/wisent-python.el @@ -88,11 +88,33 @@ path." ;; to be suppressed. For example, r"01\n34" is a string with six ;; characters 0, 1, \, n, 3 and 4. The 'u' prefix means the following ;; string is a unicode. -(defconst wisent-python-string-re - (concat (regexp-opt '("r" "u" "ur" "R" "U" "UR" "Ur" "uR") t) - "?['\"]") +(defconst wisent-python-string-start-re "[uU]?[rR]?['\"]" "Regexp matching beginning of a Python string.") +(defconst wisent-python-string-re + (rx + (opt (any "uU")) (opt (any "rR")) + (or + ;; Triple-quoted string using apostrophes + (: "'''" (zero-or-more (or "\\'" + (not (any "'")) + (: (repeat 1 2 "'") (not (any "'"))))) + "'''") + ;; String using apostrophes + (: "'" (zero-or-more (or "\\'" + (not (any "'")))) + "'") + ;; Triple-quoted string using quotation marks. + (: "\"\"\"" (zero-or-more (or "\\\"" + (not (any "\"")) + (: (repeat 1 2 "\"") (not (any "\""))))) + "\"\"\"") + ;; String using quotation marks. + (: "\"" (zero-or-more (or "\\\"" + (not (any "\"")))) + "\""))) + "Regexp matching a complete Python string.") + (defvar wisent-python-EXPANDING-block nil "Non-nil when expanding a paren block for Python lexical analyzer.") @@ -104,16 +126,46 @@ curly braces." 
(defsubst wisent-python-forward-string () "Move point at the end of the Python string at point." - (when (looking-at wisent-python-string-re) - ;; skip the prefix - (and (match-end 1) (goto-char (match-end 1))) - ;; skip the quoted part - (cond - ((looking-at "\"\"\"[^\"]") - (search-forward "\"\"\"" nil nil 2)) - ((looking-at "'''[^']") - (search-forward "'''" nil nil 2)) - ((forward-sexp 1))))) + (if (looking-at wisent-python-string-re) + (let ((start (match-beginning 0)) + (end (match-end 0))) + ;; Incomplete triple-quoted string gets matched instead as a + ;; complete single quoted string. (This special case would be + ;; unnecessary if Emacs regular expressions had negative + ;; look-ahead assertions.) + (when (and (= (- end start) 2) + (looking-at "\"\\{3\\}\\|'\\{3\\}")) + (error "unterminated syntax")) + (goto-char end)) + (error "unterminated syntax"))) + +(defun wisent-python-forward-balanced-expression () + "Move point to the end of the balanced expression at point. +Here 'balanced expression' means anything matched by Emacs' +open/close parenthesis syntax classes. We can't use forward-sexp +for this because that Emacs built-in can't parse Python's +triple-quoted string syntax." + (let ((end-char (cdr (syntax-after (point))))) + (forward-char 1) + (while (not (or (eobp) (eq (char-after (point)) end-char))) + (cond + ;; Skip over python strings. + ((looking-at wisent-python-string-start-re) + (wisent-python-forward-string)) + ;; At a comment start just goto end of line. + ((looking-at "\\s<") + (end-of-line)) + ;; Skip over balanced expressions. + ((looking-at "\\s(") + (wisent-python-forward-balanced-expression)) + ;; Skip over white space, word, symbol, punctuation, paired + ;; delimiter (backquote) characters, line continuation, and end + ;; of comment characters (AKA newline characters in Python). + ((zerop (skip-syntax-forward "-w_.$\\>")) + (error "can't figure out how to go forward from here")))) + ;; Skip closing character. 
As a last resort this should raise an + ;; error if we hit EOB before we find our closing character. + (forward-char 1))) (defun wisent-python-forward-line () "Move point to the beginning of the next logical line. @@ -127,14 +179,14 @@ line ends at the end of the buffer, leave the point there." (progn (cond ;; Skip over python strings. - ((looking-at wisent-python-string-re) + ((looking-at wisent-python-string-start-re) (wisent-python-forward-string)) ;; At a comment start just goto end of line. ((looking-at "\\s<") (end-of-line)) - ;; Skip over generic lists and strings. - ((looking-at "\\(\\s(\\|\\s\"\\)") - (forward-sexp 1)) + ;; Skip over balanced expressions. + ((looking-at "\\s(") + (wisent-python-forward-balanced-expression)) ;; At the explicit line continuation character ;; (backslash) move to next line. ((looking-at "\\s\\") @@ -256,7 +308,7 @@ continuation of current line." (define-lex-regex-analyzer wisent-python-lex-string "Detect and create python string tokens." - wisent-python-string-re + wisent-python-string-start-re (semantic-lex-push-token (semantic-lex-token 'STRING_LITERAL |