From: stassats <sta...@us...> - 2014-11-14 16:53:42
|
The branch "master" has been updated in SBCL: via ec9e01d7912b595e64327c417cf39cd4365818fe (commit) from 1356a850748fdd55080e767c893cfdcb152ae32a (commit) - Log ----------------------------------------------------------------- commit ec9e01d7912b595e64327c417cf39cd4365818fe Author: Stas Boukarev <sta...@gm...> Date: Fri Nov 14 04:52:06 2014 +0300 Speed up string decomposition. Reduce consing. Recursively expand canonical decompositions at build time. --- src/code/target-unicode.lisp | 120 ++++++++++++++++++++---------------------- tools-for-build/ucd.lisp | 69 +++++++++++++++++++----- 2 files changed, 112 insertions(+), 77 deletions(-) diff --git a/src/code/target-unicode.lisp b/src/code/target-unicode.lisp index 2455994..f201263 100644 --- a/src/code/target-unicode.lisp +++ b/src/code/target-unicode.lisp @@ -514,7 +514,8 @@ disappears when accents are placed on top of it. and NIL otherwise" (+ 4 (misc-index char))))) (values (clear-flag 7 value) (logbitp 7 value)))) -(defun char-decomposition (char length) +(defun char-decomposition (char length callback) + (declare (function callback)) ;; Caller should have gotten length from char-decomposition-info (let* ((cp (char-code char)) (cp-high (ash cp -8)) @@ -522,70 +523,62 @@ disappears when accents are placed on top of it. and NIL otherwise" (high-page (aref **character-high-pages** cp-high)) (index (unless (logbitp 15 high-page) ;; Hangul syllable (aref **character-low-pages** - (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8))))))) - (entry (when index (loop for i from 0 below length - collecting (aref decompositions (+ i index))))) - (result (make-string length))) - (if (= length 1) - (string (code-char (car entry))) - (progn - (if (<= #xac00 cp #xd7a3) - ;; see Unicode 6.2, section 3-12 - (let* ((sbase #xac00) - (lbase #x1100) - (vbase #x1161) - (tbase #x11a7) - (lcount 19) - (vcount 21) - (tcount 28) - (ncount (* vcount tcount)) - (scount (* lcount ncount)) - (sindex (- cp sbase)) - (lindex (floor sindex ncount)) - (vindex (floor (mod sindex ncount) tcount)) - (tindex (mod sindex tcount))) - (declare (ignore scount)) - (setf (char result 0) (code-char (+ lbase lindex))) - (setf (char result 1) (code-char (+ vbase vindex))) - (if (> tindex 0) - (setf (char result 2) (code-char (+ tbase tindex))) - (setf result (subseq result 0 2)))) ; Remove trailing #\Nul - (loop for i from 0 for code in entry - do (setf (char result i) (code-char code)))) - result)))) + (+ 1 (* 2 (+ (ldb (byte 8 0) cp) (ash high-page 8)))))))) + (cond ((= length 1) + (funcall callback (code-char (aref decompositions index)))) + ((<= #xac00 cp #xd7a3) + ;; see Unicode 6.2, section 3-12 + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (sindex (- cp sbase)) + (lindex (floor sindex ncount)) + (vindex (floor (mod sindex ncount) tcount)) + (tindex (mod sindex tcount))) + (funcall callback (code-char (+ lbase lindex))) + (funcall callback (code-char (+ vbase vindex))) + (when (> tindex 0) + (funcall callback (code-char (+ tbase tindex)))))) -(defun decompose-char (char) - (let ((info (char-decomposition-info char))) - (if (= info 0) - (string char) - (char-decomposition char info)))) + (t + (loop for i below length + do + (funcall callback (code-char (aref decompositions (+ index i))))))))) + +(defun decompose-char (char compatibility callback) + (declare (function callback)) + (multiple-value-bind (info compat) (char-decomposition-info char) + (if (and (plusp info) + (or compatibility + (not compat))) + (if compatibility + (dx-flet ((callback (char) + (decompose-char char t callback))) + (char-decomposition char info #'callback)) + (char-decomposition char info callback)) + (funcall callback char)))) (defun decompose-string (string &optional (kind :canonical)) - (declare (type (member :canonical :compatibility) kind)) - (flet ((canonical (char) - (multiple-value-bind (len compat) (char-decomposition-info char) - (and (/= len 0) (not compat)))) - (compat (char) - (/= 0 (char-decomposition-info char)))) - (let (result - (fun (ecase kind - (:canonical #'canonical) - (:compatibility #'compat)))) - (do* ((start 0 (1+ end)) - (end (position-if fun string :start start) - (position-if fun string :start start))) - ((null end) (push (subseq string start end) result)) - (unless (= start end) - (push (subseq string start end) result)) - ;; FIXME: this recursive call to DECOMPOSE-STRING is necessary - ;; for correctness given our direct encoding of the - ;; decomposition data in UnicodeData.txt. It would, however, - ;; be straightforward enough to perform the recursion in table - ;; construction, and then have this simply revert to a single - ;; lookup. (Wait for tests to be hooked in, then implement). - (push (decompose-string (decompose-char (char string end)) kind) - result)) - (apply 'concatenate 'string (nreverse result))))) + (let ((compatibility (ecase kind + (:compatibility t) + (:canonical nil)))) + (let ((chars) + (length 0)) + (dx-flet ((callback (char) + (push char chars) + (incf length))) + (loop for char across string + do + (decompose-char char compatibility #'callback)) + (let ((result (make-string length))) + (loop for char in (nreverse chars) + for i from 0 + do (setf (schar result i) char)) + result))))) (defun sort-combiners (string) (let (result (start 0) first-cc first-non-cc) @@ -734,7 +727,8 @@ disappears when accents are placed on top of it. and NIL otherwise" (if (or (and (> prev-cc cc) (/= cc 0)) (proplist-p c no-prop) - (proplist-p c maybe-prop)) + (and maybe-prop + (proplist-p c maybe-prop))) (return-from quick-normalization-check nil) (setf prev-cc cc)))) string)) diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index f97a76c..c14f259 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -295,9 +295,51 @@ Length should be adjusted when the standard changes.") 0 ,(gethash code-point *line-break-class-table* 0) ,(gethash code-point *age-table* 0))) (unallocated-index (apply #'hash-misc unallocated-misc)) - (unallocated-ucd (make-ucd :misc unallocated-index :decomp 0))) + (unallocated-ucd (make-ucd :misc unallocated-index))) (setf (gethash code-point *ucd-entries*) unallocated-ucd))))) +(defun expand-decomposition (decomposition) + (loop for cp in decomposition + for ucd = (gethash cp *ucd-entries*) + for length = (elt (aref *misc-table* (ucd-misc ucd)) 4) + if (and (not (logbitp 7 length)) + (plusp length)) + append (expand-decomposition (ucd-decomp ucd)) + else + collect cp)) + +;;; Recursively expand canonical decompositions +(defun fixup-decompositions () + (loop for did-something = nil + do + (loop for code being the hash-key of *ucd-entries* + using (hash-value ucd) + when (and (ucd-decomp ucd) + (not (logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4)))) + do + (let ((expanded (expand-decomposition (ucd-decomp ucd)))) + (unless (equal expanded (ucd-decomp ucd)) + (setf (ucd-decomp ucd) expanded + did-something t)))) + while did-something) + (loop for i below (hash-table-count *ucd-entries*) + for ucd = (gethash i *ucd-entries*) + for decomp = (ucd-decomp ucd) + do + (setf (ucd-decomp ucd) + (cond ((not (consp decomp)) 0) + ((logbitp 7 (elt (aref *misc-table* (ucd-misc ucd)) 4)) + (prog1 (length *decompositions*) + (loop for cp in decomp + do (vector-push-extend cp *decompositions*)))) + (t + (let ((misc-entry (copy-list (aref *misc-table* (ucd-misc ucd))))) + (setf (elt misc-entry 4) (length decomp) + (ucd-misc ucd) (apply #'hash-misc misc-entry)) + (prog1 (length *decompositions*) + (loop for cp in decomp + do (vector-push-extend cp *decompositions*))))))))) + (defun fixup-compositions () (flet ((fixup (k v) (declare (ignore v)) @@ -403,27 +445,27 @@ Length should be adjusted when the standard changes.") (and (= gc-index 1) upper-index))) (bidi-mirrored-p (string= bidi-mirrored "Y")) (decomposition-info 0) - (decomposition-index 0) (eaw-index (gethash code-point *east-asian-width-table*)) (script-index (gethash code-point *script-table* 0)) (line-break-index (gethash code-point *line-break-class-table* 0)) - (age-index (gethash code-point *age-table* 0))) + (age-index (gethash code-point *age-table* 0)) + decomposition) (when (and (not cl-both-case-p) (< gc-index 2)) (format t "~A~%" name)) (when (string/= "" decomposition-type-and-mapping) - (let* ((compatibility-p (position #\> decomposition-type-and-mapping)) - (decomposition + (let* ((compatibility-p (position #\> decomposition-type-and-mapping))) + (setf decomposition (parse-codepoints (subseq decomposition-type-and-mapping - (if compatibility-p (1+ compatibility-p) 0))))) + (if compatibility-p (1+ compatibility-p) 0)))) (when (assoc code-point *decomposition-corrections*) (setf decomposition (list (cdr (assoc code-point *decomposition-corrections*))))) (setf decomposition-info (logior (length decomposition) (if compatibility-p 128 0))) - (unless (logbitp 7 decomposition-info) + (unless compatibility-p ;; Primary composition excludes: ;; * singleton decompositions; ;; * decompositions of non-starters; @@ -441,12 +483,7 @@ Length should be adjusted when the standard changes.") (setf (gethash (cons (first decomposition) (second decomposition)) *compositions*) - code-point))) - (setf decomposition-index - (prog1 - (fill-pointer *decompositions*) - (loop for i in decomposition do - (vector-push-extend i *decompositions*)))))) + code-point))))) ;; Hangul decomposition; see Unicode 6.2 section 3-12 (when (= code-point #xd7a3) ;; KLUDGE: The decomposition-length for Hangul syllables in the @@ -480,7 +517,7 @@ Length should be adjusted when the standard changes.") decomposition-info flags script-index line-break-index age-index)) (result (make-ucd :misc misc-index - :decomp decomposition-index))) + :decomp decomposition))) (when (and (> (length name) 7) (string= ", Last>" name :start2 (- (length name) 7))) ;; We can still do this despite East Asian Width being in the @@ -571,6 +608,7 @@ Length should be adjusted when the standard changes.") (complete-misc-table) (fixup-casefolding) (fixup-ages) + (fixup-decompositions) nil) @@ -732,11 +770,13 @@ Used to look up block data.") ;;; Output code (defun write-codepoint (code-point stream) + (declare (type (unsigned-byte 32) code-point)) (write-byte (ldb (byte 8 16) code-point) stream) (write-byte (ldb (byte 8 8) code-point) stream) (write-byte (ldb (byte 8 0) code-point) stream)) (defun write-4-byte (value stream) + (declare (type (unsigned-byte 32) value)) (write-byte (ldb (byte 8 24) value) stream) (write-byte (ldb (byte 8 16) value) stream) (write-byte (ldb (byte 8 8) value) stream) @@ -799,6 +839,7 @@ Used to look up block data.") (gethash (logior low-page (ash high-page 8)) *ucd-entries*) uniq-ucd-entries :test #'equalp)) (flet ((write-2-byte (int stream) + (declare (type (unsigned-byte 16) int)) (write-byte (ldb (byte 8 8) int) stream) (write-byte (ldb (byte 8 0) int) stream))) (case (length uniq-ucd-entries) ----------------------------------------------------------------------- hooks/post-receive -- SBCL |