Diff of /contrib/unicode/load-names.lisp [000000] .. [ebafa5] Maximize Restore

  Switch to unified view

a b/contrib/unicode/load-names.lisp
1
(defun split-words (text &key (set '(#\Space)) (exclude t))
2
  (loop with start = 0
3
     with output = '()
4
     with elt-type = (array-element-type text)
5
     for i from 0 below (length text)
6
     for c across text
7
     when (member c set)
8
     do (setf output (list* (make-array (+ (- i start) (if exclude 0 1))
9
                     :element-type elt-type
10
                     :displaced-to text
11
                     :displaced-index-offset start)
12
              output)
13
        start (1+ i))
14
     finally (return (nreverse (list* (make-array (- i start)
15
                       :element-type elt-type
16
                       :displaced-to text
17
                       :displaced-index-offset start)
18
                    output)))))
19
20
(defun encode-words (words hash)
21
  (loop for word in words
22
     collect (or (gethash word hash)
23
       (let* ((word (copy-seq word))
24
          (ndx (hash-table-count hash)))
25
         (setf (gethash word hash) (1+ ndx))))))
26
27
(defun fixup-hangul-syllables (dictionary)
28
  ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
29
  (let* ((sbase #xac00)
30
         (lbase #x1100)
31
         (vbase #x1161)
32
         (tbase #x11a7)
33
         (scount 11172)
34
         (lcount 19)
35
         (vcount 21)
36
         (tcount 28)
37
         (ncount (* vcount tcount))
38
         (table (make-hash-table)))
39
    (with-open-file (*standard-input*
40
                     (make-pathname :name "Jamo" :type "txt"))
41
      (loop for line = (read-line nil nil)
42
            while line
43
            if (position #\; line)
44
            do (add-jamo-information line table)))
45
    (loop for sindex from 0 below scount
46
       for l = (+ lbase (floor sindex ncount))
47
       for v = (+ vbase (floor (mod sindex ncount) tcount))
48
       for tee = (+ tbase (mod sindex tcount))
49
       for name = (list* "HANGUL_" "SYLLABLE_"
50
           (gethash l table) (gethash v table)
51
           (unless (= tee tbase) (list (gethash tee table))))
52
       for code = (+ sbase sindex)
53
       collect (list* code (apply #'concatenate 'string name)
54
            (encode-words name dictionary)))))
55
56
(defun add-jamo-information (line table)
57
  (let* ((split (split-words line :set '(#\;) :exclude t))
58
         (code (parse-integer (first split) :radix 16))
59
         (syllable (string-trim '(#\Space)
60
                                (subseq (second split) 0 (position #\# (second split))))))
61
    (setf (gethash code table) syllable)))
62
63
(defvar *words*)
64
65
(defparameter *data*
66
  (with-open-file (in "~/src/sbcl/tools-for-build/UnicodeData.txt" :direction :input)
67
    (loop with words = (setf *words* (make-hash-table :size 1024 :test #'equal))
68
       for ucd-line = (read-line in nil nil nil)
69
       while ucd-line
70
       nconc (let* ((ucd-data (split-words ucd-line :set '(#\;)))
71
          (code (first ucd-data))
72
          (name (second ucd-data)))
73
         (unless (eql (char name 0) #\<)
74
       (setf name (substitute #\_ #\Space name))
75
       (list (list* (parse-integer code :radix 16)
76
                name
77
                (encode-words (split-words
78
                       name
79
                       :set '(#\Space #\_ #\-)
80
                       :exclude nil)
81
                      words))))))))
82
83
(print (length *data*))
84
(print (first (last *data*)))
85
86
;#+(or)
87
(progn
88
  (setf *data*
89
  (sort (nconc (fixup-hangul-syllables *words*) *data*)
90
        #'<
91
        :key #'car))
92
  (print (length *data*))
93
  (print (first (last *data*))))
94
95
(defparameter *words-array*
96
  (loop with array = (make-array (1+ (hash-table-count *words*)))
97
     for k being the hash-key in *words* using (hash-value v)
98
     do (setf (aref array v) k)
99
     finally (return array)))
100
101
(defparameter *last-word-index* (1- (length *words-array*)))
102
103
(defparameter *words-array-bytes*
104
  (loop for c across *words-array*
105
     sum (1+ (length c))))
106
107
(defun code-to-string (code)
108
  (aref *words-array* code))
109
110
(defparameter *flattened-data*
111
  (loop for (code name . rest) in *data*
112
     nconc (append rest (list 0))))
113
114
(defparameter *group-names*
115
  (loop with output = '()
116
     with start = (first (first *data*))
117
     with last = start
118
     for (code name . rest) in *data*
119
     do (when (>= (- code last) 2)
120
    (setf output (cons (list start last) output)
121
      start code))
122
       (setf last code)
123
     finally (return (nreverse (cons (list start code) output)))))