[ebafa5]: contrib / unicode / names-pairs.lisp Maximize Restore History

Download this file

names-pairs.lisp    139 lines (121 with data), 3.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
(load "./load-names.lisp")
(declaim (optimize (debug 0) (speed 3)))
(setf *print-circle* t)
(defun compute-pairs (data table)
(clrhash table)
(loop with max = 0
with max-pair = nil
for (code name . l) in data
do (loop for l2 on l
for a = (car l2)
for b = (cadr l2)
while b
do (let* ((pair (cons a b))
(c (gethash pair table)))
(setf (gethash pair table)
(setf c (if c (1+ c) 1))
a b)
(when (> c max)
(setf max c max-pair pair))))
finally (return (cons max max-pair))))
(defun replace-pair (pair code data)
(let ((old-a (car pair))
(old-b (cdr pair)))
(loop with more = 0
for (ucd-code name . l) in data
do (loop with l2 = l
for a = (first l2)
for b = (second l2)
while b
do (when (and (eql a old-a) (eql b old-b))
;; replace (a b . c) with (pair . c)
(setf (car l2) code
(cdr l2) (cddr l2)))
do (setf l2 (cdr l2)))
do (setf more (+ more (1- (length l))))
finally (return more))))
(defun compress (data)
(loop with last-length = 0
with table = (make-hash-table :size 2048 :test #'equal)
with pairs = '()
for new-symbol from (1+ *last-word-index*)
for (frequency . pair) = (compute-pairs data table)
while (and pair (> frequency 1))
do
(format t "~%;;; ~A, ~D -> ~D, ~D left" pair frequency new-symbol
(replace-pair pair new-symbol data))
(setf pairs (acons new-symbol pair pairs))
finally
;; There are no redundant pairs. We just define ad-hoc new
;; symbols for all remaining strings.
(loop with n = new-symbol
for (code name . l) in data
do (loop with l2 = l
for a = (first l2)
for b = (second l2)
while b
do (setf pairs (acons n (cons a b) pairs)
(car l2) n
(cdr l2) (cddr l2)
n (1+ n))))
(print 'finished)
(return-from compress (nreverse pairs))))
(progn
(defparameter *compressed-data* (copy-tree *data*))
(defparameter *paired-data* (compress *compressed-data*)))
(defparameter *last-code* (first (first (last *paired-data*))))
(defparameter *code-ndx-size* (ceiling (integer-length *last-code*) 8))
(defparameter *pair-table-size* (* (length *paired-data*)
(* 2 *code-ndx-size*)))
(defparameter *code-to-name-bytes*
(* (length *compressed-data*)
(+ 3 ; Size of Unicode code
;; Size of index into the data table
*code-ndx-size*)))
(defparameter *sorted-names-bytes*
;; The sorted list of character names is just a list of indices into
;; the *code-to-name-bytes* table
(* (length *compressed-data*) *code-ndx-size*))
(defparameter *word-dictionary*
(+ *words-array-bytes*))
(format t "
;;; Codes dictionary = ~D bytes
;;; Pair table size = ~D bytes
;;; Code to names table = ~D bytes
;;; Names to codes table = ~D bytes
;;; Total = ~D bytes
"
*word-dictionary*
*pair-table-size*
*code-to-name-bytes*
*sorted-names-bytes*
(+
*word-dictionary*
*pair-table-size*
*code-to-name-bytes*
*sorted-names-bytes*
))
;;; WITH HANGUL
;;; Codes dictionary = 78566 bytes
;;; Pair table size = 198752 bytes
;;; Code to names table = 164570 bytes
;;; Names to codes table = 65828 bytes
;;; Total = 507716 bytes
;;; WITHOUT HANGUL
;;; Codes dictionary = 78555 bytes
;;; Pair table size = 150868 bytes
;;; Code to names table = 108710 bytes
;;; Names to codes table = 43484 bytes
;;; Total = 381617 bytes
;;; Without HANGUL (split by space and -)
;;; Codes dictionary = 58258 bytes
;;; Pair table size = 160576 bytes
;;; Code to names table = 108710 bytes
;;; Names to codes table = 43484 bytes
;;; Total = 371028 bytes
;;; With HANGUL (split by space and -)
;;; Codes dictionary = 58269 bytes
;;; Pair table size = 208460 bytes
;;; Code to names table = 164570 bytes
;;; Names to codes table = 65828 bytes
;;; Total = 497127 bytes