diff options
Diffstat (limited to 'lib/uniname/gen-uninames.lisp')
-rwxr-xr-x | lib/uniname/gen-uninames.lisp | 139 |
1 file changed, 85 insertions, 54 deletions
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp index d08e93f0..9f795621 100755 --- a/lib/uniname/gen-uninames.lisp +++ b/lib/uniname/gen-uninames.lisp @@ -6,12 +6,18 @@ (defparameter add-comments nil) (defstruct unicode-char - (code nil :type integer) + (index nil :type integer) (name nil :type string) word-indices word-indices-index ) +(defstruct range + (index nil :type integer) + (start-code nil :type integer) + (end-code nil :type integer) +) + (defstruct word-list (hashed nil :type hash-table) (sorted nil :type list) @@ -19,10 +25,16 @@ length ; number of words ) -(defun main (inputfile outputfile) - (declare (type string inputfile outputfile)) +(defun main (inputfile outputfile aliasfile) + (declare (type string inputfile outputfile aliasfile)) #+UNICODE (setq *default-file-encoding* charset:utf-8) - (let ((all-chars '())) + (let ((all-chars '()) + (all-chars-hashed (make-hash-table :test #'equal)) + (all-aliases '()) + all-chars-and-aliases + (all-ranges '()) + (name-index 0) + range) ;; Read all characters and names from the input file. (with-open-file (istream inputfile :direction :input) (loop @@ -41,43 +53,53 @@ ; specially as well. (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A) (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D)) - ; Transform the code so that it fits in 16 bits. In - ; Unicode 5.1 the following ranges are used. 
- ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4 - ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5 - ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6 - ; 0x10000..0x10A58 >>12= 0x10 -> 0x7 - ; 0x12000..0x12473 >>12= 0x12 -> 0x8 - ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9 - ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA - ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB - ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC - (flet ((transform (x) - (dpb - (case (ash x -12) - ((#x00 #x01 #x02 #x03 #x04) (ash x -12)) - (#x0A 5) - (#x0F 6) - (#x10 7) - (#x12 8) - (#x1D 9) - (#x1F #xA) - (#x2F #xB) - (#xE0 #xC) - (t (error "Update the transform function for 0x~5,'0X" x)) - ) - (byte 8 12) - x - )) ) - (push (make-unicode-char :code (transform code) + ;; Also ignore variationselectors; they are treated + ;; specially as well. + (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF)) + (push (make-unicode-char :index name-index :name name-string) - all-chars - ) ) ) ) ) + all-chars) + (setf (gethash code all-chars-hashed) (car all-chars)) + ;; Update the contiguous range, or start a new range. + (if (and range (= (1+ (range-end-code range)) code)) + (setf (range-end-code range) code) + (progn + (when range + (push range all-ranges)) + (setq range (make-range :index name-index + :start-code code + :end-code code)))) + (incf name-index) + (setq last-code code) + ) ) ) ) ) ) ) ) (setq all-chars (nreverse all-chars)) + (if range + (push range all-ranges)) + (setq all-ranges (nreverse all-ranges)) + (when aliasfile + ;; Read all characters and names from the alias file. 
+ (with-open-file (istream aliasfile :direction :input) + (loop + (let ((line (read-line istream nil nil))) + (unless line (return)) + (let* ((i1 (position #\; line)) + (i2 (position #\; line :start (1+ i1))) + (code-string (subseq line 0 i1)) + (code (parse-integer code-string :radix 16)) + (name-string (subseq line (1+ i1) i2)) + (uc (gethash code all-chars-hashed))) + (when uc + (push (make-unicode-char :index (unicode-char-index uc) + :name name-string) + all-aliases) + ) ) ) ) ) ) + (setq all-aliases (nreverse all-aliases) + all-chars-and-aliases (append all-chars all-aliases)) ;; Split into words. (let ((words-by-length (make-array 0 :adjustable t))) - (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars))) + (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION" + (mapcar #'unicode-char-name all-chars-and-aliases))) (let ((i1 0)) (loop (when (>= i1 (length name)) (return)) @@ -195,12 +217,12 @@ (setf (gethash word (word-list-hashed word-list)) ind-offset) (incf ind-offset) ) ) ) ) - (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY")) + (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION")) (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word (gethash word (word-list-hashed (aref words-by-length (length word)))) ) ) ;; Compute the word-indices for every unicode-char. - (dolist (uc all-chars) + (dolist (uc all-chars-and-aliases) (let ((name (unicode-char-name uc)) (indices '())) (let ((i1 0)) @@ -220,8 +242,8 @@ ) ) ) ;; Sort the list of unicode-chars by word-indices. - (setq all-chars - (sort all-chars + (setq all-chars-and-aliases + (sort all-chars-and-aliases (lambda (vec1 vec2) (let ((len1 (length vec1)) (len2 (length vec2))) @@ -240,10 +262,10 @@ ) ) ;; Output the word-indices. 
(format ostream "static const uint16_t unicode_names[~D] = {~%" - (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) + (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases)) ) (let ((i 0)) - (dolist (uc all-chars) + (dolist (uc all-chars-and-aliases) (format ostream " ~{ ~D,~}" (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0))) (coerce (unicode-char-word-indices uc) 'list) @@ -257,14 +279,14 @@ (incf i (length (unicode-char-word-indices uc))) ) ) (format ostream "};~%") - (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%") + (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") - (format ostream "unicode_name_to_code[~D] = {~%" - (length all-chars) + (format ostream "unicode_name_to_index[~D] = {~%" + (length all-chars-and-aliases) ) - (dolist (uc all-chars) + (dolist (uc all-chars-and-aliases) (format ostream " { 0x~4,'0X, ~D }," - (unicode-char-code uc) + (unicode-char-index uc) (unicode-char-word-indices-index uc) ) (when add-comments @@ -273,14 +295,14 @@ (format ostream "~%") ) (format ostream "};~%") - (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%") + (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") - (format ostream "unicode_code_to_name[~D] = {~%" + (format ostream "unicode_index_to_name[~D] = {~%" (length all-chars) ) - (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code)) + (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index)) (format ostream " { 0x~4,'0X, ~D }," - (unicode-char-code uc) + (unicode-char-index uc) (unicode-char-word-indices-index uc) ) (when add-comments @@ -290,12 +312,21 @@ ) 
(format ostream "};~%") (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%" - (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars)) + (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases)) ) (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" - (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) + (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases)) ) + (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%" + (length all-ranges)) + (dolist (range all-ranges) + (format ostream " { ~D, ~D, ~D },~%" + (range-index range) + (- (range-start-code range) (range-index range)) + (1+ (- (range-end-code range) (range-start-code range)))) + ) + (format ostream "};~%") ) ) ) ) -(main (first *args*) (second *args*)) +(main (first *args*) (second *args*) (third *args*)) |