summaryrefslogtreecommitdiff
path: root/lib/uniname/gen-uninames.lisp
diff options
context:
space:
mode:
authorManuel A. Fernandez Montecelo <manuel.montezelo@gmail.com>2016-05-26 16:48:15 +0100
committerManuel A. Fernandez Montecelo <manuel.montezelo@gmail.com>2016-05-26 16:48:15 +0100
commit5f2b09982312c98863eb9a8dfe2c608b81f58259 (patch)
treee5d38581c2f36e1cca02efedd2d85044d77f76f9 /lib/uniname/gen-uninames.lisp
parent3e0814cd9862b89c7a39672672937477bd87ddfb (diff)
Imported Upstream version 0.9.6upstream/0.9.6
Diffstat (limited to 'lib/uniname/gen-uninames.lisp')
-rwxr-xr-xlib/uniname/gen-uninames.lisp139
1 files changed, 85 insertions, 54 deletions
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index d08e93f0..9f795621 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -6,12 +6,18 @@
(defparameter add-comments nil)
(defstruct unicode-char
- (code nil :type integer)
+ (index nil :type integer)
(name nil :type string)
word-indices
word-indices-index
)
+(defstruct range
+ (index nil :type integer)
+ (start-code nil :type integer)
+ (end-code nil :type integer)
+)
+
(defstruct word-list
(hashed nil :type hash-table)
(sorted nil :type list)
@@ -19,10 +25,16 @@
length ; number of words
)
-(defun main (inputfile outputfile)
- (declare (type string inputfile outputfile))
+(defun main (inputfile outputfile aliasfile)
+ (declare (type string inputfile outputfile aliasfile))
#+UNICODE (setq *default-file-encoding* charset:utf-8)
- (let ((all-chars '()))
+ (let ((all-chars '())
+ (all-chars-hashed (make-hash-table :test #'equal))
+ (all-aliases '())
+ all-chars-and-aliases
+ (all-ranges '())
+ (name-index 0)
+ range)
;; Read all characters and names from the input file.
(with-open-file (istream inputfile :direction :input)
(loop
@@ -41,43 +53,53 @@
; specially as well.
(unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
(<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
- ; Transform the code so that it fits in 16 bits. In
- ; Unicode 5.1 the following ranges are used.
- ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4
- ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5
- ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6
- ; 0x10000..0x10A58 >>12= 0x10 -> 0x7
- ; 0x12000..0x12473 >>12= 0x12 -> 0x8
- ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9
- ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA
- ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB
- ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC
- (flet ((transform (x)
- (dpb
- (case (ash x -12)
- ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
- (#x0A 5)
- (#x0F 6)
- (#x10 7)
- (#x12 8)
- (#x1D 9)
- (#x1F #xA)
- (#x2F #xB)
- (#xE0 #xC)
- (t (error "Update the transform function for 0x~5,'0X" x))
- )
- (byte 8 12)
- x
- )) )
- (push (make-unicode-char :code (transform code)
+ ;; Also ignore variationselectors; they are treated
+ ;; specially as well.
+ (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
+ (push (make-unicode-char :index name-index
:name name-string)
- all-chars
- ) ) ) ) )
+ all-chars)
+ (setf (gethash code all-chars-hashed) (car all-chars))
+ ;; Update the contiguous range, or start a new range.
+ (if (and range (= (1+ (range-end-code range)) code))
+ (setf (range-end-code range) code)
+ (progn
+ (when range
+ (push range all-ranges))
+ (setq range (make-range :index name-index
+ :start-code code
+ :end-code code))))
+ (incf name-index)
+ (setq last-code code)
+ ) ) ) )
) ) ) )
(setq all-chars (nreverse all-chars))
+ (if range
+ (push range all-ranges))
+ (setq all-ranges (nreverse all-ranges))
+ (when aliasfile
+ ;; Read all characters and names from the alias file.
+ (with-open-file (istream aliasfile :direction :input)
+ (loop
+ (let ((line (read-line istream nil nil)))
+ (unless line (return))
+ (let* ((i1 (position #\; line))
+ (i2 (position #\; line :start (1+ i1)))
+ (code-string (subseq line 0 i1))
+ (code (parse-integer code-string :radix 16))
+ (name-string (subseq line (1+ i1) i2))
+ (uc (gethash code all-chars-hashed)))
+ (when uc
+ (push (make-unicode-char :index (unicode-char-index uc)
+ :name name-string)
+ all-aliases)
+ ) ) ) ) ) )
+ (setq all-aliases (nreverse all-aliases)
+ all-chars-and-aliases (append all-chars all-aliases))
;; Split into words.
(let ((words-by-length (make-array 0 :adjustable t)))
- (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
+ (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
+ (mapcar #'unicode-char-name all-chars-and-aliases)))
(let ((i1 0))
(loop
(when (>= i1 (length name)) (return))
@@ -195,12 +217,12 @@
(setf (gethash word (word-list-hashed word-list)) ind-offset)
(incf ind-offset)
) ) ) )
- (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
+ (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
(format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
(gethash word (word-list-hashed (aref words-by-length (length word))))
) )
;; Compute the word-indices for every unicode-char.
- (dolist (uc all-chars)
+ (dolist (uc all-chars-and-aliases)
(let ((name (unicode-char-name uc))
(indices '()))
(let ((i1 0))
@@ -220,8 +242,8 @@
)
) )
;; Sort the list of unicode-chars by word-indices.
- (setq all-chars
- (sort all-chars
+ (setq all-chars-and-aliases
+ (sort all-chars-and-aliases
(lambda (vec1 vec2)
(let ((len1 (length vec1))
(len2 (length vec2)))
@@ -240,10 +262,10 @@
) )
;; Output the word-indices.
(format ostream "static const uint16_t unicode_names[~D] = {~%"
- (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+ (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
)
(let ((i 0))
- (dolist (uc all-chars)
+ (dolist (uc all-chars-and-aliases)
(format ostream " ~{ ~D,~}"
(maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
(coerce (unicode-char-word-indices uc) 'list)
@@ -257,14 +279,14 @@
(incf i (length (unicode-char-word-indices uc)))
) )
(format ostream "};~%")
- (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
+ (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
- (format ostream "unicode_name_to_code[~D] = {~%"
- (length all-chars)
+ (format ostream "unicode_name_to_index[~D] = {~%"
+ (length all-chars-and-aliases)
)
- (dolist (uc all-chars)
+ (dolist (uc all-chars-and-aliases)
(format ostream " { 0x~4,'0X, ~D },"
- (unicode-char-code uc)
+ (unicode-char-index uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
@@ -273,14 +295,14 @@
(format ostream "~%")
)
(format ostream "};~%")
- (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
+ (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
- (format ostream "unicode_code_to_name[~D] = {~%"
+ (format ostream "unicode_index_to_name[~D] = {~%"
(length all-chars)
)
- (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
+ (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
(format ostream " { 0x~4,'0X, ~D },"
- (unicode-char-code uc)
+ (unicode-char-index uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
@@ -290,12 +312,21 @@
)
(format ostream "};~%")
(format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
- (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
)
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
- (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
)
+ (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
+ (length all-ranges))
+ (dolist (range all-ranges)
+ (format ostream " { ~D, ~D, ~D },~%"
+ (range-index range)
+ (- (range-start-code range) (range-index range))
+ (1+ (- (range-end-code range) (range-start-code range))))
+ )
+ (format ostream "};~%")
)
) ) )
-(main (first *args*) (second *args*))
+(main (first *args*) (second *args*) (third *args*))