summaryrefslogtreecommitdiff
path: root/lib/uniname/gen-uninames.lisp
diff options
context:
space:
mode:
authorAndreas Rottmann <a.rottmann@gmx.at>2009-09-14 12:32:44 +0200
committerAndreas Rottmann <a.rottmann@gmx.at>2009-09-14 12:32:44 +0200
commitfa095a4504cbe668e4244547e2c141597bea4ecf (patch)
tree06135820a286ffec47804e75fbf8a147e92acd2e /lib/uniname/gen-uninames.lisp
Imported Upstream version 0.9.1upstream/0.9.1
Diffstat (limited to 'lib/uniname/gen-uninames.lisp')
-rwxr-xr-xlib/uniname/gen-uninames.lisp301
1 files changed, 301 insertions, 0 deletions
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
new file mode 100755
index 00000000..d08e93f0
--- /dev/null
+++ b/lib/uniname/gen-uninames.lisp
@@ -0,0 +1,301 @@
+#!/usr/local/bin/clisp -C
+
+;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
+;;; Bruno Haible 2000-12-28
+
+(defparameter add-comments nil)
+
+(defstruct unicode-char
+ (code nil :type integer)
+ (name nil :type string)
+ word-indices
+ word-indices-index
+)
+
+(defstruct word-list
+ (hashed nil :type hash-table)
+ (sorted nil :type list)
+ size ; number of characters total
+ length ; number of words
+)
+
+(defun main (inputfile outputfile)
+ (declare (type string inputfile outputfile))
+ #+UNICODE (setq *default-file-encoding* charset:utf-8)
+ (let ((all-chars '()))
+ ;; Read all characters and names from the input file.
+ (with-open-file (istream inputfile :direction :input)
+ (loop
+ (let ((line (read-line istream nil nil)))
+ (unless line (return))
+ (let* ((i1 (position #\; line))
+ (i2 (position #\; line :start (1+ i1)))
+ (code-string (subseq line 0 i1))
+ (code (parse-integer code-string :radix 16))
+ (name-string (subseq line (1+ i1) i2)))
+ ; Ignore characters whose name starts with "<".
+ (unless (eql (char name-string 0) #\<)
+ ; Also ignore Hangul syllables; they are treated specially.
+ (unless (<= #xAC00 code #xD7A3)
+ ; Also ignore CJK compatibility ideographs; they are treated
+ ; specially as well.
+ (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
+ (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
+ ; Transform the code so that it fits in 16 bits. In
+ ; Unicode 5.1 the following ranges are used.
+ ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4
+ ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5
+ ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6
+ ; 0x10000..0x10A58 >>12= 0x10 -> 0x7
+ ; 0x12000..0x12473 >>12= 0x12 -> 0x8
+ ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9
+ ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA
+ ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB
+ ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC
+ (flet ((transform (x)
+ (dpb
+ (case (ash x -12)
+ ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
+ (#x0A 5)
+ (#x0F 6)
+ (#x10 7)
+ (#x12 8)
+ (#x1D 9)
+ (#x1F #xA)
+ (#x2F #xB)
+ (#xE0 #xC)
+ (t (error "Update the transform function for 0x~5,'0X" x))
+ )
+ (byte 8 12)
+ x
+ )) )
+ (push (make-unicode-char :code (transform code)
+ :name name-string)
+ all-chars
+ ) ) ) ) )
+ ) ) ) )
+ (setq all-chars (nreverse all-chars))
+ ;; Split into words.
+ (let ((words-by-length (make-array 0 :adjustable t)))
+ (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
+ (let ((i1 0))
+ (loop
+ (when (>= i1 (length name)) (return))
+ (let ((i2 (or (position #\Space name :start i1) (length name))))
+ (let* ((word (subseq name i1 i2))
+ (len (length word)))
+ (when (>= len (length words-by-length))
+ (adjust-array words-by-length (1+ len))
+ )
+ (unless (aref words-by-length len)
+ (setf (aref words-by-length len)
+ (make-word-list
+ :hashed (make-hash-table :test #'equal)
+ :sorted '()
+ ) ) )
+ (let ((word-list (aref words-by-length len)))
+ (unless (gethash word (word-list-hashed word-list))
+ (setf (gethash word (word-list-hashed word-list)) t)
+ (push word (word-list-sorted word-list))
+ ) )
+ )
+ (setq i1 (1+ i2))
+ ) ) ) )
+ ;; Sort the word lists.
+ (dotimes (len (length words-by-length))
+ (unless (aref words-by-length len)
+ (setf (aref words-by-length len)
+ (make-word-list
+ :hashed (make-hash-table :test #'equal)
+ :sorted '()
+ ) ) )
+ (let ((word-list (aref words-by-length len)))
+ (setf (word-list-sorted word-list)
+ (sort (word-list-sorted word-list) #'string<)
+ )
+ (setf (word-list-size word-list)
+ (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
+ )
+ (setf (word-list-length word-list)
+ (length (word-list-sorted word-list))
+ ) ) )
+ ;; Output the tables.
+ (with-open-file (ostream outputfile :direction :output
+ #+UNICODE :external-format #+UNICODE charset:ascii)
+ (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
+ (format ostream "/*~%")
+ (format ostream " * ~A~%" (file-namestring outputfile))
+ (format ostream " *~%")
+ (format ostream " * Unicode character name table.~%")
+ (format ostream " * Generated automatically by the gen-uninames utility.~%")
+ (format ostream " */~%")
+ (format ostream "~%")
+ (format ostream "static const char unicode_name_words[~D] = {~%"
+ (let ((sum 0))
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (incf sum (word-list-size word-list))
+ ) )
+ sum
+ ) )
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (dolist (word (word-list-sorted word-list))
+ (format ostream " ~{ '~C',~}~%" (coerce word 'list))
+ ) ) )
+ (format ostream "};~%")
+ (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
+ (let ((sum 0))
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (incf sum (word-list-length word-list))
+ ) )
+ sum
+ ) )
+ #| ; Redundant data
+ (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
+ (let ((sum 0))
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (incf sum (word-list-length word-list))
+ ) )
+ sum
+ ) )
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (when (word-list-sorted word-list)
+ (format ostream " ")
+ (do ((l (word-list-sorted word-list) (cdr l))
+ (offset 0 (+ offset (length (car l)))))
+ ((endp l))
+ (format ostream "~<~% ~0,79:; ~D,~>" offset)
+ )
+ (format ostream "~%")
+ ) ) )
+ (format ostream "};~%")
+ |#
+ (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
+ (1+ (length words-by-length))
+ )
+ (let ((extra-offset 0)
+ (ind-offset 0))
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (format ostream " { ~D, ~D },~%" extra-offset ind-offset)
+ (incf extra-offset (word-list-size word-list))
+ (incf ind-offset (word-list-length word-list))
+ ) )
+ (format ostream " { ~D, ~D }~%" extra-offset ind-offset)
+ )
+ (format ostream "};~%")
+ (let ((ind-offset 0))
+ (dotimes (len (length words-by-length))
+ (let ((word-list (aref words-by-length len)))
+ (dolist (word (word-list-sorted word-list))
+ (setf (gethash word (word-list-hashed word-list)) ind-offset)
+ (incf ind-offset)
+ ) ) ) )
+ (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
+ (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
+ (gethash word (word-list-hashed (aref words-by-length (length word))))
+ ) )
+ ;; Compute the word-indices for every unicode-char.
+ (dolist (uc all-chars)
+ (let ((name (unicode-char-name uc))
+ (indices '()))
+ (let ((i1 0))
+ (loop
+ (when (>= i1 (length name)) (return))
+ (let ((i2 (or (position #\Space name :start i1) (length name))))
+ (let* ((word (subseq name i1 i2))
+ (len (length word)))
+ (push (gethash word (word-list-hashed (aref words-by-length len)))
+ indices
+ )
+ )
+ (setq i1 (1+ i2))
+ ) ) )
+ (setf (unicode-char-word-indices uc)
+ (coerce (nreverse indices) 'vector)
+ )
+ ) )
+ ;; Sort the list of unicode-chars by word-indices.
+ (setq all-chars
+ (sort all-chars
+ (lambda (vec1 vec2)
+ (let ((len1 (length vec1))
+ (len2 (length vec2)))
+ (do ((i 0 (1+ i)))
+ (nil)
+ (if (< i len2)
+ (if (< i len1)
+ (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
+ ((> (aref vec1 i) (aref vec2 i)) (return nil))
+ )
+ (return t)
+ )
+ (return nil)
+ ) ) ) )
+ :key #'unicode-char-word-indices
+ ) )
+ ;; Output the word-indices.
+ (format ostream "static const uint16_t unicode_names[~D] = {~%"
+ (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+ )
+ (let ((i 0))
+ (dolist (uc all-chars)
+ (format ostream " ~{ ~D,~}"
+ (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
+ (coerce (unicode-char-word-indices uc) 'list)
+ )
+ )
+ (when add-comments
+ (format ostream "~40T/* ~A */" (unicode-char-name uc))
+ )
+ (format ostream "~%")
+ (setf (unicode-char-word-indices-index uc) i)
+ (incf i (length (unicode-char-word-indices uc)))
+ ) )
+ (format ostream "};~%")
+ (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
+ (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
+ (format ostream "unicode_name_to_code[~D] = {~%"
+ (length all-chars)
+ )
+ (dolist (uc all-chars)
+ (format ostream " { 0x~4,'0X, ~D },"
+ (unicode-char-code uc)
+ (unicode-char-word-indices-index uc)
+ )
+ (when add-comments
+ (format ostream "~21T/* ~A */" (unicode-char-name uc))
+ )
+ (format ostream "~%")
+ )
+ (format ostream "};~%")
+ (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
+ (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
+ (format ostream "unicode_code_to_name[~D] = {~%"
+ (length all-chars)
+ )
+ (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
+ (format ostream " { 0x~4,'0X, ~D },"
+ (unicode-char-code uc)
+ (unicode-char-word-indices-index uc)
+ )
+ (when add-comments
+ (format ostream "~21T/* ~A */" (unicode-char-name uc))
+ )
+ (format ostream "~%")
+ )
+ (format ostream "};~%")
+ (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
+ )
+ (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
+ (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+ )
+ )
+) ) )
+
+(main (first *args*) (second *args*))