1 files changed, 85 insertions, 54 deletions
diff --git a/lib/uniname/gen-uninames.lisp b/lib/uniname/gen-uninames.lisp
index d08e93f0..9f795621 100755
--- a/lib/uniname/gen-uninames.lisp
+++ b/lib/uniname/gen-uninames.lisp
@@ -6,12 +6,18 @@
 (defparameter add-comments nil)
 
 (defstruct unicode-char
-  (code nil :type integer)
+  (index nil :type integer)
   (name nil :type string)
   word-indices
   word-indices-index
 )
 
+(defstruct range
+  (index nil :type integer)
+  (start-code nil :type integer)
+  (end-code nil :type integer)
+)
+
 (defstruct word-list
   (hashed nil :type hash-table)
   (sorted nil :type list)
@@ -19,10 +25,16 @@
   length                        ; number of words
 )
 
-(defun main (inputfile outputfile)
-  (declare (type string inputfile outputfile))
+(defun main (inputfile outputfile aliasfile)
+  (declare (type string inputfile outputfile aliasfile))
   #+UNICODE (setq *default-file-encoding* charset:utf-8)
-  (let ((all-chars '()))
+  (let ((all-chars '())
+        (all-chars-hashed (make-hash-table :test #'equal))
+        (all-aliases '())
+        all-chars-and-aliases
+        (all-ranges '())
+        (name-index 0)
+        range)
     ;; Read all characters and names from the input file.
     (with-open-file (istream inputfile :direction :input)
       (loop
@@ -41,43 +53,53 @@
                 ; specially as well.
                 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
                             (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
-                  ; Transform the code so that it fits in 16 bits. In
-                  ; Unicode 5.1 the following ranges are used.
-                  ;   0x00000..0x04DFF  >>12=  0x00..0x04  -> 0x0..0x4
-                  ;   0x0A000..0x0AAFF  >>12=  0x0A        -> 0x5
-                  ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x6
-                  ;   0x10000..0x10A58  >>12=  0x10        -> 0x7
-                  ;   0x12000..0x12473  >>12=  0x12        -> 0x8
-                  ;   0x1D000..0x1D7FF  >>12=  0x1D        -> 0x9
-                  ;   0x1F000..0x1F093  >>12=  0x1F        -> 0xA
-                  ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0xB
-                  ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0xC
-                  (flet ((transform (x)
-                           (dpb
-                             (case (ash x -12)
-                               ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
-                               (#x0A 5)
-                               (#x0F 6)
-                               (#x10 7)
-                               (#x12 8)
-                               (#x1D 9)
-                               (#x1F #xA)
-                               (#x2F #xB)
-                               (#xE0 #xC)
-                               (t (error "Update the transform function for 0x~5,'0X" x))
-                             )
-                             (byte 8 12)
-                             x
-                        )) )
-                    (push (make-unicode-char :code (transform code)
+                  ;; Also ignore variationselectors; they are treated
+                  ;; specially as well.
+                  (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
+                    (push (make-unicode-char :index name-index
                                              :name name-string)
-                          all-chars
-            ) ) ) ) )
+                          all-chars)
+                    (setf (gethash code all-chars-hashed) (car all-chars))
+                    ;; Update the contiguous range, or start a new range.
+                    (if (and range (= (1+ (range-end-code range)) code))
+                        (setf (range-end-code range) code)
+                      (progn
+                        (when range
+                          (push range all-ranges))
+                        (setq range (make-range :index name-index
+                                                :start-code code
+                                                :end-code code))))
+                    (incf name-index)
+                    (setq last-code code)
+                  ) ) ) )
     ) ) ) )
     (setq all-chars (nreverse all-chars))
+    (if range
+        (push range all-ranges))
+    (setq all-ranges (nreverse all-ranges))
+    (when aliasfile
+      ;; Read all characters and names from the alias file.
+      (with-open-file (istream aliasfile :direction :input)
+        (loop
+         (let ((line (read-line istream nil nil)))
+           (unless line (return))
+           (let* ((i1 (position #\; line))
+                  (i2 (position #\; line :start (1+ i1)))
+                  (code-string (subseq line 0 i1))
+                  (code (parse-integer code-string :radix 16))
+                  (name-string (subseq line (1+ i1) i2))
+                  (uc (gethash code all-chars-hashed)))
+             (when uc
+               (push (make-unicode-char :index (unicode-char-index uc)
+                                        :name name-string)
+                     all-aliases)
+             ) ) ) ) ) )
+    (setq all-aliases (nreverse all-aliases)
+          all-chars-and-aliases (append all-chars all-aliases))
     ;; Split into words.
     (let ((words-by-length (make-array 0 :adjustable t)))
-      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
+      (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
+                           (mapcar #'unicode-char-name all-chars-and-aliases)))
         (let ((i1 0))
           (loop
             (when (>= i1 (length name)) (return))
@@ -195,12 +217,12 @@
                 (setf (gethash word (word-list-hashed word-list)) ind-offset)
                 (incf ind-offset)
         ) ) ) )
-        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
+        (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
           (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
                           (gethash word (word-list-hashed (aref words-by-length (length word))))
         ) )
         ;; Compute the word-indices for every unicode-char.
-        (dolist (uc all-chars)
+        (dolist (uc all-chars-and-aliases)
           (let ((name (unicode-char-name uc))
                 (indices '()))
             (let ((i1 0))
@@ -220,8 +242,8 @@
             )
         ) )
         ;; Sort the list of unicode-chars by word-indices.
-        (setq all-chars
-              (sort all-chars
+        (setq all-chars-and-aliases
+              (sort all-chars-and-aliases
                     (lambda (vec1 vec2)
                       (let ((len1 (length vec1))
                             (len2 (length vec2)))
@@ -240,10 +262,10 @@
         )     )
         ;; Output the word-indices.
         (format ostream "static const uint16_t unicode_names[~D] = {~%"
-                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+                        (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
         )
         (let ((i 0))
-          (dolist (uc all-chars)
+          (dolist (uc all-chars-and-aliases)
             (format ostream " ~{ ~D,~}"
                             (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
                                      (coerce (unicode-char-word-indices uc) 'list)
@@ -257,14 +279,14 @@
             (incf i (length (unicode-char-word-indices uc)))
         ) )
         (format ostream "};~%")
-        (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
+        (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
         (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
-        (format ostream "unicode_name_to_code[~D] = {~%"
-                        (length all-chars)
+        (format ostream "unicode_name_to_index[~D] = {~%"
+                        (length all-chars-and-aliases)
         )
-        (dolist (uc all-chars)
+        (dolist (uc all-chars-and-aliases)
           (format ostream "  { 0x~4,'0X, ~D },"
-                          (unicode-char-code uc)
+                          (unicode-char-index uc)
                           (unicode-char-word-indices-index uc)
           )
           (when add-comments
@@ -273,14 +295,14 @@
           (format ostream "~%")
         )
         (format ostream "};~%")
-        (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
+        (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
         (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
-        (format ostream "unicode_code_to_name[~D] = {~%"
+        (format ostream "unicode_index_to_name[~D] = {~%"
                         (length all-chars)
         )
-        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
+        (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
           (format ostream "  { 0x~4,'0X, ~D },"
-                          (unicode-char-code uc)
+                          (unicode-char-index uc)
                           (unicode-char-word-indices-index uc)
           )
           (when add-comments
@@ -290,12 +312,21 @@
         )
         (format ostream "};~%")
         (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
-                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
+                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
         )
         (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
-                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
+                        (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
         )
+        (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
+                        (length all-ranges))
+        (dolist (range all-ranges)
+          (format ostream "  { ~D, ~D, ~D },~%"
+                  (range-index range)
+                  (- (range-start-code range) (range-index range))
+                  (1+ (- (range-end-code range) (range-start-code range))))
+        )
+        (format ostream "};~%")
       )
 ) ) )
 
-(main (first *args*) (second *args*))
+(main (first *args*) (second *args*) (third *args*))