diff options
Diffstat (limited to 'lib/uninorm.h')
-rw-r--r-- | lib/uninorm.h | 248 |
1 files changed, 248 insertions, 0 deletions
diff --git a/lib/uninorm.h b/lib/uninorm.h new file mode 100644 index 00000000..6850967e --- /dev/null +++ b/lib/uninorm.h @@ -0,0 +1,248 @@ +/* Normalization forms (composition and decomposition) of Unicode strings. + Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef _UNINORM_H +#define _UNINORM_H + +/* Get LIBUNISTRING_DLL_VARIABLE. */ +#include <unistring/woe32dll.h> + +/* Get size_t. */ +#include <stddef.h> + +#include "unitypes.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Conventions: + + All functions prefixed with u8_ operate on UTF-8 encoded strings. + Their unit is an uint8_t (1 byte). + + All functions prefixed with u16_ operate on UTF-16 encoded strings. + Their unit is an uint16_t (a 2-byte word). + + All functions prefixed with u32_ operate on UCS-4 encoded strings. + Their unit is an uint32_t (a 4-byte word). + + All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly + n units. + + Functions returning a string result take a (resultbuf, lengthp) argument + pair. If resultbuf is not NULL and the result fits into *lengthp units, + it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly + allocated string is returned. In both cases, *lengthp is set to the + length (number of units) of the returned string. In case of error, + NULL is returned and errno is set. */ + + +enum +{ + UC_DECOMP_CANONICAL,/* Canonical decomposition. */ + UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ + UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ + UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ + UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ + UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ + UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ + UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ + UC_DECOMP_SUPER, /* <super> A superscript form. */ + UC_DECOMP_SUB, /* <sub> A subscript form. */ + UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ + UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ + UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ + UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ + UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ + UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ + UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ +}; + +/* Maximum size of decomposition of a single Unicode character. */ +#define UC_DECOMPOSITION_MAX_LENGTH 32 + +/* Return the character decomposition mapping of a Unicode character. + DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH + ucs_t elements. + When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are + filled and N is returned. Otherwise -1 is returned. */ +extern int + uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition); + +/* Return the canonical character decomposition mapping of a Unicode character. + DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH + ucs_t elements. + When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is + returned. Otherwise -1 is returned. */ +extern int + uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition); + + +/* Attempt to combine the Unicode characters uc1, uc2. + uc1 is known to have canonical combining class 0. + Return the combination of uc1 and uc2, if it exists. + Return 0 otherwise. + Not all decompositions can be recombined using this function. See the + Unicode file CompositionExclusions.txt for details. */ +extern ucs4_t + uc_composition (ucs4_t uc1, ucs4_t uc2); + + +/* An object of type uninorm_t denotes a Unicode normalization form. */ +struct unicode_normalization_form; +typedef const struct unicode_normalization_form *uninorm_t; + +/* UNINORM_NFD: Normalization form D: canonical decomposition. */ +extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd; +#define UNINORM_NFD (&uninorm_nfd) + +/* UNINORM_NFC: Normalization form C: canonical decomposition, then + canonical composition. */ +extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc; +#define UNINORM_NFC (&uninorm_nfc) + +/* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */ +extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd; +#define UNINORM_NFKD (&uninorm_nfkd) + +/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then + canonical composition. */ +extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc; +#define UNINORM_NFKC (&uninorm_nfkc) + +/* Test whether a normalization form does compatibility decomposition. */ +#define uninorm_is_compat_decomposing(nf) \ + ((* (const unsigned int *) (nf) >> 0) & 1) + +/* Test whether a normalization form includes canonical composition. */ +#define uninorm_is_composing(nf) \ + ((* (const unsigned int *) (nf) >> 1) & 1) + +/* Return the decomposing variant of a normalization form. + This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */ +extern uninorm_t uninorm_decomposing_form (uninorm_t nf); + + +/* Return the specified normalization form of a string. */ +extern uint8_t * + u8_normalize (uninorm_t nf, const uint8_t *s, size_t n, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_normalize (uninorm_t nf, const uint16_t *s, size_t n, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_normalize (uninorm_t nf, const uint32_t *s, size_t n, + uint32_t *resultbuf, size_t *lengthp); + + +/* Compare S1 and S2, ignoring differences in normalization. + NF must be either UNINORM_NFD or UNINORM_NFKD. + If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and + return 0. Upon failure, return -1 with errno set. */ +extern int + u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, + uninorm_t nf, int *resultp); + + +/* Converts the string S of length N to a NUL-terminated byte sequence, in such + a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is + equivalent to comparing S1 and S2 with uN_normcoll(). + NF must be either UNINORM_NFC or UNINORM_NFKC. */ +extern char * + u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf, + char *resultbuf, size_t *lengthp); +extern char * + u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf, + char *resultbuf, size_t *lengthp); +extern char * + u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf, + char *resultbuf, size_t *lengthp); + + +/* Compare S1 and S2, ignoring differences in normalization, using the + collation rules of the current locale. + NF must be either UNINORM_NFC or UNINORM_NFKC. + If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and + return 0. Upon failure, return -1 with errno set. */ +extern int + u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2, + uninorm_t nf, int *resultp); +extern int + u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2, + uninorm_t nf, int *resultp); + + +/* Normalization of a stream of Unicode characters. + + A "stream of Unicode characters" is essentially a function that accepts an + ucs4_t argument repeatedly, optionally combined with a function that + "flushes" the stream. */ + +/* Data type of a stream of Unicode characters that normalizes its input + according to a given normalization form and passes the normalized character + sequence to the encapsulated stream of Unicode characters. */ +struct uninorm_filter; + +/* Create and return a normalization filter for Unicode characters. + The pair (stream_func, stream_data) is the encapsulated stream. + stream_func (stream_data, uc) receives the Unicode character uc + and returns 0 if successful, or -1 with errno set upon failure. + Return the new filter, or NULL with errno set upon failure. */ +extern struct uninorm_filter * + uninorm_filter_create (uninorm_t nf, + int (*stream_func) (void *stream_data, ucs4_t uc), + void *stream_data); + +/* Stuff a Unicode character into a normalizing filter. + Return 0 if successful, or -1 with errno set upon failure. */ +extern int + uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc); + +/* Bring data buffered in the filter to its destination, the encapsulated + stream. + Return 0 if successful, or -1 with errno set upon failure. + Note! If after calling this function, additional characters are written + into the filter, the resulting character sequence in the encapsulated stream + will not necessarily be normalized. */ +extern int + uninorm_filter_flush (struct uninorm_filter *filter); + +/* Bring data buffered in the filter to its destination, the encapsulated + stream, then close and free the filter. + Return 0 if successful, or -1 with errno set upon failure. */ +extern int + uninorm_filter_free (struct uninorm_filter *filter); + + +#ifdef __cplusplus +} +#endif + + +#endif /* _UNINORM_H */ |