diff options
Diffstat (limited to 'lib/unicase.h')
-rw-r--r-- | lib/unicase.h | 457 |
1 files changed, 457 insertions, 0 deletions
diff --git a/lib/unicase.h b/lib/unicase.h new file mode 100644 index 00000000..e360ef44 --- /dev/null +++ b/lib/unicase.h @@ -0,0 +1,457 @@ +/* Unicode character case mappings. + Copyright (C) 2002, 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifndef _UNICASE_H +#define _UNICASE_H + +#include "unitypes.h" + +/* Get bool. */ +#include <unistring/stdbool.h> + +/* Get size_t. */ +#include <stddef.h> + +/* Get uninorm_t. */ +#include "uninorm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* ========================================================================= */ + +/* Character case mappings. + These mappings are locale and context independent. + WARNING! These functions are not sufficient for languages such as German. + Better use the functions below that treat an entire string at once and are + language aware. */ + +/* Return the uppercase mapping of a Unicode character. */ +extern ucs4_t + uc_toupper (ucs4_t uc); + +/* Return the lowercase mapping of a Unicode character. */ +extern ucs4_t + uc_tolower (ucs4_t uc); + +/* Return the titlecase mapping of a Unicode character. */ +extern ucs4_t + uc_totitle (ucs4_t uc); + +/* ========================================================================= */ + +/* String case mappings. */ + +/* These functions are locale dependent. The iso639_language argument + identifies the language (e.g. "tr" for Turkish). NULL means to use + locale independent case mappings. */ + +/* Return the ISO 639 language code of the current locale. + Return "" if it is unknown, or in the "C" locale. */ +extern const char * + uc_locale_language (void); + +/* Conventions: + + All functions prefixed with u8_ operate on UTF-8 encoded strings. + Their unit is an uint8_t (1 byte). + + All functions prefixed with u16_ operate on UTF-16 encoded strings. + Their unit is an uint16_t (a 2-byte word). + + All functions prefixed with u32_ operate on UCS-4 encoded strings. + Their unit is an uint32_t (a 4-byte word). + + All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly + n units. + + Functions returning a string result take a (resultbuf, lengthp) argument + pair. If resultbuf is not NULL and the result fits into *lengthp units, + it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly + allocated string is returned. In both cases, *lengthp is set to the + length (number of units) of the returned string. In case of error, + NULL is returned and errno is set. */ + +/* Return the uppercase mapping of a string. + The nf argument identifies the normalization form to apply after the + case-mapping. It can also be NULL, for no normalization. */ +extern uint8_t * + u8_toupper (const uint8_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_toupper (const uint16_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_toupper (const uint32_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* Return the lowercase mapping of a string. + The nf argument identifies the normalization form to apply after the + case-mapping. It can also be NULL, for no normalization. */ +extern uint8_t * + u8_tolower (const uint8_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_tolower (const uint16_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_tolower (const uint32_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* Return the titlecase mapping of a string. + The nf argument identifies the normalization form to apply after the + case-mapping. It can also be NULL, for no normalization. */ +extern uint8_t * + u8_totitle (const uint8_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_totitle (const uint16_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_totitle (const uint32_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* The case-mapping context given by a prefix string. */ +typedef struct casing_prefix_context + { + /* These fields are private, undocumented. */ + uint32_t last_char_except_ignorable; + uint32_t last_char_normal_or_above; + } + casing_prefix_context_t; +/* The case-mapping context of the empty prefix string. */ +extern LIBUNISTRING_DLL_VARIABLE const casing_prefix_context_t unicase_empty_prefix_context; +/* Return the case-mapping context of a given prefix string. */ +extern casing_prefix_context_t + u8_casing_prefix_context (const uint8_t *s, size_t n); +extern casing_prefix_context_t + u16_casing_prefix_context (const uint16_t *s, size_t n); +extern casing_prefix_context_t + u32_casing_prefix_context (const uint32_t *s, size_t n); +/* Return the case-mapping context of the prefix concat(A, S), given the + case-mapping context of the prefix A. */ +extern casing_prefix_context_t + u8_casing_prefixes_context (const uint8_t *s, size_t n, + casing_prefix_context_t a_context); +extern casing_prefix_context_t + u16_casing_prefixes_context (const uint16_t *s, size_t n, + casing_prefix_context_t a_context); +extern casing_prefix_context_t + u32_casing_prefixes_context (const uint32_t *s, size_t n, + casing_prefix_context_t a_context); + +/* The case-mapping context given by a suffix string. */ +typedef struct casing_suffix_context + { + /* These fields are private, undocumented. */ + uint32_t first_char_except_ignorable; + uint32_t bits; + } + casing_suffix_context_t; +/* The case-mapping context of the empty suffix string. */ +extern LIBUNISTRING_DLL_VARIABLE const casing_suffix_context_t unicase_empty_suffix_context; +/* Return the case-mapping context of a given suffix string. */ +extern casing_suffix_context_t + u8_casing_suffix_context (const uint8_t *s, size_t n); +extern casing_suffix_context_t + u16_casing_suffix_context (const uint16_t *s, size_t n); +extern casing_suffix_context_t + u32_casing_suffix_context (const uint32_t *s, size_t n); +/* Return the case-mapping context of the suffix concat(S, A), given the + case-mapping context of the suffix A. */ +extern casing_suffix_context_t + u8_casing_suffixes_context (const uint8_t *s, size_t n, + casing_suffix_context_t a_context); +extern casing_suffix_context_t + u16_casing_suffixes_context (const uint16_t *s, size_t n, + casing_suffix_context_t a_context); +extern casing_suffix_context_t + u32_casing_suffixes_context (const uint32_t *s, size_t n, + casing_suffix_context_t a_context); + +/* Return the uppercase mapping of a string that is surrounded by a prefix + and a suffix. */ +extern uint8_t * + u8_ct_toupper (const uint8_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_ct_toupper (const uint16_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_ct_toupper (const uint32_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* Return the lowercase mapping of a string that is surrounded by a prefix + and a suffix. */ +extern uint8_t * + u8_ct_tolower (const uint8_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_ct_tolower (const uint16_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_ct_tolower (const uint32_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* Return the titlecase mapping of a string that is surrounded by a prefix + and a suffix. */ +extern uint8_t * + u8_ct_totitle (const uint8_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_ct_totitle (const uint16_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_ct_totitle (const uint32_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* Return the case folded string. + Comparing uN_casefold (S1) and uN_casefold (S2) with uN_cmp2() is equivalent + to comparing S1 and S2 with uN_casecmp(). + The nf argument identifies the normalization form to apply after the + case-mapping. It can also be NULL, for no normalization. */ +extern uint8_t * + u8_casefold (const uint8_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_casefold (const uint16_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_casefold (const uint32_t *s, size_t n, const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); +/* Likewise, for a string that is surrounded by a prefix and a suffix. */ +extern uint8_t * + u8_ct_casefold (const uint8_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint8_t *resultbuf, size_t *lengthp); +extern uint16_t * + u16_ct_casefold (const uint16_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint16_t *resultbuf, size_t *lengthp); +extern uint32_t * + u32_ct_casefold (const uint32_t *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + uint32_t *resultbuf, size_t *lengthp); + +/* Compare S1 and S2, ignoring differences in case and normalization. + The nf argument identifies the normalization form to apply after the + case-mapping. It can also be NULL, for no normalization. + If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and + return 0. Upon failure, return -1 with errno set. */ +extern int + u8_casecmp (const uint8_t *s1, size_t n1, + const uint8_t *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); +extern int + u16_casecmp (const uint16_t *s1, size_t n1, + const uint16_t *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); +extern int + u32_casecmp (const uint32_t *s1, size_t n1, + const uint32_t *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); +extern int + ulc_casecmp (const char *s1, size_t n1, + const char *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); + +/* Convert the string S of length N to a NUL-terminated byte sequence, in such + a way that comparing uN_casexfrm (S1) and uN_casexfrm (S2) with the gnulib + function memcmp2() is equivalent to comparing S1 and S2 with uN_casecoll(). + NF must be either UNINORM_NFC, UNINORM_NFKC, or NULL for no normalization. */ +extern char * + u8_casexfrm (const uint8_t *s, size_t n, const char *iso639_language, + uninorm_t nf, char *resultbuf, size_t *lengthp); +extern char * + u16_casexfrm (const uint16_t *s, size_t n, const char *iso639_language, + uninorm_t nf, char *resultbuf, size_t *lengthp); +extern char * + u32_casexfrm (const uint32_t *s, size_t n, const char *iso639_language, + uninorm_t nf, char *resultbuf, size_t *lengthp); +extern char * + ulc_casexfrm (const char *s, size_t n, const char *iso639_language, + uninorm_t nf, char *resultbuf, size_t *lengthp); + +/* Compare S1 and S2, ignoring differences in case and normalization, using the + collation rules of the current locale. + The nf argument identifies the normalization form to apply after the + case-mapping. It must be either UNINORM_NFC or UNINORM_NFKC. It can also + be NULL, for no normalization. + If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and + return 0. Upon failure, return -1 with errno set. */ +extern int + u8_casecoll (const uint8_t *s1, size_t n1, + const uint8_t *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); +extern int + u16_casecoll (const uint16_t *s1, size_t n1, + const uint16_t *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); +extern int + u32_casecoll (const uint32_t *s1, size_t n1, + const uint32_t *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); +extern int + ulc_casecoll (const char *s1, size_t n1, + const char *s2, size_t n2, + const char *iso639_language, uninorm_t nf, int *resultp); + + +/* Set *RESULTP to true if mapping NFD(S) to upper case is a no-op, or to false + otherwise, and return 0. Upon failure, return -1 with errno set. */ +extern int + u8_is_uppercase (const uint8_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u16_is_uppercase (const uint16_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u32_is_uppercase (const uint32_t *s, size_t n, + const char *iso639_language, + bool *resultp); + +/* Set *RESULTP to true if mapping NFD(S) to lower case is a no-op, or to false + otherwise, and return 0. Upon failure, return -1 with errno set. */ +extern int + u8_is_lowercase (const uint8_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u16_is_lowercase (const uint16_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u32_is_lowercase (const uint32_t *s, size_t n, + const char *iso639_language, + bool *resultp); + +/* Set *RESULTP to true if mapping NFD(S) to title case is a no-op, or to false + otherwise, and return 0. Upon failure, return -1 with errno set. */ +extern int + u8_is_titlecase (const uint8_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u16_is_titlecase (const uint16_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u32_is_titlecase (const uint32_t *s, size_t n, + const char *iso639_language, + bool *resultp); + +/* Set *RESULTP to true if applying case folding to NFD(S) is a no-op, or to + false otherwise, and return 0. Upon failure, return -1 with errno set. */ +extern int + u8_is_casefolded (const uint8_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u16_is_casefolded (const uint16_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u32_is_casefolded (const uint32_t *s, size_t n, + const char *iso639_language, + bool *resultp); + +/* Set *RESULTP to true if case matters for S, that is, if mapping NFD(S) to + either upper case or lower case or title case is not a no-op. + Set *RESULTP to false if NFD(S) maps to itself under the upper case mapping, + under the lower case mapping, and under the title case mapping; in other + words, when NFD(S) consists entirely of caseless characters. + Upon failure, return -1 with errno set. */ +extern int + u8_is_cased (const uint8_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u16_is_cased (const uint16_t *s, size_t n, + const char *iso639_language, + bool *resultp); +extern int + u32_is_cased (const uint32_t *s, size_t n, + const char *iso639_language, + bool *resultp); + + +/* ========================================================================= */ + +#ifdef __cplusplus +} +#endif + +#endif /* _UNICASE_H */ |