summaryrefslogtreecommitdiff
path: root/lib/uninorm.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uninorm.h')
-rw-r--r--lib/uninorm.h248
1 files changed, 248 insertions, 0 deletions
diff --git a/lib/uninorm.h b/lib/uninorm.h
new file mode 100644
index 00000000..6850967e
--- /dev/null
+++ b/lib/uninorm.h
@@ -0,0 +1,248 @@
+/* Normalization forms (composition and decomposition) of Unicode strings.
+ Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef _UNINORM_H
+#define _UNINORM_H
+
+/* Get LIBUNISTRING_DLL_VARIABLE. */
+#include <unistring/woe32dll.h>
+
+/* Get size_t. */
+#include <stddef.h>
+
+#include "unitypes.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Conventions:
+
+ All functions prefixed with u8_ operate on UTF-8 encoded strings.
+ Their unit is an uint8_t (1 byte).
+
+ All functions prefixed with u16_ operate on UTF-16 encoded strings.
+ Their unit is an uint16_t (a 2-byte word).
+
+ All functions prefixed with u32_ operate on UCS-4 encoded strings.
+ Their unit is an uint32_t (a 4-byte word).
+
+ All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
+ n units.
+
+ Functions returning a string result take a (resultbuf, lengthp) argument
+ pair. If resultbuf is not NULL and the result fits into *lengthp units,
+ it is put in resultbuf, and resultbuf is returned. Otherwise, a freshly
+ allocated string is returned. In both cases, *lengthp is set to the
+ length (number of units) of the returned string. In case of error,
+ NULL is returned and errno is set. */
+
+
+enum
+{
+ UC_DECOMP_CANONICAL,/* Canonical decomposition. */
+ UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */
+ UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */
+ UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */
+ UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */
+ UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */
+ UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */
+ UC_DECOMP_CIRCLE, /* <circle> An encircled form. */
+ UC_DECOMP_SUPER, /* <super> A superscript form. */
+ UC_DECOMP_SUB, /* <sub> A subscript form. */
+ UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */
+ UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */
+ UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */
+ UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */
+ UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */
+ UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */
+ UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */
+};
+
+/* Maximum size of decomposition of a single Unicode character. */
+#define UC_DECOMPOSITION_MAX_LENGTH 32
+
+/* Return the character decomposition mapping of a Unicode character.
+ DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
+ ucs_t elements.
+ When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
+ filled and N is returned. Otherwise -1 is returned. */
+extern int
+ uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
+
+/* Return the canonical character decomposition mapping of a Unicode character.
+ DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
+ ucs_t elements.
+ When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
+ returned. Otherwise -1 is returned. */
+extern int
+ uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
+
+
+/* Attempt to combine the Unicode characters uc1, uc2.
+ uc1 is known to have canonical combining class 0.
+ Return the combination of uc1 and uc2, if it exists.
+ Return 0 otherwise.
+ Not all decompositions can be recombined using this function. See the
+ Unicode file CompositionExclusions.txt for details. */
+extern ucs4_t
+ uc_composition (ucs4_t uc1, ucs4_t uc2);
+
+
+/* An object of type uninorm_t denotes a Unicode normalization form. */
+struct unicode_normalization_form;
+typedef const struct unicode_normalization_form *uninorm_t;
+
+/* UNINORM_NFD: Normalization form D: canonical decomposition. */
+extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfd;
+#define UNINORM_NFD (&uninorm_nfd)
+
+/* UNINORM_NFC: Normalization form C: canonical decomposition, then
+ canonical composition. */
+extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfc;
+#define UNINORM_NFC (&uninorm_nfc)
+
+/* UNINORM_NFKD: Normalization form KD: compatibility decomposition. */
+extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkd;
+#define UNINORM_NFKD (&uninorm_nfkd)
+
+/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
+ canonical composition. */
+extern LIBUNISTRING_DLL_VARIABLE const struct unicode_normalization_form uninorm_nfkc;
+#define UNINORM_NFKC (&uninorm_nfkc)
+
+/* Test whether a normalization form does compatibility decomposition. */
+#define uninorm_is_compat_decomposing(nf) \
+ ((* (const unsigned int *) (nf) >> 0) & 1)
+
+/* Test whether a normalization form includes canonical composition. */
+#define uninorm_is_composing(nf) \
+ ((* (const unsigned int *) (nf) >> 1) & 1)
+
+/* Return the decomposing variant of a normalization form.
+ This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */
+extern uninorm_t uninorm_decomposing_form (uninorm_t nf);
+
+
+/* Return the specified normalization form of a string. */
+extern uint8_t *
+ u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
+ uint8_t *resultbuf, size_t *lengthp);
+extern uint16_t *
+ u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
+ uint16_t *resultbuf, size_t *lengthp);
+extern uint32_t *
+ u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
+ uint32_t *resultbuf, size_t *lengthp);
+
+
+/* Compare S1 and S2, ignoring differences in normalization.
+ NF must be either UNINORM_NFD or UNINORM_NFKD.
+ If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
+ return 0. Upon failure, return -1 with errno set. */
+extern int
+ u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
+ uninorm_t nf, int *resultp);
+extern int
+ u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
+ uninorm_t nf, int *resultp);
+extern int
+ u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
+ uninorm_t nf, int *resultp);
+
+
+/* Converts the string S of length N to a NUL-terminated byte sequence, in such
+ a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
+ equivalent to comparing S1 and S2 with uN_normcoll().
+ NF must be either UNINORM_NFC or UNINORM_NFKC. */
+extern char *
+ u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
+ char *resultbuf, size_t *lengthp);
+extern char *
+ u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
+ char *resultbuf, size_t *lengthp);
+extern char *
+ u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
+ char *resultbuf, size_t *lengthp);
+
+
+/* Compare S1 and S2, ignoring differences in normalization, using the
+ collation rules of the current locale.
+ NF must be either UNINORM_NFC or UNINORM_NFKC.
+ If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
+ return 0. Upon failure, return -1 with errno set. */
+extern int
+ u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
+ uninorm_t nf, int *resultp);
+extern int
+ u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
+ uninorm_t nf, int *resultp);
+extern int
+ u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
+ uninorm_t nf, int *resultp);
+
+
+/* Normalization of a stream of Unicode characters.
+
+ A "stream of Unicode characters" is essentially a function that accepts an
+ ucs4_t argument repeatedly, optionally combined with a function that
+ "flushes" the stream. */
+
+/* Data type of a stream of Unicode characters that normalizes its input
+ according to a given normalization form and passes the normalized character
+ sequence to the encapsulated stream of Unicode characters. */
+struct uninorm_filter;
+
+/* Create and return a normalization filter for Unicode characters.
+ The pair (stream_func, stream_data) is the encapsulated stream.
+ stream_func (stream_data, uc) receives the Unicode character uc
+ and returns 0 if successful, or -1 with errno set upon failure.
+ Return the new filter, or NULL with errno set upon failure. */
+extern struct uninorm_filter *
+ uninorm_filter_create (uninorm_t nf,
+ int (*stream_func) (void *stream_data, ucs4_t uc),
+ void *stream_data);
+
+/* Stuff a Unicode character into a normalizing filter.
+ Return 0 if successful, or -1 with errno set upon failure. */
+extern int
+ uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
+
+/* Bring data buffered in the filter to its destination, the encapsulated
+ stream.
+ Return 0 if successful, or -1 with errno set upon failure.
+ Note! If after calling this function, additional characters are written
+ into the filter, the resulting character sequence in the encapsulated stream
+ will not necessarily be normalized. */
+extern int
+ uninorm_filter_flush (struct uninorm_filter *filter);
+
+/* Bring data buffered in the filter to its destination, the encapsulated
+ stream, then close and free the filter.
+ Return 0 if successful, or -1 with errno set upon failure. */
+extern int
+ uninorm_filter_free (struct uninorm_filter *filter);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _UNINORM_H */