summaryrefslogtreecommitdiff
path: root/doc/libunistring_12.html
diff options
context:
space:
mode:
Diffstat (limited to 'doc/libunistring_12.html')
-rw-r--r--doc/libunistring_12.html475
1 files changed, 391 insertions, 84 deletions
diff --git a/doc/libunistring_12.html b/doc/libunistring_12.html
index f4bee9d3..1a4db370 100644
--- a/doc/libunistring_12.html
+++ b/doc/libunistring_12.html
@@ -1,6 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html401/loose.dtd">
<html>
-<!-- Created on July, 8 2015 by texi2html 1.78a -->
+<!-- Created on March, 30 2010 by texi2html 1.78a -->
<!--
Written by: Lionel Cons <Lionel.Cons@cern.ch> (original author)
Karl Berry <karl@freefriends.org>
@@ -11,10 +11,10 @@ Send bugs and suggestions to <texi2html-bug@nongnu.org>
-->
<head>
-<title>GNU libunistring: 12. Line breaking &lt;unilbrk.h&gt;</title>
+<title>GNU libunistring: 12. Normalization forms (composition and decomposition) &lt;uninorm.h&gt;</title>
-<meta name="description" content="GNU libunistring: 12. Line breaking &lt;unilbrk.h&gt;">
-<meta name="keywords" content="GNU libunistring: 12. Line breaking &lt;unilbrk.h&gt;">
+<meta name="description" content="GNU libunistring: 12. Normalization forms (composition and decomposition) &lt;uninorm.h&gt;">
+<meta name="keywords" content="GNU libunistring: 12. Normalization forms (composition and decomposition) &lt;uninorm.h&gt;">
<meta name="resource-type" content="document">
<meta name="distribution" content="global">
<meta name="Generator" content="texi2html 1.78a">
@@ -42,7 +42,7 @@ ul.toc {list-style: none}
<body lang="en" bgcolor="#FFFFFF" text="#000000" link="#0000FF" vlink="#800080" alink="#FF0000">
<table cellpadding="1" cellspacing="1" border="0">
-<tr><td valign="middle" align="left">[<a href="libunistring_11.html#SEC44" title="Beginning of this chapter or previous chapter"> &lt;&lt; </a>]</td>
+<tr><td valign="middle" align="left">[<a href="libunistring_11.html#SEC41" title="Beginning of this chapter or previous chapter"> &lt;&lt; </a>]</td>
<td valign="middle" align="left">[<a href="libunistring_13.html#SEC48" title="Next chapter"> &gt;&gt; </a>]</td>
<td valign="middle" align="left"> &nbsp; </td>
<td valign="middle" align="left"> &nbsp; </td>
@@ -51,133 +51,440 @@ ul.toc {list-style: none}
<td valign="middle" align="left"> &nbsp; </td>
<td valign="middle" align="left">[<a href="libunistring.html#SEC_Top" title="Cover (top) of document">Top</a>]</td>
<td valign="middle" align="left">[<a href="libunistring.html#SEC_Contents" title="Table of contents">Contents</a>]</td>
-<td valign="middle" align="left">[<a href="libunistring_19.html#SEC77" title="Index">Index</a>]</td>
+<td valign="middle" align="left">[<a href="libunistring_18.html#SEC71" title="Index">Index</a>]</td>
<td valign="middle" align="left">[<a href="libunistring_abt.html#SEC_About" title="About (help)"> ? </a>]</td>
</tr></table>
<hr size="2">
-<a name="unilbrk_002eh"></a>
-<a name="SEC47"></a>
-<h1 class="chapter"> <a href="libunistring.html#TOC47">12. Line breaking <code>&lt;unilbrk.h&gt;</code></a> </h1>
+<a name="uninorm_002eh"></a>
+<a name="SEC42"></a>
+<h1 class="chapter"> <a href="libunistring.html#TOC42">12. Normalization forms (composition and decomposition) <code>&lt;uninorm.h&gt;</code></a> </h1>
+
+<p>This include file defines functions for transforming Unicode strings to one
+of the four normal forms, known as NFC, NFD, NKFC, NFKD. These
+transformations involve decomposition and &mdash; for NFC and NFKC &mdash; composition
+of Unicode characters.
+</p>
+
+<hr size="6">
+<a name="Decomposition-of-characters"></a>
+<a name="SEC43"></a>
+<h2 class="section"> <a href="libunistring.html#TOC43">12.1 Decomposition of Unicode characters</a> </h2>
+
+<p>The following enumerated values are the possible types of decomposition of a
+Unicode character.
+</p>
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_CANONICAL</b>
+<a name="IDX646"></a>
+</dt>
+<dd><p>Denotes canonical decomposition.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_FONT</b>
+<a name="IDX647"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;font&gt;</code>. Denotes a font variant (e.g. a blackletter form).
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_NOBREAK</b>
+<a name="IDX648"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;noBreak&gt;</code>.
+Denotes a no-break version of a space or hyphen.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_INITIAL</b>
+<a name="IDX649"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;initial&gt;</code>.
+Denotes an initial presentation form (Arabic).
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_MEDIAL</b>
+<a name="IDX650"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;medial&gt;</code>.
+Denotes a medial presentation form (Arabic).
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_FINAL</b>
+<a name="IDX651"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;final&gt;</code>.
+Denotes a final presentation form (Arabic).
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_ISOLATED</b>
+<a name="IDX652"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;isolated&gt;</code>.
+Denotes an isolated presentation form (Arabic).
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_CIRCLE</b>
+<a name="IDX653"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;circle&gt;</code>.
+Denotes an encircled form.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_SUPER</b>
+<a name="IDX654"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;super&gt;</code>.
+Denotes a superscript form.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_SUB</b>
+<a name="IDX655"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;sub&gt;</code>.
+Denotes a subscript form.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_VERTICAL</b>
+<a name="IDX656"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;vertical&gt;</code>.
+Denotes a vertical layout presentation form.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_WIDE</b>
+<a name="IDX657"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;wide&gt;</code>.
+Denotes a wide (or zenkaku) compatibility character.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_NARROW</b>
+<a name="IDX658"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;narrow&gt;</code>.
+Denotes a narrow (or hankaku) compatibility character.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_SMALL</b>
+<a name="IDX659"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;small&gt;</code>.
+Denotes a small variant form (CNS compatibility).
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_SQUARE</b>
+<a name="IDX660"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;square&gt;</code>.
+Denotes a CJK squared font variant.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_FRACTION</b>
+<a name="IDX661"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;fraction&gt;</code>.
+Denotes a vulgar fraction form.
+</p></dd></dl>
+
+<dl>
+<dt><u>Constant:</u> int <b>UC_DECOMP_COMPAT</b>
+<a name="IDX662"></a>
+</dt>
+<dd><p>UCD marker: <code>&lt;compat&gt;</code>.
+Denotes an otherwise unspecified compatibility character.
+</p></dd></dl>
+
+<p>The following constant denotes the maximum size of decomposition of a single
+Unicode character.
+</p>
+<dl>
+<dt><u>Macro:</u> unsigned int <b>UC_DECOMPOSITION_MAX_LENGTH</b>
+<a name="IDX663"></a>
+</dt>
+<dd><p>This macro expands to a constant that is the required size of buffer passed to
+the <code>uc_decomposition</code> and <code>uc_canonical_decomposition</code> functions.
+</p></dd></dl>
-<p>This include file declares functions for determining where in a string
-line breaks could or should be introduced, in order to make the displayed
-string fit into a column of given width.
+<p>The following functions decompose a Unicode character.
+</p>
+<dl>
+<dt><u>Function:</u> int <b>uc_decomposition</b><i> (ucs4_t <var>uc</var>, int *<var>decomp_tag</var>, ucs4_t *<var>decomposition</var>)</i>
+<a name="IDX664"></a>
+</dt>
+<dd><p>Returns the character decomposition mapping of the Unicode character <var>uc</var>.
+<var>decomposition</var> must point to an array of at least
+<code>UC_DECOMPOSITION_MAX_LENGTH</code> <code>ucs_t</code> elements.
</p>
-<p>These functions are locale dependent. The <var>encoding</var> argument identifies
-the encoding (e.g. <code>&quot;ISO-8859-2&quot;</code> for Polish).
+<p>When a decomposition exists, <code><var>decomposition</var>[0..<var>n</var>-1]</code> and
+<code>*<var>decomp_tag</var></code> are filled and <var>n</var> is returned. Otherwise -1 is
+returned.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> int <b>uc_canonical_decomposition</b><i> (ucs4_t <var>uc</var>, ucs4_t *<var>decomposition</var>)</i>
+<a name="IDX665"></a>
+</dt>
+<dd><p>Returns the canonical character decomposition mapping of the Unicode character
+<var>uc</var>. <var>decomposition</var> must point to an array of at least
+<code>UC_DECOMPOSITION_MAX_LENGTH</code> <code>ucs_t</code> elements.
</p>
-<p>The following enumerated values indicate whether, at a given position, a line
-break is possible or not. Given an string <var>s</var> as an array
-<code><var>s</var>[0..<var>n</var>-1]</code> and a position <var>i</var>, the values have the
-following meanings:
+<p>When a decomposition exists, <code><var>decomposition</var>[0..<var>n</var>-1]</code> is filled
+and <var>n</var> is returned. Otherwise -1 is returned.
+</p></dd></dl>
+
+<hr size="6">
+<a name="Composition-of-characters"></a>
+<a name="SEC44"></a>
+<h2 class="section"> <a href="libunistring.html#TOC44">12.2 Composition of Unicode characters</a> </h2>
+
+<p>The following function composes a Unicode character from two Unicode
+characters.
</p>
<dl>
-<dt><u>Constant:</u> int <b>UC_BREAK_MANDATORY</b>
-<a name="IDX754"></a>
+<dt><u>Function:</u> ucs4_t <b>uc_composition</b><i> (ucs4_t <var>uc1</var>, ucs4_t <var>uc2</var>)</i>
+<a name="IDX666"></a>
</dt>
-<dd><p>This value indicates that <code><var>s</var>[<var>i</var>]</code> is a line break character.
+<dd><p>Attempts to combine the Unicode characters <var>uc1</var>, <var>uc2</var>.
+<var>uc1</var> is known to have canonical combining class 0.
+</p>
+<p>Returns the combination of <var>uc1</var> and <var>uc2</var>, if it exists.
+Returns 0 otherwise.
+</p>
+<p>Not all decompositions can be recombined using this function. See the Unicode
+file &lsquo;<tt>CompositionExclusions.txt</tt>&rsquo; for details.
</p></dd></dl>
+<hr size="6">
+<a name="Normalization-of-strings"></a>
+<a name="SEC45"></a>
+<h2 class="section"> <a href="libunistring.html#TOC45">12.3 Normalization of strings</a> </h2>
+
+<p>The Unicode standard defines four normalization forms for Unicode strings.
+The following type is used to denote a normalization form.
+</p>
<dl>
-<dt><u>Constant:</u> int <b>UC_BREAK_POSSIBLE</b>
-<a name="IDX755"></a>
+<dt><u>Type:</u> <b>uninorm_t</b>
+<a name="IDX667"></a>
</dt>
-<dd><p>This value indicates that a line break may be inserted between
-<code><var>s</var>[<var>i</var>-1]</code> and <code><var>s</var>[<var>i</var>]</code>.
+<dd><p>An object of type <code>uninorm_t</code> denotes a Unicode normalization form.
+This is a scalar type; its values can be compared with <code>==</code>.
</p></dd></dl>
+<p>The following constants denote the four normalization forms.
+</p>
<dl>
-<dt><u>Constant:</u> int <b>UC_BREAK_HYPHENATION</b>
-<a name="IDX756"></a>
+<dt><u>Macro:</u> uninorm_t <b>UNINORM_NFD</b>
+<a name="IDX668"></a>
</dt>
-<dd><p>This value indicates that a hyphen and a line break may be inserted between
-<code><var>s</var>[<var>i</var>-1]</code> and <code><var>s</var>[<var>i</var>]</code>. But beware of language
-dependent hyphenation rules.
+<dd><p>Denotes Normalization form D: canonical decomposition.
</p></dd></dl>
<dl>
-<dt><u>Constant:</u> int <b>UC_BREAK_PROHIBITED</b>
-<a name="IDX757"></a>
+<dt><u>Macro:</u> uninorm_t <b>UNINORM_NFC</b>
+<a name="IDX669"></a>
</dt>
-<dd><p>This value indicates that <code><var>s</var>[<var>i</var>-1]</code> and <code><var>s</var>[<var>i</var>]</code>
-must not be separated.
+<dd><p>Normalization form C: canonical decomposition, then canonical composition.
</p></dd></dl>
<dl>
-<dt><u>Constant:</u> int <b>UC_BREAK_UNDEFINED</b>
-<a name="IDX758"></a>
+<dt><u>Macro:</u> uninorm_t <b>UNINORM_NFKD</b>
+<a name="IDX670"></a>
</dt>
-<dd><p>This value is not used as a return value; rather, in the overriding argument of
-the <code>u*_width_linebreaks</code> functions, it indicates the absence of an
-override.
+<dd><p>Normalization form KD: compatibility decomposition.
</p></dd></dl>
-<p>The following functions determine the positions at which line breaks are
-possible.
+<dl>
+<dt><u>Macro:</u> uninorm_t <b>UNINORM_NFKC</b>
+<a name="IDX671"></a>
+</dt>
+<dd><p>Normalization form KC: compatibility decomposition, then canonical composition.
+</p></dd></dl>
+
+<p>The following functions operate on <code>uninorm_t</code> objects.
</p>
<dl>
-<dt><u>Function:</u> void <b>u8_possible_linebreaks</b><i> (const uint8_t *<var>s</var>, size_t <var>n</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX759"></a>
+<dt><u>Function:</u> bool <b>uninorm_is_compat_decomposing</b><i> (uninorm_t <var>nf</var>)</i>
+<a name="IDX672"></a>
</dt>
-<dt><u>Function:</u> void <b>u16_possible_linebreaks</b><i> (const uint16_t *<var>s</var>, size_t <var>n</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX760"></a>
+<dd><p>Tests whether the normalization form <var>nf</var> does compatibility decomposition.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> bool <b>uninorm_is_composing</b><i> (uninorm_t <var>nf</var>)</i>
+<a name="IDX673"></a>
</dt>
-<dt><u>Function:</u> void <b>u32_possible_linebreaks</b><i> (const uint32_t *<var>s</var>, size_t <var>n</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX761"></a>
+<dd><p>Tests whether the normalization form <var>nf</var> includes canonical composition.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> uninorm_t <b>uninorm_decomposing_form</b><i> (uninorm_t <var>nf</var>)</i>
+<a name="IDX674"></a>
</dt>
-<dt><u>Function:</u> void <b>ulc_possible_linebreaks</b><i> (const char *<var>s</var>, size_t <var>n</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX762"></a>
+<dd><p>Returns the decomposing variant of the normalization form <var>nf</var>.
+This maps NFC,NFD → NFD and NFKC,NFKD → NFKD.
+</p></dd></dl>
+
+<p>The following functions apply a Unicode normalization form to a Unicode string.
+</p>
+<dl>
+<dt><u>Function:</u> uint8_t * <b>u8_normalize</b><i> (uninorm_t <var>nf</var>, const uint8_t *<var>s</var>, size_t <var>n</var>, uint8_t *<var>resultbuf</var>, size_t *<var>lengthp</var>)</i>
+<a name="IDX675"></a>
+</dt>
+<dt><u>Function:</u> uint16_t * <b>u16_normalize</b><i> (uninorm_t <var>nf</var>, const uint16_t *<var>s</var>, size_t <var>n</var>, uint16_t *<var>resultbuf</var>, size_t *<var>lengthp</var>)</i>
+<a name="IDX676"></a>
+</dt>
+<dt><u>Function:</u> uint32_t * <b>u32_normalize</b><i> (uninorm_t <var>nf</var>, const uint32_t *<var>s</var>, size_t <var>n</var>, uint32_t *<var>resultbuf</var>, size_t *<var>lengthp</var>)</i>
+<a name="IDX677"></a>
</dt>
-<dd><p>Determines the line break points in <var>s</var>, and stores the result at
-<code><var>p</var>[0..<var>n</var>-1]</code>. Every <code><var>p</var>[<var>i</var>]</code> is assigned one of
-the values <code>UC_BREAK_MANDATORY</code>, <code>UC_BREAK_POSSIBLE</code>,
-<code>UC_BREAK_HYPHENATION</code>, <code>UC_BREAK_PROHIBITED</code>.
+<dd><p>Returns the specified normalization form of a string.
</p></dd></dl>
-<p>The following functions determine where line breaks should be inserted so that
-each line fits in a given width, when output to a device that uses
-non-proportional fonts.
+<hr size="6">
+<a name="Normalizing-comparisons"></a>
+<a name="SEC46"></a>
+<h2 class="section"> <a href="libunistring.html#TOC46">12.4 Normalizing comparisons</a> </h2>
+
+<p>The following functions compare Unicode string, ignoring differences in
+normalization.
</p>
<dl>
-<dt><u>Function:</u> int <b>u8_width_linebreaks</b><i> (const uint8_t *<var>s</var>, size_t <var>n</var>, int <var>width</var>, int <var>start_column</var>, int <var>at_end_columns</var>, const char *<var>override</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX763"></a>
+<dt><u>Function:</u> int <b>u8_normcmp</b><i> (const uint8_t *<var>s1</var>, size_t <var>n1</var>, const uint8_t *<var>s2</var>, size_t <var>n2</var>, uninorm_t <var>nf</var>, int *<var>resultp</var>)</i>
+<a name="IDX678"></a>
</dt>
-<dt><u>Function:</u> int <b>u16_width_linebreaks</b><i> (const uint16_t *<var>s</var>, size_t <var>n</var>, int <var>width</var>, int <var>start_column</var>, int <var>at_end_columns</var>, const char *<var>override</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX764"></a>
+<dt><u>Function:</u> int <b>u16_normcmp</b><i> (const uint16_t *<var>s1</var>, size_t <var>n1</var>, const uint16_t *<var>s2</var>, size_t <var>n2</var>, uninorm_t <var>nf</var>, int *<var>resultp</var>)</i>
+<a name="IDX679"></a>
</dt>
-<dt><u>Function:</u> int <b>u32_width_linebreaks</b><i> (const uint32_t *<var>s</var>, size_t <var>n</var>, int <var>width</var>, int <var>start_column</var>, int <var>at_end_columns</var>, const char *<var>override</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX765"></a>
+<dt><u>Function:</u> int <b>u32_normcmp</b><i> (const uint32_t *<var>s1</var>, size_t <var>n1</var>, const uint32_t *<var>s2</var>, size_t <var>n2</var>, uninorm_t <var>nf</var>, int *<var>resultp</var>)</i>
+<a name="IDX680"></a>
</dt>
-<dt><u>Function:</u> int <b>ulc_width_linebreaks</b><i> (const char *<var>s</var>, size_t <var>n</var>, int <var>width</var>, int <var>start_column</var>, int <var>at_end_columns</var>, const char *<var>override</var>, const char *<var>encoding</var>, char *<var>p</var>)</i>
-<a name="IDX766"></a>
+<dd><p>Compares <var>s1</var> and <var>s2</var>, ignoring differences in normalization.
+</p>
+<p><var>nf</var> must be either <code>UNINORM_NFD</code> or <code>UNINORM_NFKD</code>.
+</p>
+<p>If successful, sets <code>*<var>resultp</var></code> to -1 if <var>s1</var> &lt; <var>s2</var>,
+0 if <var>s1</var> = <var>s2</var>, 1 if <var>s1</var> &gt; <var>s2</var>, and returns 0.
+Upon failure, returns -1 with <code>errno</code> set.
+</p></dd></dl>
+
+<a name="IDX681"></a>
+<a name="IDX682"></a>
+<dl>
+<dt><u>Function:</u> char * <b>u8_normxfrm</b><i> (const uint8_t *<var>s</var>, size_t <var>n</var>, uninorm_t <var>nf</var>, char *<var>resultbuf</var>, size_t *<var>lengthp</var>)</i>
+<a name="IDX683"></a>
+</dt>
+<dt><u>Function:</u> char * <b>u16_normxfrm</b><i> (const uint16_t *<var>s</var>, size_t <var>n</var>, uninorm_t <var>nf</var>, char *<var>resultbuf</var>, size_t *<var>lengthp</var>)</i>
+<a name="IDX684"></a>
+</dt>
+<dt><u>Function:</u> char * <b>u32_normxfrm</b><i> (const uint32_t *<var>s</var>, size_t <var>n</var>, uninorm_t <var>nf</var>, char *<var>resultbuf</var>, size_t *<var>lengthp</var>)</i>
+<a name="IDX685"></a>
+</dt>
+<dd><p>Converts the string <var>s</var> of length <var>n</var> to a NUL-terminated byte
+sequence, in such a way that comparing <code>u8_normxfrm (<var>s1</var>)</code> and
+<code>u8_normxfrm (<var>s2</var>)</code> with the <code>u8_cmp2</code> function is equivalent to
+comparing <var>s1</var> and <var>s2</var> with the <code>u8_normcoll</code> function.
+</p>
+<p><var>nf</var> must be either <code>UNINORM_NFC</code> or <code>UNINORM_NFKC</code>.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> int <b>u8_normcoll</b><i> (const uint8_t *<var>s1</var>, size_t <var>n1</var>, const uint8_t *<var>s2</var>, size_t <var>n2</var>, uninorm_t <var>nf</var>, int *<var>resultp</var>)</i>
+<a name="IDX686"></a>
+</dt>
+<dt><u>Function:</u> int <b>u16_normcoll</b><i> (const uint16_t *<var>s1</var>, size_t <var>n1</var>, const uint16_t *<var>s2</var>, size_t <var>n2</var>, uninorm_t <var>nf</var>, int *<var>resultp</var>)</i>
+<a name="IDX687"></a>
+</dt>
+<dt><u>Function:</u> int <b>u32_normcoll</b><i> (const uint32_t *<var>s1</var>, size_t <var>n1</var>, const uint32_t *<var>s2</var>, size_t <var>n2</var>, uninorm_t <var>nf</var>, int *<var>resultp</var>)</i>
+<a name="IDX688"></a>
+</dt>
+<dd><p>Compares <var>s1</var> and <var>s2</var>, ignoring differences in normalization, using
+the collation rules of the current locale.
+</p>
+<p><var>nf</var> must be either <code>UNINORM_NFC</code> or <code>UNINORM_NFKC</code>.
+</p>
+<p>If successful, sets <code>*<var>resultp</var></code> to -1 if <var>s1</var> &lt; <var>s2</var>,
+0 if <var>s1</var> = <var>s2</var>, 1 if <var>s1</var> &gt; <var>s2</var>, and returns 0.
+Upon failure, returns -1 with <code>errno</code> set.
+</p></dd></dl>
+
+<hr size="6">
+<a name="Normalization-of-streams"></a>
+<a name="SEC47"></a>
+<h2 class="section"> <a href="libunistring.html#TOC47">12.5 Normalization of streams of Unicode characters</a> </h2>
+
+<p>A &ldquo;stream of Unicode characters&rdquo; is essentially a function that accepts an
+<code>ucs4_t</code> argument repeatedly, optionally combined with a function that
+&ldquo;flushes&rdquo; the stream.
+</p>
+<dl>
+<dt><u>Type:</u> <b>struct uninorm_filter</b>
+<a name="IDX689"></a>
+</dt>
+<dd><p>This is the data type of a stream of Unicode characters that normalizes its
+input according to a given normalization form and passes the normalized
+character sequence to the encapsulated stream of Unicode characters.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> struct uninorm_filter * <b>uninorm_filter_create</b><i> (uninorm_t <var>nf</var>, int (*<var>stream_func</var>) (void *<var>stream_data</var>, ucs4_t <var>uc</var>), void *<var>stream_data</var>)</i>
+<a name="IDX690"></a>
</dt>
-<dd><p>Chooses the best line breaks, assuming that every character occupies a width
-given by the <code>uc_width</code> function (see <a href="libunistring_9.html#SEC40">Display width <code>&lt;uniwidth.h&gt;</code></a>).
+<dd><p>Creates and returns a normalization filter for Unicode characters.
</p>
-<p>The string is <code><var>s</var>[0..<var>n</var>-1]</code>.
+<p>The pair (<var>stream_func</var>, <var>stream_data</var>) is the encapsulated stream.
+<code><var>stream_func</var> (<var>stream_data</var>, <var>uc</var>)</code> receives the Unicode
+character <var>uc</var> and returns 0 if successful, or -1 with <code>errno</code> set
+upon failure.
</p>
-<p>The maximum number of columns per line is given as <var>width</var>.
-The starting column of the string is given as <var>start_column</var>.
-If the algorithm shall keep room after the last piece, this amount of room can
-be given as <var>at_end_columns</var>.
+<p>Returns the new filter, or NULL with <code>errno</code> set upon failure.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> int <b>uninorm_filter_write</b><i> (struct uninorm_filter *<var>filter</var>, ucs4_t <var>uc</var>)</i>
+<a name="IDX691"></a>
+</dt>
+<dd><p>Stuffs a Unicode character into a normalizing filter.
+Returns 0 if successful, or -1 with <code>errno</code> set upon failure.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> int <b>uninorm_filter_flush</b><i> (struct uninorm_filter *<var>filter</var>)</i>
+<a name="IDX692"></a>
+</dt>
+<dd><p>Brings data buffered in the filter to its destination, the encapsulated stream.
</p>
-<p><var>override</var> is an optional override; if
-<code><var>override</var>[<var>i</var>] != UC_BREAK_UNDEFINED</code>,
-<code><var>override</var>[<var>i</var>]</code> takes precedence over <code><var>p</var>[<var>i</var>]</code>
-as returned by the <code>u*_possible_linebreaks</code> function.
+<p>Returns 0 if successful, or -1 with <code>errno</code> set upon failure.
</p>
-<p>The given <var>encoding</var> is used for disambiguating widths in <code>uc_width</code>.
+<p>Note! If after calling this function, additional characters are written
+into the filter, the resulting character sequence in the encapsulated stream
+will not necessarily be normalized.
+</p></dd></dl>
+
+<dl>
+<dt><u>Function:</u> int <b>uninorm_filter_free</b><i> (struct uninorm_filter *<var>filter</var>)</i>
+<a name="IDX693"></a>
+</dt>
+<dd><p>Brings data buffered in the filter to its destination, the encapsulated stream,
+then closes and frees the filter.
</p>
-<p>Returns the column after the end of the string, and stores the result at
-<code><var>p</var>[0..<var>n</var>-1]</code>. Every <code><var>p</var>[<var>i</var>]</code> is assigned one of
-the values <code>UC_BREAK_MANDATORY</code>, <code>UC_BREAK_POSSIBLE</code>,
-<code>UC_BREAK_HYPHENATION</code>, <code>UC_BREAK_PROHIBITED</code>. Here the value
-<code>UC_BREAK_POSSIBLE</code> indicates that a line break <em>should</em> be inserted.
+<p>Returns 0 if successful, or -1 with <code>errno</code> set upon failure.
</p></dd></dl>
<hr size="6">
<table cellpadding="1" cellspacing="1" border="0">
-<tr><td valign="middle" align="left">[<a href="libunistring_11.html#SEC44" title="Beginning of this chapter or previous chapter"> &lt;&lt; </a>]</td>
+<tr><td valign="middle" align="left">[<a href="#SEC42" title="Beginning of this chapter or previous chapter"> &lt;&lt; </a>]</td>
<td valign="middle" align="left">[<a href="libunistring_13.html#SEC48" title="Next chapter"> &gt;&gt; </a>]</td>
<td valign="middle" align="left"> &nbsp; </td>
<td valign="middle" align="left"> &nbsp; </td>
@@ -186,12 +493,12 @@ the values <code>UC_BREAK_MANDATORY</code>, <code>UC_BREAK_POSSIBLE</code>,
<td valign="middle" align="left"> &nbsp; </td>
<td valign="middle" align="left">[<a href="libunistring.html#SEC_Top" title="Cover (top) of document">Top</a>]</td>
<td valign="middle" align="left">[<a href="libunistring.html#SEC_Contents" title="Table of contents">Contents</a>]</td>
-<td valign="middle" align="left">[<a href="libunistring_19.html#SEC77" title="Index">Index</a>]</td>
+<td valign="middle" align="left">[<a href="libunistring_18.html#SEC71" title="Index">Index</a>]</td>
<td valign="middle" align="left">[<a href="libunistring_abt.html#SEC_About" title="About (help)"> ? </a>]</td>
</tr></table>
<p>
<font size="-1">
- This document was generated by <em>Daiki Ueno</em> on <em>July, 8 2015</em> using <a href="http://www.nongnu.org/texi2html/"><em>texi2html 1.78a</em></a>.
+ This document was generated by <em>Bruno Haible</em> on <em>March, 30 2010</em> using <a href="http://www.nongnu.org/texi2html/"><em>texi2html 1.78a</em></a>.
</font>
<br>