summary | refs | log | tree | commit | diff
path: root/lib/uniwbrk
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uniwbrk')
-rw-r--r--  lib/uniwbrk/u-wordbreaks.h  166
-rw-r--r--  lib/uniwbrk/u16-wordbreaks.c  2
-rw-r--r--  lib/uniwbrk/u32-wordbreaks.c  2
-rw-r--r--  lib/uniwbrk/u8-wordbreaks.c  68
-rw-r--r--  lib/uniwbrk/ulc-wordbreaks.c  168
-rw-r--r--  lib/uniwbrk/wbrkprop.h  2
-rw-r--r--  lib/uniwbrk/wbrktable.c  22
-rw-r--r--  lib/uniwbrk/wbrktable.h  2
-rw-r--r--  lib/uniwbrk/wordbreak-property.c  20
9 files changed, 226 insertions, 226 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 5ef4e8c1..b0fd301e 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -1,5 +1,5 @@
/* Word breaks in UTF-8/UTF-16/UTF-32 strings.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
@@ -23,105 +23,105 @@ FUNC (const UNIT *s, size_t n, char *p)
const UNIT *s_end = s + n;
/* Word break property of the last character.
- -1 at the very beginning of the string. */
+ -1 at the very beginning of the string. */
int last_char_prop = -1;
/* Format and Extend characters are ignored; this means, the mostly used
- unit is the complex character (= character with subsequent ignored
- characters).
- Word break property of the last complex character.
- -1 at the very beginning of the string. */
+ unit is the complex character (= character with subsequent ignored
+ characters).
+ Word break property of the last complex character.
+ -1 at the very beginning of the string. */
int last_compchar_prop = -1;
char *last_compchar_ptr = NULL;
/* For recognizing rules involving 3 complex characters:
- Word break property of the second-to-last complex character.
- -1 at the very beginning of the string. */
+ Word break property of the second-to-last complex character.
+ -1 at the very beginning of the string. */
int secondlast_compchar_prop = -1;
/* Don't break inside multibyte characters. */
memset (p, 0, n);
while (s < s_end)
- {
- ucs4_t uc;
- int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
- int prop = uc_wordbreak_property (uc);
+ {
+ ucs4_t uc;
+ int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
+ int prop = uc_wordbreak_property (uc);
- /* No break at the start of the string. */
- if (last_char_prop >= 0)
- {
- /* No break between CR and LF. */
- if (last_char_prop == WBP_CR && prop == WBP_LF)
- /* *p = 0 */;
- /* Break before and after newlines. */
- else if (last_char_prop >= WBP_NEWLINE
- /* same as:
- last_char_prop == WBP_CR
- || last_char_prop == WBP_LF
- || last_char_prop == WBP_NEWLINE */
- || prop >= WBP_NEWLINE
- /* same as:
- prop == WBP_CR
- || prop == WBP_LF
- || prop == WBP_NEWLINE */)
- *p = 1;
- /* Ignore Format and Extend characters. */
- else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
- {
- /* No break in these situations (see UAX #29):
+ /* No break at the start of the string. */
+ if (last_char_prop >= 0)
+ {
+ /* No break between CR and LF. */
+ if (last_char_prop == WBP_CR && prop == WBP_LF)
+ /* *p = 0 */;
+ /* Break before and after newlines. */
+ else if (last_char_prop >= WBP_NEWLINE
+ /* same as:
+ last_char_prop == WBP_CR
+ || last_char_prop == WBP_LF
+ || last_char_prop == WBP_NEWLINE */
+ || prop >= WBP_NEWLINE
+ /* same as:
+ prop == WBP_CR
+ || prop == WBP_LF
+ || prop == WBP_NEWLINE */)
+ *p = 1;
+ /* Ignore Format and Extend characters. */
+ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ {
+ /* No break in these situations (see UAX #29):
- secondlast last current
+ secondlast last current
- ALetter (MidLetter | MidNumLet) × ALetter (WB7)
- ALetter × (MidLetter | MidNumLet) ALetter (WB6)
- Numeric (MidNum | MidNumLet) × Numeric (WB11)
- Numeric × (MidNum | MidNumLet) Numeric (WB12)
- ALetter × ALetter (WB5)
- ALetter × Numeric (WB9)
- Numeric × ALetter (WB10)
- Numeric × Numeric (WB8)
- Katakana × Katakana (WB13)
- (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
- ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
- */
- /* No break across certain punctuation. Also, disable word
- breaks that were recognized earlier (due to lookahead of
- only one complex character). */
- if ((prop == WBP_ALETTER
- && (last_compchar_prop == WBP_MIDLETTER
- || last_compchar_prop == WBP_MIDNUMLET)
- && secondlast_compchar_prop == WBP_ALETTER)
- || (prop == WBP_NUMERIC
- && (last_compchar_prop == WBP_MIDNUM
- || last_compchar_prop == WBP_MIDNUMLET)
- && secondlast_compchar_prop == WBP_NUMERIC))
- {
- *last_compchar_ptr = 0;
- /* *p = 0; */
- }
- else
- {
- /* Perform a single table lookup. */
- if (uniwbrk_table[last_compchar_prop][prop])
- *p = 1;
- /* else *p = 0; */
- }
- }
- }
+ ALetter (MidLetter | MidNumLet) × ALetter (WB7)
+ ALetter × (MidLetter | MidNumLet) ALetter (WB6)
+ Numeric (MidNum | MidNumLet) × Numeric (WB11)
+ Numeric × (MidNum | MidNumLet) Numeric (WB12)
+ ALetter × ALetter (WB5)
+ ALetter × Numeric (WB9)
+ Numeric × ALetter (WB10)
+ Numeric × Numeric (WB8)
+ Katakana × Katakana (WB13)
+ (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
+ ExtendNumLet × ExtendNumLet (WB13a)
+ ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ */
+ /* No break across certain punctuation. Also, disable word
+ breaks that were recognized earlier (due to lookahead of
+ only one complex character). */
+ if ((prop == WBP_ALETTER
+ && (last_compchar_prop == WBP_MIDLETTER
+ || last_compchar_prop == WBP_MIDNUMLET)
+ && secondlast_compchar_prop == WBP_ALETTER)
+ || (prop == WBP_NUMERIC
+ && (last_compchar_prop == WBP_MIDNUM
+ || last_compchar_prop == WBP_MIDNUMLET)
+ && secondlast_compchar_prop == WBP_NUMERIC))
+ {
+ *last_compchar_ptr = 0;
+ /* *p = 0; */
+ }
+ else
+ {
+ /* Perform a single table lookup. */
+ if (uniwbrk_table[last_compchar_prop][prop])
+ *p = 1;
+ /* else *p = 0; */
+ }
+ }
+ }
- last_char_prop = prop;
- /* Ignore Format and Extend characters, except at the start of the string. */
- if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
- {
- secondlast_compchar_prop = last_compchar_prop;
- last_compchar_prop = prop;
- last_compchar_ptr = p;
- }
+ last_char_prop = prop;
+ /* Ignore Format and Extend characters, except at the start of the string. */
+ if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ {
+ secondlast_compchar_prop = last_compchar_prop;
+ last_compchar_prop = prop;
+ last_compchar_ptr = p;
+ }
- s += count;
- p += count;
- }
+ s += count;
+ p += count;
+ }
}
}
diff --git a/lib/uniwbrk/u16-wordbreaks.c b/lib/uniwbrk/u16-wordbreaks.c
index 3398fd3a..ea2a53d2 100644
--- a/lib/uniwbrk/u16-wordbreaks.c
+++ b/lib/uniwbrk/u16-wordbreaks.c
@@ -1,5 +1,5 @@
/* Word breaks in UTF-16 strings.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
diff --git a/lib/uniwbrk/u32-wordbreaks.c b/lib/uniwbrk/u32-wordbreaks.c
index 6763fb9e..86a26160 100644
--- a/lib/uniwbrk/u32-wordbreaks.c
+++ b/lib/uniwbrk/u32-wordbreaks.c
@@ -1,5 +1,5 @@
/* Word breaks in UTF-32 strings.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
diff --git a/lib/uniwbrk/u8-wordbreaks.c b/lib/uniwbrk/u8-wordbreaks.c
index 59d2076d..c7edfe9e 100644
--- a/lib/uniwbrk/u8-wordbreaks.c
+++ b/lib/uniwbrk/u8-wordbreaks.c
@@ -1,5 +1,5 @@
/* Word breaks in UTF-8 strings.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
@@ -50,28 +50,28 @@ read_file (FILE *stream)
while (! feof (stream))
{
if (size + BUFSIZE > alloc)
- {
- alloc = alloc + alloc / 2;
- if (alloc < size + BUFSIZE)
- alloc = size + BUFSIZE;
- buf = realloc (buf, alloc);
- if (buf == NULL)
- {
- fprintf (stderr, "out of memory\n");
- exit (1);
- }
- }
+ {
+ alloc = alloc + alloc / 2;
+ if (alloc < size + BUFSIZE)
+ alloc = size + BUFSIZE;
+ buf = realloc (buf, alloc);
+ if (buf == NULL)
+ {
+ fprintf (stderr, "out of memory\n");
+ exit (1);
+ }
+ }
count = fread (buf + size, 1, BUFSIZE, stream);
if (count == 0)
- {
- if (ferror (stream))
- {
- perror ("fread");
- exit (1);
- }
- }
+ {
+ if (ferror (stream))
+ {
+ perror ("fread");
+ exit (1);
+ }
+ }
else
- size += count;
+ size += count;
}
buf = realloc (buf, size + 1);
if (buf == NULL)
@@ -98,20 +98,20 @@ main (int argc, char * argv[])
u8_wordbreaks ((uint8_t *) input, length, breaks);
for (i = 0; i < length; i++)
- {
- switch (breaks[i])
- {
- case 1:
- /* U+2027 in UTF-8 encoding */
- putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
- break;
- case 0:
- break;
- default:
- abort ();
- }
- putc (input[i], stdout);
- }
+ {
+ switch (breaks[i])
+ {
+ case 1:
+ /* U+2027 in UTF-8 encoding */
+ putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
+ break;
+ case 0:
+ break;
+ default:
+ abort ();
+ }
+ putc (input[i], stdout);
+ }
free (breaks);
diff --git a/lib/uniwbrk/ulc-wordbreaks.c b/lib/uniwbrk/ulc-wordbreaks.c
index cb6e131c..6e17026b 100644
--- a/lib/uniwbrk/ulc-wordbreaks.c
+++ b/lib/uniwbrk/ulc-wordbreaks.c
@@ -1,5 +1,5 @@
/* Word breaks in strings.
- Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
@@ -49,60 +49,60 @@ ulc_wordbreaks (const char *s, size_t n, char *p)
const char *encoding = locale_charset ();
if (is_utf8_encoding (encoding))
- u8_wordbreaks ((const uint8_t *) s, n, p);
+ u8_wordbreaks ((const uint8_t *) s, n, p);
else
- {
- /* Convert the string to UTF-8 and build a translation table
- from offsets into s to offsets into the translated string. */
- size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
-
- if (offsets != NULL)
- {
- uint8_t *t;
- size_t m;
-
- t = u8_conv_from_encoding (encoding, iconveh_question_mark,
- s, n, offsets, NULL, &m);
- if (t != NULL)
- {
- char *q = (char *) (m > 0 ? malloc (m) : NULL);
-
- if (m == 0 || q != NULL)
- {
- size_t i;
-
- /* Determine the word breaks of the UTF-8 string. */
- u8_wordbreaks (t, m, q);
-
- /* Translate the result back to the original string. */
- memset (p, 0, n);
- for (i = 0; i < n; i++)
- if (offsets[i] != (size_t)(-1))
- p[i] = q[offsets[i]];
-
- free (q);
- free (t);
- free (offsets);
- return;
- }
- free (t);
- }
- free (offsets);
- }
-
- /* Impossible to convert. */
+ {
+ /* Convert the string to UTF-8 and build a translation table
+ from offsets into s to offsets into the translated string. */
+ size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
+
+ if (offsets != NULL)
+ {
+ uint8_t *t;
+ size_t m;
+
+ t = u8_conv_from_encoding (encoding, iconveh_question_mark,
+ s, n, offsets, NULL, &m);
+ if (t != NULL)
+ {
+ char *q = (char *) (m > 0 ? malloc (m) : NULL);
+
+ if (m == 0 || q != NULL)
+ {
+ size_t i;
+
+ /* Determine the word breaks of the UTF-8 string. */
+ u8_wordbreaks (t, m, q);
+
+ /* Translate the result back to the original string. */
+ memset (p, 0, n);
+ for (i = 0; i < n; i++)
+ if (offsets[i] != (size_t)(-1))
+ p[i] = q[offsets[i]];
+
+ free (q);
+ free (t);
+ free (offsets);
+ return;
+ }
+ free (t);
+ }
+ free (offsets);
+ }
+
+ /* Impossible to convert. */
#if C_CTYPE_ASCII
- if (is_all_ascii (s, n))
- {
- /* ASCII is a subset of UTF-8. */
- u8_wordbreaks ((const uint8_t *) s, n, p);
- return;
- }
+ if (is_all_ascii (s, n))
+ {
+ /* ASCII is a subset of UTF-8. */
+ u8_wordbreaks ((const uint8_t *) s, n, p);
+ return;
+ }
#endif
- /* We have a non-ASCII string and cannot convert it.
- Don't produce any word breaks. */
- memset (p, 0, n);
- }
+ /* We have a non-ASCII string and cannot convert it.
+ Don't produce any word breaks. */
+ memset (p, 0, n);
+ }
}
}
@@ -127,28 +127,28 @@ read_file (FILE *stream)
while (! feof (stream))
{
if (size + BUFSIZE > alloc)
- {
- alloc = alloc + alloc / 2;
- if (alloc < size + BUFSIZE)
- alloc = size + BUFSIZE;
- buf = realloc (buf, alloc);
- if (buf == NULL)
- {
- fprintf (stderr, "out of memory\n");
- exit (1);
- }
- }
+ {
+ alloc = alloc + alloc / 2;
+ if (alloc < size + BUFSIZE)
+ alloc = size + BUFSIZE;
+ buf = realloc (buf, alloc);
+ if (buf == NULL)
+ {
+ fprintf (stderr, "out of memory\n");
+ exit (1);
+ }
+ }
count = fread (buf + size, 1, BUFSIZE, stream);
if (count == 0)
- {
- if (ferror (stream))
- {
- perror ("fread");
- exit (1);
- }
- }
+ {
+ if (ferror (stream))
+ {
+ perror ("fread");
+ exit (1);
+ }
+ }
else
- size += count;
+ size += count;
}
buf = realloc (buf, size + 1);
if (buf == NULL)
@@ -176,19 +176,19 @@ main (int argc, char * argv[])
ulc_wordbreaks (input, length, breaks);
for (i = 0; i < length; i++)
- {
- switch (breaks[i])
- {
- case 1:
- putc ('|', stdout);
- break;
- case 0:
- break;
- default:
- abort ();
- }
- putc (input[i], stdout);
- }
+ {
+ switch (breaks[i])
+ {
+ case 1:
+ putc ('|', stdout);
+ break;
+ case 0:
+ break;
+ default:
+ abort ();
+ }
+ putc (input[i], stdout);
+ }
free (breaks);
diff --git a/lib/uniwbrk/wbrkprop.h b/lib/uniwbrk/wbrkprop.h
index 3b50e17e..77fd61de 100644
--- a/lib/uniwbrk/wbrkprop.h
+++ b/lib/uniwbrk/wbrkprop.h
@@ -2,7 +2,7 @@
/* Line breaking properties of Unicode characters. */
/* Generated automatically by gen-uni-tables for Unicode 5.1.0. */
-/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
+/* Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 81a2323e..ff25fb31 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -1,5 +1,5 @@
/* Word break auxiliary table.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
@@ -22,22 +22,22 @@
/* This table contains the following rules (see UAX #29):
- last current
+ last current
- ALetter × ALetter (WB5)
- ALetter × Numeric (WB9)
- Numeric × ALetter (WB10)
- Numeric × Numeric (WB8)
- Katakana × Katakana (WB13)
+ ALetter × ALetter (WB5)
+ ALetter × Numeric (WB9)
+ Numeric × ALetter (WB10)
+ Numeric × Numeric (WB8)
+ Katakana × Katakana (WB13)
(ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
- ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ ExtendNumLet × ExtendNumLet (WB13a)
+ ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
*/
const unsigned char uniwbrk_table[10][8] =
{ /* current: OTHER MIDNUMLET NUMERIC */
- /* KATAKANA MIDLETTER EXTENDNUMLET */
- /* ALETTER MIDNUM */
+ /* KATAKANA MIDLETTER EXTENDNUMLET */
+ /* ALETTER MIDNUM */
/* last */
/* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1 },
/* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0 },
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 14efee90..8a13378b 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -1,5 +1,5 @@
/* Word break auxiliary table.
- Copyright (C) 2009 Free Software Foundation, Inc.
+ Copyright (C) 2009-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
diff --git a/lib/uniwbrk/wordbreak-property.c b/lib/uniwbrk/wordbreak-property.c
index 4d0a212d..9d98b0b5 100644
--- a/lib/uniwbrk/wordbreak-property.c
+++ b/lib/uniwbrk/wordbreak-property.c
@@ -1,5 +1,5 @@
/* Word break property.
- Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc.
+ Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or modify it
@@ -30,15 +30,15 @@ uc_wordbreak_property (ucs4_t uc)
{
int lookup1 = uniwbrkprop.level1[index1];
if (lookup1 >= 0)
- {
- unsigned int index2 = (uc >> wbrkprop_header_2) & wbrkprop_header_3;
- int lookup2 = uniwbrkprop.level2[lookup1 + index2];
- if (lookup2 >= 0)
- {
- unsigned int index3 = uc & wbrkprop_header_4;
- return uniwbrkprop.level3[lookup2 + index3];
- }
- }
+ {
+ unsigned int index2 = (uc >> wbrkprop_header_2) & wbrkprop_header_3;
+ int lookup2 = uniwbrkprop.level2[lookup1 + index2];
+ if (lookup2 >= 0)
+ {
+ unsigned int index3 = uc & wbrkprop_header_4;
+ return uniwbrkprop.level3[lookup2 + index3];
+ }
+ }
}
return WBP_OTHER;
}