diff options
Diffstat (limited to 'lib/uniwbrk')
-rw-r--r-- | lib/uniwbrk/u-wordbreaks.h | 166 | ||||
-rw-r--r-- | lib/uniwbrk/u16-wordbreaks.c | 2 | ||||
-rw-r--r-- | lib/uniwbrk/u32-wordbreaks.c | 2 | ||||
-rw-r--r-- | lib/uniwbrk/u8-wordbreaks.c | 68 | ||||
-rw-r--r-- | lib/uniwbrk/ulc-wordbreaks.c | 168 | ||||
-rw-r--r-- | lib/uniwbrk/wbrkprop.h | 2 | ||||
-rw-r--r-- | lib/uniwbrk/wbrktable.c | 22 | ||||
-rw-r--r-- | lib/uniwbrk/wbrktable.h | 2 | ||||
-rw-r--r-- | lib/uniwbrk/wordbreak-property.c | 20 |
9 files changed, 226 insertions, 226 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index 5ef4e8c1..b0fd301e 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -1,5 +1,5 @@ /* Word breaks in UTF-8/UTF-16/UTF-32 strings. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it @@ -23,105 +23,105 @@ FUNC (const UNIT *s, size_t n, char *p) const UNIT *s_end = s + n; /* Word break property of the last character. - -1 at the very beginning of the string. */ + -1 at the very beginning of the string. */ int last_char_prop = -1; /* Format and Extend characters are ignored; this means, the mostly used - unit is the complex character (= character with subsequent ignored - characters). - Word break property of the last complex character. - -1 at the very beginning of the string. */ + unit is the complex character (= character with subsequent ignored + characters). + Word break property of the last complex character. + -1 at the very beginning of the string. */ int last_compchar_prop = -1; char *last_compchar_ptr = NULL; /* For recognizing rules involving 3 complex characters: - Word break property of the second-to-last complex character. - -1 at the very beginning of the string. */ + Word break property of the second-to-last complex character. + -1 at the very beginning of the string. */ int secondlast_compchar_prop = -1; /* Don't break inside multibyte characters. */ memset (p, 0, n); while (s < s_end) - { - ucs4_t uc; - int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); - int prop = uc_wordbreak_property (uc); + { + ucs4_t uc; + int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); + int prop = uc_wordbreak_property (uc); - /* No break at the start of the string. */ - if (last_char_prop >= 0) - { - /* No break between CR and LF. */ - if (last_char_prop == WBP_CR && prop == WBP_LF) - /* *p = 0 */; - /* Break before and after newlines. */ - else if (last_char_prop >= WBP_NEWLINE - /* same as: - last_char_prop == WBP_CR - || last_char_prop == WBP_LF - || last_char_prop == WBP_NEWLINE */ - || prop >= WBP_NEWLINE - /* same as: - prop == WBP_CR - || prop == WBP_LF - || prop == WBP_NEWLINE */) - *p = 1; - /* Ignore Format and Extend characters. */ - else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) - { - /* No break in these situations (see UAX #29): + /* No break at the start of the string. */ + if (last_char_prop >= 0) + { + /* No break between CR and LF. */ + if (last_char_prop == WBP_CR && prop == WBP_LF) + /* *p = 0 */; + /* Break before and after newlines. */ + else if (last_char_prop >= WBP_NEWLINE + /* same as: + last_char_prop == WBP_CR + || last_char_prop == WBP_LF + || last_char_prop == WBP_NEWLINE */ + || prop >= WBP_NEWLINE + /* same as: + prop == WBP_CR + || prop == WBP_LF + || prop == WBP_NEWLINE */) + *p = 1; + /* Ignore Format and Extend characters. */ + else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) + { + /* No break in these situations (see UAX #29): - secondlast last current + secondlast last current - ALetter (MidLetter | MidNumLet) × ALetter (WB7) - ALetter × (MidLetter | MidNumLet) ALetter (WB6) - Numeric (MidNum | MidNumLet) × Numeric (WB11) - Numeric × (MidNum | MidNumLet) Numeric (WB12) - ALetter × ALetter (WB5) - ALetter × Numeric (WB9) - Numeric × ALetter (WB10) - Numeric × Numeric (WB8) - Katakana × Katakana (WB13) - (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) - ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) - */ - /* No break across certain punctuation. Also, disable word - breaks that were recognized earlier (due to lookahead of - only one complex character). */ - if ((prop == WBP_ALETTER - && (last_compchar_prop == WBP_MIDLETTER - || last_compchar_prop == WBP_MIDNUMLET) - && secondlast_compchar_prop == WBP_ALETTER) - || (prop == WBP_NUMERIC - && (last_compchar_prop == WBP_MIDNUM - || last_compchar_prop == WBP_MIDNUMLET) - && secondlast_compchar_prop == WBP_NUMERIC)) - { - *last_compchar_ptr = 0; - /* *p = 0; */ - } - else - { - /* Perform a single table lookup. */ - if (uniwbrk_table[last_compchar_prop][prop]) - *p = 1; - /* else *p = 0; */ - } - } - } + ALetter (MidLetter | MidNumLet) × ALetter (WB7) + ALetter × (MidLetter | MidNumLet) ALetter (WB6) + Numeric (MidNum | MidNumLet) × Numeric (WB11) + Numeric × (MidNum | MidNumLet) Numeric (WB12) + ALetter × ALetter (WB5) + ALetter × Numeric (WB9) + Numeric × ALetter (WB10) + Numeric × Numeric (WB8) + Katakana × Katakana (WB13) + (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) + ExtendNumLet × ExtendNumLet (WB13a) + ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + */ + /* No break across certain punctuation. Also, disable word + breaks that were recognized earlier (due to lookahead of + only one complex character). */ + if ((prop == WBP_ALETTER + && (last_compchar_prop == WBP_MIDLETTER + || last_compchar_prop == WBP_MIDNUMLET) + && secondlast_compchar_prop == WBP_ALETTER) + || (prop == WBP_NUMERIC + && (last_compchar_prop == WBP_MIDNUM + || last_compchar_prop == WBP_MIDNUMLET) + && secondlast_compchar_prop == WBP_NUMERIC)) + { + *last_compchar_ptr = 0; + /* *p = 0; */ + } + else + { + /* Perform a single table lookup. */ + if (uniwbrk_table[last_compchar_prop][prop]) + *p = 1; + /* else *p = 0; */ + } + } + } - last_char_prop = prop; - /* Ignore Format and Extend characters, except at the start of the string. */ - if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) - { - secondlast_compchar_prop = last_compchar_prop; - last_compchar_prop = prop; - last_compchar_ptr = p; - } + last_char_prop = prop; + /* Ignore Format and Extend characters, except at the start of the string. */ + if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) + { + secondlast_compchar_prop = last_compchar_prop; + last_compchar_prop = prop; + last_compchar_ptr = p; + } - s += count; - p += count; - } + s += count; + p += count; + } } } diff --git a/lib/uniwbrk/u16-wordbreaks.c b/lib/uniwbrk/u16-wordbreaks.c index 3398fd3a..ea2a53d2 100644 --- a/lib/uniwbrk/u16-wordbreaks.c +++ b/lib/uniwbrk/u16-wordbreaks.c @@ -1,5 +1,5 @@ /* Word breaks in UTF-16 strings. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it diff --git a/lib/uniwbrk/u32-wordbreaks.c b/lib/uniwbrk/u32-wordbreaks.c index 6763fb9e..86a26160 100644 --- a/lib/uniwbrk/u32-wordbreaks.c +++ b/lib/uniwbrk/u32-wordbreaks.c @@ -1,5 +1,5 @@ /* Word breaks in UTF-32 strings. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it diff --git a/lib/uniwbrk/u8-wordbreaks.c b/lib/uniwbrk/u8-wordbreaks.c index 59d2076d..c7edfe9e 100644 --- a/lib/uniwbrk/u8-wordbreaks.c +++ b/lib/uniwbrk/u8-wordbreaks.c @@ -1,5 +1,5 @@ /* Word breaks in UTF-8 strings. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it @@ -50,28 +50,28 @@ read_file (FILE *stream) while (! feof (stream)) { if (size + BUFSIZE > alloc) - { - alloc = alloc + alloc / 2; - if (alloc < size + BUFSIZE) - alloc = size + BUFSIZE; - buf = realloc (buf, alloc); - if (buf == NULL) - { - fprintf (stderr, "out of memory\n"); - exit (1); - } - } + { + alloc = alloc + alloc / 2; + if (alloc < size + BUFSIZE) + alloc = size + BUFSIZE; + buf = realloc (buf, alloc); + if (buf == NULL) + { + fprintf (stderr, "out of memory\n"); + exit (1); + } + } count = fread (buf + size, 1, BUFSIZE, stream); if (count == 0) - { - if (ferror (stream)) - { - perror ("fread"); - exit (1); - } - } + { + if (ferror (stream)) + { + perror ("fread"); + exit (1); + } + } else - size += count; + size += count; } buf = realloc (buf, size + 1); if (buf == NULL) @@ -98,20 +98,20 @@ main (int argc, char * argv[]) u8_wordbreaks ((uint8_t *) input, length, breaks); for (i = 0; i < length; i++) - { - switch (breaks[i]) - { - case 1: - /* U+2027 in UTF-8 encoding */ - putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); - break; - case 0: - break; - default: - abort (); - } - putc (input[i], stdout); - } + { + switch (breaks[i]) + { + case 1: + /* U+2027 in UTF-8 encoding */ + putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); + break; + case 0: + break; + default: + abort (); + } + putc (input[i], stdout); + } free (breaks); diff --git a/lib/uniwbrk/ulc-wordbreaks.c b/lib/uniwbrk/ulc-wordbreaks.c index cb6e131c..6e17026b 100644 --- a/lib/uniwbrk/ulc-wordbreaks.c +++ b/lib/uniwbrk/ulc-wordbreaks.c @@ -1,5 +1,5 @@ /* Word breaks in strings. - Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it @@ -49,60 +49,60 @@ ulc_wordbreaks (const char *s, size_t n, char *p) const char *encoding = locale_charset (); if (is_utf8_encoding (encoding)) - u8_wordbreaks ((const uint8_t *) s, n, p); + u8_wordbreaks ((const uint8_t *) s, n, p); else - { - /* Convert the string to UTF-8 and build a translation table - from offsets into s to offsets into the translated string. */ - size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); - - if (offsets != NULL) - { - uint8_t *t; - size_t m; - - t = u8_conv_from_encoding (encoding, iconveh_question_mark, - s, n, offsets, NULL, &m); - if (t != NULL) - { - char *q = (char *) (m > 0 ? malloc (m) : NULL); - - if (m == 0 || q != NULL) - { - size_t i; - - /* Determine the word breaks of the UTF-8 string. */ - u8_wordbreaks (t, m, q); - - /* Translate the result back to the original string. */ - memset (p, 0, n); - for (i = 0; i < n; i++) - if (offsets[i] != (size_t)(-1)) - p[i] = q[offsets[i]]; - - free (q); - free (t); - free (offsets); - return; - } - free (t); - } - free (offsets); - } - - /* Impossible to convert. */ + { + /* Convert the string to UTF-8 and build a translation table + from offsets into s to offsets into the translated string. */ + size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); + + if (offsets != NULL) + { + uint8_t *t; + size_t m; + + t = u8_conv_from_encoding (encoding, iconveh_question_mark, + s, n, offsets, NULL, &m); + if (t != NULL) + { + char *q = (char *) (m > 0 ? malloc (m) : NULL); + + if (m == 0 || q != NULL) + { + size_t i; + + /* Determine the word breaks of the UTF-8 string. */ + u8_wordbreaks (t, m, q); + + /* Translate the result back to the original string. */ + memset (p, 0, n); + for (i = 0; i < n; i++) + if (offsets[i] != (size_t)(-1)) + p[i] = q[offsets[i]]; + + free (q); + free (t); + free (offsets); + return; + } + free (t); + } + free (offsets); + } + + /* Impossible to convert. */ #if C_CTYPE_ASCII - if (is_all_ascii (s, n)) - { - /* ASCII is a subset of UTF-8. */ - u8_wordbreaks ((const uint8_t *) s, n, p); - return; - } + if (is_all_ascii (s, n)) + { + /* ASCII is a subset of UTF-8. */ + u8_wordbreaks ((const uint8_t *) s, n, p); + return; + } #endif - /* We have a non-ASCII string and cannot convert it. - Don't produce any word breaks. */ - memset (p, 0, n); - } + /* We have a non-ASCII string and cannot convert it. + Don't produce any word breaks. */ + memset (p, 0, n); + } } } @@ -127,28 +127,28 @@ read_file (FILE *stream) while (! feof (stream)) { if (size + BUFSIZE > alloc) - { - alloc = alloc + alloc / 2; - if (alloc < size + BUFSIZE) - alloc = size + BUFSIZE; - buf = realloc (buf, alloc); - if (buf == NULL) - { - fprintf (stderr, "out of memory\n"); - exit (1); - } - } + { + alloc = alloc + alloc / 2; + if (alloc < size + BUFSIZE) + alloc = size + BUFSIZE; + buf = realloc (buf, alloc); + if (buf == NULL) + { + fprintf (stderr, "out of memory\n"); + exit (1); + } + } count = fread (buf + size, 1, BUFSIZE, stream); if (count == 0) - { - if (ferror (stream)) - { - perror ("fread"); - exit (1); - } - } + { + if (ferror (stream)) + { + perror ("fread"); + exit (1); + } + } else - size += count; + size += count; } buf = realloc (buf, size + 1); if (buf == NULL) @@ -176,19 +176,19 @@ main (int argc, char * argv[]) ulc_wordbreaks (input, length, breaks); for (i = 0; i < length; i++) - { - switch (breaks[i]) - { - case 1: - putc ('|', stdout); - break; - case 0: - break; - default: - abort (); - } - putc (input[i], stdout); - } + { + switch (breaks[i]) + { + case 1: + putc ('|', stdout); + break; + case 0: + break; + default: + abort (); + } + putc (input[i], stdout); + } free (breaks); diff --git a/lib/uniwbrk/wbrkprop.h b/lib/uniwbrk/wbrkprop.h index 3b50e17e..77fd61de 100644 --- a/lib/uniwbrk/wbrkprop.h +++ b/lib/uniwbrk/wbrkprop.h @@ -2,7 +2,7 @@ /* Line breaking properties of Unicode characters. */ /* Generated automatically by gen-uni-tables for Unicode 5.1.0. */ -/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc. +/* Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c index 81a2323e..ff25fb31 100644 --- a/lib/uniwbrk/wbrktable.c +++ b/lib/uniwbrk/wbrktable.c @@ -1,5 +1,5 @@ /* Word break auxiliary table. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it @@ -22,22 +22,22 @@ /* This table contains the following rules (see UAX #29): - last current + last current - ALetter × ALetter (WB5) - ALetter × Numeric (WB9) - Numeric × ALetter (WB10) - Numeric × Numeric (WB8) - Katakana × Katakana (WB13) + ALetter × ALetter (WB5) + ALetter × Numeric (WB9) + Numeric × ALetter (WB10) + Numeric × Numeric (WB8) + Katakana × Katakana (WB13) (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) - ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + ExtendNumLet × ExtendNumLet (WB13a) + ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) */ const unsigned char uniwbrk_table[10][8] = { /* current: OTHER MIDNUMLET NUMERIC */ - /* KATAKANA MIDLETTER EXTENDNUMLET */ - /* ALETTER MIDNUM */ + /* KATAKANA MIDLETTER EXTENDNUMLET */ + /* ALETTER MIDNUM */ /* last */ /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1 }, /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0 }, diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h index 14efee90..8a13378b 100644 --- a/lib/uniwbrk/wbrktable.h +++ b/lib/uniwbrk/wbrktable.h @@ -1,5 +1,5 @@ /* Word break auxiliary table. - Copyright (C) 2009 Free Software Foundation, Inc. + Copyright (C) 2009-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it diff --git a/lib/uniwbrk/wordbreak-property.c b/lib/uniwbrk/wordbreak-property.c index 4d0a212d..9d98b0b5 100644 --- a/lib/uniwbrk/wordbreak-property.c +++ b/lib/uniwbrk/wordbreak-property.c @@ -1,5 +1,5 @@ /* Word break property. - Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or modify it @@ -30,15 +30,15 @@ uc_wordbreak_property (ucs4_t uc) { int lookup1 = uniwbrkprop.level1[index1]; if (lookup1 >= 0) - { - unsigned int index2 = (uc >> wbrkprop_header_2) & wbrkprop_header_3; - int lookup2 = uniwbrkprop.level2[lookup1 + index2]; - if (lookup2 >= 0) - { - unsigned int index3 = uc & wbrkprop_header_4; - return uniwbrkprop.level3[lookup2 + index3]; - } - } + { + unsigned int index2 = (uc >> wbrkprop_header_2) & wbrkprop_header_3; + int lookup2 = uniwbrkprop.level2[lookup1 + index2]; + if (lookup2 >= 0) + { + unsigned int index3 = uc & wbrkprop_header_4; + return uniwbrkprop.level3[lookup2 + index3]; + } + } } return WBP_OTHER; } |