diff options
Diffstat (limited to 'lib/uniwbrk')
| -rw-r--r-- | lib/uniwbrk/u-wordbreaks.h | 166 | ||||
| -rw-r--r-- | lib/uniwbrk/u16-wordbreaks.c | 2 | ||||
| -rw-r--r-- | lib/uniwbrk/u32-wordbreaks.c | 2 | ||||
| -rw-r--r-- | lib/uniwbrk/u8-wordbreaks.c | 68 | ||||
| -rw-r--r-- | lib/uniwbrk/ulc-wordbreaks.c | 168 | ||||
| -rw-r--r-- | lib/uniwbrk/wbrkprop.h | 2 | ||||
| -rw-r--r-- | lib/uniwbrk/wbrktable.c | 22 | ||||
| -rw-r--r-- | lib/uniwbrk/wbrktable.h | 2 | ||||
| -rw-r--r-- | lib/uniwbrk/wordbreak-property.c | 20 | 
9 files changed, 226 insertions, 226 deletions
| diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index 5ef4e8c1..b0fd301e 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -1,5 +1,5 @@  /* Word breaks in UTF-8/UTF-16/UTF-32 strings. -   Copyright (C) 2009 Free Software Foundation, Inc. +   Copyright (C) 2009-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it @@ -23,105 +23,105 @@ FUNC (const UNIT *s, size_t n, char *p)        const UNIT *s_end = s + n;        /* Word break property of the last character. -	 -1 at the very beginning of the string.  */ +         -1 at the very beginning of the string.  */        int last_char_prop = -1;        /* Format and Extend characters are ignored; this means, the mostly used -	 unit is the complex character (= character with subsequent ignored -	 characters). -	 Word break property of the last complex character. -	 -1 at the very beginning of the string.  */ +         unit is the complex character (= character with subsequent ignored +         characters). +         Word break property of the last complex character. +         -1 at the very beginning of the string.  */        int last_compchar_prop = -1;        char *last_compchar_ptr = NULL;        /* For recognizing rules involving 3 complex characters: -	 Word break property of the second-to-last complex character. -	 -1 at the very beginning of the string.  */ +         Word break property of the second-to-last complex character. +         -1 at the very beginning of the string.  */        int secondlast_compchar_prop = -1;        /* Don't break inside multibyte characters.  */        memset (p, 0, n);        while (s < s_end) -	{ -	  ucs4_t uc; -	  int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); -	  int prop = uc_wordbreak_property (uc); +        { +          ucs4_t uc; +          int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); +          int prop = uc_wordbreak_property (uc); -	  /* No break at the start of the string.  */ -	  if (last_char_prop >= 0) -	    { -	      /* No break between CR and LF.  */ -	      if (last_char_prop == WBP_CR && prop == WBP_LF) -		/* *p = 0 */; -	      /* Break before and after newlines.  */ -	      else if (last_char_prop >= WBP_NEWLINE -		       /* same as: -			  last_char_prop == WBP_CR -			  || last_char_prop == WBP_LF -			  || last_char_prop == WBP_NEWLINE */ -		       || prop >= WBP_NEWLINE -			  /* same as: -			     prop == WBP_CR -			     || prop == WBP_LF -			     || prop == WBP_NEWLINE */) -		*p = 1; -	      /* Ignore Format and Extend characters.  */ -	      else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) -		{ -		  /* No break in these situations (see UAX #29): +          /* No break at the start of the string.  */ +          if (last_char_prop >= 0) +            { +              /* No break between CR and LF.  */ +              if (last_char_prop == WBP_CR && prop == WBP_LF) +                /* *p = 0 */; +              /* Break before and after newlines.  */ +              else if (last_char_prop >= WBP_NEWLINE +                       /* same as: +                          last_char_prop == WBP_CR +                          || last_char_prop == WBP_LF +                          || last_char_prop == WBP_NEWLINE */ +                       || prop >= WBP_NEWLINE +                          /* same as: +                             prop == WBP_CR +                             || prop == WBP_LF +                             || prop == WBP_NEWLINE */) +                *p = 1; +              /* Ignore Format and Extend characters.  */ +              else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) +                { +                  /* No break in these situations (see UAX #29): -		      secondlast          last             current +                      secondlast          last             current -		       ALetter   (MidLetter | MidNumLet) × ALetter      (WB7) -		       ALetter × (MidLetter | MidNumLet)   ALetter      (WB6) -		       Numeric   (MidNum | MidNumLet)    × Numeric      (WB11) -		       Numeric × (MidNum | MidNumLet)      Numeric      (WB12) -						 ALetter × ALetter      (WB5) -						 ALetter × Numeric      (WB9) -						 Numeric × ALetter      (WB10) -						 Numeric × Numeric      (WB8) -						Katakana × Katakana     (WB13) -			  (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) -					    ExtendNumLet × ExtendNumLet (WB13a) -			 ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b) -		   */ -		  /* No break across certain punctuation.  Also, disable word -		     breaks that were recognized earlier (due to lookahead of -		     only one complex character).  */ -		  if ((prop == WBP_ALETTER -		       && (last_compchar_prop == WBP_MIDLETTER -			   || last_compchar_prop == WBP_MIDNUMLET) -		       && secondlast_compchar_prop == WBP_ALETTER) -		      || (prop == WBP_NUMERIC -			  && (last_compchar_prop == WBP_MIDNUM -			      || last_compchar_prop == WBP_MIDNUMLET) -			  && secondlast_compchar_prop == WBP_NUMERIC)) -		    { -		      *last_compchar_ptr = 0; -		      /* *p = 0; */ -		    } -		  else -		    { -		      /* Perform a single table lookup.  */ -		      if (uniwbrk_table[last_compchar_prop][prop]) -			*p = 1; -		      /* else *p = 0; */ -		    } -		} -	    } +                       ALetter   (MidLetter | MidNumLet) × ALetter      (WB7) +                       ALetter × (MidLetter | MidNumLet)   ALetter      (WB6) +                       Numeric   (MidNum | MidNumLet)    × Numeric      (WB11) +                       Numeric × (MidNum | MidNumLet)      Numeric      (WB12) +                                                 ALetter × ALetter      (WB5) +                                                 ALetter × Numeric      (WB9) +                                                 Numeric × ALetter      (WB10) +                                                 Numeric × Numeric      (WB8) +                                                Katakana × Katakana     (WB13) +                          (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) +                                            ExtendNumLet × ExtendNumLet (WB13a) +                         ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b) +                   */ +                  /* No break across certain punctuation.  Also, disable word +                     breaks that were recognized earlier (due to lookahead of +                     only one complex character).  */ +                  if ((prop == WBP_ALETTER +                       && (last_compchar_prop == WBP_MIDLETTER +                           || last_compchar_prop == WBP_MIDNUMLET) +                       && secondlast_compchar_prop == WBP_ALETTER) +                      || (prop == WBP_NUMERIC +                          && (last_compchar_prop == WBP_MIDNUM +                              || last_compchar_prop == WBP_MIDNUMLET) +                          && secondlast_compchar_prop == WBP_NUMERIC)) +                    { +                      *last_compchar_ptr = 0; +                      /* *p = 0; */ +                    } +                  else +                    { +                      /* Perform a single table lookup.  */ +                      if (uniwbrk_table[last_compchar_prop][prop]) +                        *p = 1; +                      /* else *p = 0; */ +                    } +                } +            } -	  last_char_prop = prop; -	  /* Ignore Format and Extend characters, except at the start of the string.  */ -	  if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) -	    { -	      secondlast_compchar_prop = last_compchar_prop; -	      last_compchar_prop = prop; -	      last_compchar_ptr = p; -	    } +          last_char_prop = prop; +          /* Ignore Format and Extend characters, except at the start of the string.  */ +          if (last_compchar_prop < 0 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) +            { +              secondlast_compchar_prop = last_compchar_prop; +              last_compchar_prop = prop; +              last_compchar_ptr = p; +            } -	  s += count; -	  p += count; -	} +          s += count; +          p += count; +        }      }  } diff --git a/lib/uniwbrk/u16-wordbreaks.c b/lib/uniwbrk/u16-wordbreaks.c index 3398fd3a..ea2a53d2 100644 --- a/lib/uniwbrk/u16-wordbreaks.c +++ b/lib/uniwbrk/u16-wordbreaks.c @@ -1,5 +1,5 @@  /* Word breaks in UTF-16 strings. -   Copyright (C) 2009 Free Software Foundation, Inc. +   Copyright (C) 2009-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it diff --git a/lib/uniwbrk/u32-wordbreaks.c b/lib/uniwbrk/u32-wordbreaks.c index 6763fb9e..86a26160 100644 --- a/lib/uniwbrk/u32-wordbreaks.c +++ b/lib/uniwbrk/u32-wordbreaks.c @@ -1,5 +1,5 @@  /* Word breaks in UTF-32 strings. -   Copyright (C) 2009 Free Software Foundation, Inc. +   Copyright (C) 2009-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it diff --git a/lib/uniwbrk/u8-wordbreaks.c b/lib/uniwbrk/u8-wordbreaks.c index 59d2076d..c7edfe9e 100644 --- a/lib/uniwbrk/u8-wordbreaks.c +++ b/lib/uniwbrk/u8-wordbreaks.c @@ -1,5 +1,5 @@  /* Word breaks in UTF-8 strings. -   Copyright (C) 2009 Free Software Foundation, Inc. +   Copyright (C) 2009-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it @@ -50,28 +50,28 @@ read_file (FILE *stream)    while (! feof (stream))      {        if (size + BUFSIZE > alloc) -	{ -	  alloc = alloc + alloc / 2; -	  if (alloc < size + BUFSIZE) -	    alloc = size + BUFSIZE; -	  buf = realloc (buf, alloc); -	  if (buf == NULL) -	    { -	      fprintf (stderr, "out of memory\n"); -	      exit (1); -	    } -	} +        { +          alloc = alloc + alloc / 2; +          if (alloc < size + BUFSIZE) +            alloc = size + BUFSIZE; +          buf = realloc (buf, alloc); +          if (buf == NULL) +            { +              fprintf (stderr, "out of memory\n"); +              exit (1); +            } +        }        count = fread (buf + size, 1, BUFSIZE, stream);        if (count == 0) -	{ -	  if (ferror (stream)) -	    { -	      perror ("fread"); -	      exit (1); -	    } -	} +        { +          if (ferror (stream)) +            { +              perror ("fread"); +              exit (1); +            } +        }        else -	size += count; +        size += count;      }    buf = realloc (buf, size + 1);    if (buf == NULL) @@ -98,20 +98,20 @@ main (int argc, char * argv[])        u8_wordbreaks ((uint8_t *) input, length, breaks);        for (i = 0; i < length; i++) -	{ -	  switch (breaks[i]) -	    { -	    case 1: -	      /* U+2027 in UTF-8 encoding */ -	      putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); -	      break; -	    case 0: -	      break; -	    default: -	      abort (); -	    } -	  putc (input[i], stdout); -	} +        { +          switch (breaks[i]) +            { +            case 1: +              /* U+2027 in UTF-8 encoding */ +              putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); +              break; +            case 0: +              break; +            default: +              abort (); +            } +          putc (input[i], stdout); +        }        free (breaks); diff --git a/lib/uniwbrk/ulc-wordbreaks.c b/lib/uniwbrk/ulc-wordbreaks.c index cb6e131c..6e17026b 100644 --- a/lib/uniwbrk/ulc-wordbreaks.c +++ b/lib/uniwbrk/ulc-wordbreaks.c @@ -1,5 +1,5 @@  /* Word breaks in strings. -   Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc. +   Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it @@ -49,60 +49,60 @@ ulc_wordbreaks (const char *s, size_t n, char *p)        const char *encoding = locale_charset ();        if (is_utf8_encoding (encoding)) -	u8_wordbreaks ((const uint8_t *) s, n, p); +        u8_wordbreaks ((const uint8_t *) s, n, p);        else -	{ -	  /* Convert the string to UTF-8 and build a translation table -	     from offsets into s to offsets into the translated string.  */ -	  size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); - -	  if (offsets != NULL) -	    { -	      uint8_t *t; -	      size_t m; - -	      t = u8_conv_from_encoding (encoding, iconveh_question_mark, -					 s, n, offsets, NULL, &m); -	      if (t != NULL) -		{ -		  char *q = (char *) (m > 0 ? malloc (m) : NULL); - -		  if (m == 0 || q != NULL) -		    { -		      size_t i; - -		      /* Determine the word breaks of the UTF-8 string.  */ -		      u8_wordbreaks (t, m, q); - -		      /* Translate the result back to the original string.  */ -		      memset (p, 0, n); -		      for (i = 0; i < n; i++) -			if (offsets[i] != (size_t)(-1)) -			  p[i] = q[offsets[i]]; - -		      free (q); -		      free (t); -		      free (offsets); -		      return; -		    } -		  free (t); -		} -	      free (offsets); -	    } - -	  /* Impossible to convert.  */ +        { +          /* Convert the string to UTF-8 and build a translation table +             from offsets into s to offsets into the translated string.  */ +          size_t *offsets = (size_t *) malloc (n * sizeof (size_t)); + +          if (offsets != NULL) +            { +              uint8_t *t; +              size_t m; + +              t = u8_conv_from_encoding (encoding, iconveh_question_mark, +                                         s, n, offsets, NULL, &m); +              if (t != NULL) +                { +                  char *q = (char *) (m > 0 ? malloc (m) : NULL); + +                  if (m == 0 || q != NULL) +                    { +                      size_t i; + +                      /* Determine the word breaks of the UTF-8 string.  */ +                      u8_wordbreaks (t, m, q); + +                      /* Translate the result back to the original string.  */ +                      memset (p, 0, n); +                      for (i = 0; i < n; i++) +                        if (offsets[i] != (size_t)(-1)) +                          p[i] = q[offsets[i]]; + +                      free (q); +                      free (t); +                      free (offsets); +                      return; +                    } +                  free (t); +                } +              free (offsets); +            } + +          /* Impossible to convert.  */  #if C_CTYPE_ASCII -	  if (is_all_ascii (s, n)) -	    { -	      /* ASCII is a subset of UTF-8.  */ -	      u8_wordbreaks ((const uint8_t *) s, n, p); -	      return; -	    } +          if (is_all_ascii (s, n)) +            { +              /* ASCII is a subset of UTF-8.  */ +              u8_wordbreaks ((const uint8_t *) s, n, p); +              return; +            }  #endif -	  /* We have a non-ASCII string and cannot convert it. -	     Don't produce any word breaks.  */ -	  memset (p, 0, n); -	} +          /* We have a non-ASCII string and cannot convert it. +             Don't produce any word breaks.  */ +          memset (p, 0, n); +        }      }  } @@ -127,28 +127,28 @@ read_file (FILE *stream)    while (! feof (stream))      {        if (size + BUFSIZE > alloc) -	{ -	  alloc = alloc + alloc / 2; -	  if (alloc < size + BUFSIZE) -	    alloc = size + BUFSIZE; -	  buf = realloc (buf, alloc); -	  if (buf == NULL) -	    { -	      fprintf (stderr, "out of memory\n"); -	      exit (1); -	    } -	} +        { +          alloc = alloc + alloc / 2; +          if (alloc < size + BUFSIZE) +            alloc = size + BUFSIZE; +          buf = realloc (buf, alloc); +          if (buf == NULL) +            { +              fprintf (stderr, "out of memory\n"); +              exit (1); +            } +        }        count = fread (buf + size, 1, BUFSIZE, stream);        if (count == 0) -	{ -	  if (ferror (stream)) -	    { -	      perror ("fread"); -	      exit (1); -	    } -	} +        { +          if (ferror (stream)) +            { +              perror ("fread"); +              exit (1); +            } +        }        else -	size += count; +        size += count;      }    buf = realloc (buf, size + 1);    if (buf == NULL) @@ -176,19 +176,19 @@ main (int argc, char * argv[])        ulc_wordbreaks (input, length, breaks);        for (i = 0; i < length; i++) -	{ -	  switch (breaks[i]) -	    { -	    case 1: -	      putc ('|', stdout); -	      break; -	    case 0: -	      break; -	    default: -	      abort (); -	    } -	  putc (input[i], stdout); -	} +        { +          switch (breaks[i]) +            { +            case 1: +              putc ('|', stdout); +              break; +            case 0: +              break; +            default: +              abort (); +            } +          putc (input[i], stdout); +        }        free (breaks); diff --git a/lib/uniwbrk/wbrkprop.h b/lib/uniwbrk/wbrkprop.h index 3b50e17e..77fd61de 100644 --- a/lib/uniwbrk/wbrkprop.h +++ b/lib/uniwbrk/wbrkprop.h @@ -2,7 +2,7 @@  /* Line breaking properties of Unicode characters.  */  /* Generated automatically by gen-uni-tables for Unicode 5.1.0.  */ -/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc. +/* Copyright (C) 2000-2002, 2004, 2007-2010 Free Software Foundation, Inc.     This program is free software: you can redistribute it and/or modify     it under the terms of the GNU Lesser General Public License as published by diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c index 81a2323e..ff25fb31 100644 --- a/lib/uniwbrk/wbrktable.c +++ b/lib/uniwbrk/wbrktable.c @@ -1,5 +1,5 @@  /* Word break auxiliary table. -   Copyright (C) 2009 Free Software Foundation, Inc. +   Copyright (C) 2009-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it @@ -22,22 +22,22 @@  /* This table contains the following rules (see UAX #29): -		       last         current +                       last         current -			 ALetter × ALetter                         (WB5) -			 ALetter × Numeric                         (WB9) -			 Numeric × ALetter                         (WB10) -			 Numeric × Numeric                         (WB8) -			Katakana × Katakana                        (WB13) +                         ALetter × ALetter                         (WB5) +                         ALetter × Numeric                         (WB9) +                         Numeric × ALetter                         (WB10) +                         Numeric × Numeric                         (WB8) +                        Katakana × Katakana                        (WB13)    (ALetter | Numeric | Katakana) × ExtendNumLet                    (WB13a) -		    ExtendNumLet × ExtendNumLet                    (WB13a) -		    ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b) +                    ExtendNumLet × ExtendNumLet                    (WB13a) +                    ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)   */  const unsigned char uniwbrk_table[10][8] =  {        /* current:      OTHER            MIDNUMLET         NUMERIC         */ -	 /*                   KATAKANA           MIDLETTER      EXTENDNUMLET */ -	 /*                          ALETTER            MIDNUM               */ +         /*                   KATAKANA           MIDLETTER      EXTENDNUMLET */ +         /*                          ALETTER            MIDNUM               */    /* last */    /* WBP_OTHER */        {  1,    1,    1,    1,    1,    1,    1,    1 },    /* WBP_KATAKANA */     {  1,    0,    1,    1,    1,    1,    1,    0 }, diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h index 14efee90..8a13378b 100644 --- a/lib/uniwbrk/wbrktable.h +++ b/lib/uniwbrk/wbrktable.h @@ -1,5 +1,5 @@  /* Word break auxiliary table. -   Copyright (C) 2009 Free Software Foundation, Inc. +   Copyright (C) 2009-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it diff --git a/lib/uniwbrk/wordbreak-property.c b/lib/uniwbrk/wordbreak-property.c index 4d0a212d..9d98b0b5 100644 --- a/lib/uniwbrk/wordbreak-property.c +++ b/lib/uniwbrk/wordbreak-property.c @@ -1,5 +1,5 @@  /* Word break property. -   Copyright (C) 2001-2003, 2006-2009 Free Software Foundation, Inc. +   Copyright (C) 2001-2003, 2006-2010 Free Software Foundation, Inc.     Written by Bruno Haible <bruno@clisp.org>, 2009.     This program is free software: you can redistribute it and/or modify it @@ -30,15 +30,15 @@ uc_wordbreak_property (ucs4_t uc)      {        int lookup1 = uniwbrkprop.level1[index1];        if (lookup1 >= 0) -	{ -	  unsigned int index2 = (uc >> wbrkprop_header_2) & wbrkprop_header_3; -	  int lookup2 = uniwbrkprop.level2[lookup1 + index2]; -	  if (lookup2 >= 0) -	    { -	      unsigned int index3 = uc & wbrkprop_header_4; -	      return uniwbrkprop.level3[lookup2 + index3]; -	    } -	} +        { +          unsigned int index2 = (uc >> wbrkprop_header_2) & wbrkprop_header_3; +          int lookup2 = uniwbrkprop.level2[lookup1 + index2]; +          if (lookup2 >= 0) +            { +              unsigned int index3 = uc & wbrkprop_header_4; +              return uniwbrkprop.level3[lookup2 + index3]; +            } +        }      }    return WBP_OTHER;  } | 
