diff options
author | Andreas Rottmann <a.rottmann@gmx.at> | 2009-09-14 12:32:44 +0200 |
---|---|---|
committer | Andreas Rottmann <a.rottmann@gmx.at> | 2009-09-14 12:32:44 +0200 |
commit | fa095a4504cbe668e4244547e2c141597bea4ecf (patch) | |
tree | 06135820a286ffec47804e75fbf8a147e92acd2e /lib/unistr |
Imported Upstream version 0.9.1upstream/0.9.1
Diffstat (limited to 'lib/unistr')
151 files changed, 7360 insertions, 0 deletions
diff --git a/lib/unistr/u-cmp2.h b/lib/unistr/u-cmp2.h new file mode 100644 index 00000000..ae3750c4 --- /dev/null +++ b/lib/unistr/u-cmp2.h @@ -0,0 +1,32 @@ +/* Compare pieces of UTF-8/UTF-16/UTF-32 strings. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +int +FUNC (const UNIT *s1, size_t n1, const UNIT *s2, size_t n2) +{ + int cmp = U_CMP (s1, s2, MIN (n1, n2)); + + if (cmp == 0) + { + if (n1 < n2) + cmp = -1; + else if (n1 > n2) + cmp = 1; + } + + return cmp; +} diff --git a/lib/unistr/u-cpy-alloc.h b/lib/unistr/u-cpy-alloc.h new file mode 100644 index 00000000..dace3e2e --- /dev/null +++ b/lib/unistr/u-cpy-alloc.h @@ -0,0 +1,39 @@ +/* Copy piece of UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006-2007, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <stdlib.h> +#include <string.h> + +UNIT * +FUNC (const UNIT *s, size_t n) +{ + UNIT *dest; + + dest = (UNIT *) malloc (n > 0 ? n * sizeof (UNIT) : 1); + if (dest != NULL) + { +#if 0 + UNIT *destptr = dest; + + for (; n > 0; n--) + *destptr++ = *s++; +#else + memcpy ((char *) dest, (const char *) s, n * sizeof (UNIT)); +#endif + } + return dest; +} diff --git a/lib/unistr/u-cpy.h b/lib/unistr/u-cpy.h new file mode 100644 index 00000000..c660eae0 --- /dev/null +++ b/lib/unistr/u-cpy.h @@ -0,0 +1,32 @@ +/* Copy piece of UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include <string.h> + +UNIT * +FUNC (UNIT *dest, const UNIT *src, size_t n) +{ +#if 0 + UNIT *destptr = dest; + + for (; n > 0; n--) + *destptr++ = *src++; +#else + memcpy ((char *) dest, (const char *) src, n * sizeof (UNIT)); +#endif + return dest; +} diff --git a/lib/unistr/u-endswith.h b/lib/unistr/u-endswith.h new file mode 100644 index 00000000..739bfbb1 --- /dev/null +++ b/lib/unistr/u-endswith.h @@ -0,0 +1,28 @@ +/* Substring test for UTF-8/UTF-16/UTF-32 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +bool +FUNC (const UNIT *str, const UNIT *suffix) +{ + size_t len = U_STRLEN (str); + size_t suffixlen = U_STRLEN (suffix); + + if (len >= suffixlen) + return (U_CMP (str + (len - suffixlen), suffix, suffixlen) == 0); + else + return false; +} diff --git a/lib/unistr/u-move.h b/lib/unistr/u-move.h new file mode 100644 index 00000000..77b6788b --- /dev/null +++ b/lib/unistr/u-move.h @@ -0,0 +1,44 @@ +/* Copy piece of UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <string.h> + +UNIT * +FUNC (UNIT *dest, const UNIT *src, size_t n) +{ +#if 0 + if (dest < src) + { + UNIT *destptr = dest; + const UNIT *srcptr = src; + + for (; n > 0; n--) + *destptr++ = *srcptr++; + } + else if (dest > src) + { + UNIT *destptr = dest + n - 1; + const UNIT *srcptr = src + n - 1; + + for (; n > 0; n--) + *destptr-- = *srcptr--; + } +#else + memmove ((char *) dest, (const char *) src, n * sizeof (UNIT)); +#endif + return dest; +} diff --git a/lib/unistr/u-set.h b/lib/unistr/u-set.h new file mode 100644 index 00000000..a093e7f6 --- /dev/null +++ b/lib/unistr/u-set.h @@ -0,0 +1,39 @@ +/* Fill UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <errno.h> + +UNIT * +FUNC (UNIT *s, ucs4_t uc, size_t n) +{ + if (n > 0) + { + if (IS_SINGLE_UNIT (uc)) + { + UNIT *ptr = s; + + for (; n > 0; n--) + *ptr++ = uc; + } + else + { + errno = EILSEQ; + return NULL; + } + } + return s; +} diff --git a/lib/unistr/u-startswith.h b/lib/unistr/u-startswith.h new file mode 100644 index 00000000..0486ef88 --- /dev/null +++ b/lib/unistr/u-startswith.h @@ -0,0 +1,30 @@ +/* Substring test for UTF-8/UTF-16/UTF-32 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +bool +FUNC (const UNIT *str, const UNIT *prefix) +{ + for (;;) + { + UNIT uc1 = *str++; + UNIT uc2 = *prefix++; + if (uc2 == 0) + return true; + if (uc1 != uc2) + return false; + } +} diff --git a/lib/unistr/u-stpcpy.h b/lib/unistr/u-stpcpy.h new file mode 100644 index 00000000..b13e816f --- /dev/null +++ b/lib/unistr/u-stpcpy.h @@ -0,0 +1,24 @@ +/* Copy UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (UNIT *dest, const UNIT *src) +{ + for (; (*dest = *src) != 0; src++, dest++) + ; + return dest; +} diff --git a/lib/unistr/u-stpncpy.h b/lib/unistr/u-stpncpy.h new file mode 100644 index 00000000..09bf434c --- /dev/null +++ b/lib/unistr/u-stpncpy.h @@ -0,0 +1,30 @@ +/* Copy UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (UNIT *dest, const UNIT *src, size_t n) +{ + for (; n > 0 && (*dest = *src) != 0; src++, dest++, n--) + ; + + /* This behavior is rarely useful, but it is here for consistency with + strncpy and wcsncpy. 
*/ + for (; n > 0; n--) + *dest++ = 0; + + return dest - 1; +} diff --git a/lib/unistr/u-strcat.h b/lib/unistr/u-strcat.h new file mode 100644 index 00000000..e86bbf99 --- /dev/null +++ b/lib/unistr/u-strcat.h @@ -0,0 +1,26 @@ +/* Concatenate UTF-8/UTF-16/UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (UNIT *dest, const UNIT *src) +{ + UNIT *destptr = dest + U_STRLEN (dest); + + for (; (*destptr = *src) != 0; src++, destptr++) + ; + return dest; +} diff --git a/lib/unistr/u-strcoll.h b/lib/unistr/u-strcoll.h new file mode 100644 index 00000000..af404a04 --- /dev/null +++ b/lib/unistr/u-strcoll.h @@ -0,0 +1,81 @@ +/* Compare UTF-8/UTF-16/UTF-32 strings using the collation rules of the current + locale. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +int +FUNC (const UNIT *s1, const UNIT *s2) +{ + /* When this function succeeds, it sets errno back to its original value. + When it fails, it sets errno, but also returns a meaningful return value, + for the sake of callers which ignore errno. */ + int final_errno = errno; + char *sl1; + char *sl2; + int result; + + sl1 = U_STRCONV_TO_LOCALE (s1); + if (sl1 != NULL) + { + sl2 = U_STRCONV_TO_LOCALE (s2); + if (sl2 != NULL) + { + /* Compare sl1 and sl2. */ + errno = 0; + result = strcoll (sl1, sl2); + if (errno == 0) + { + /* strcoll succeeded. */ + free (sl1); + free (sl2); + } + else + { + /* strcoll failed. */ + final_errno = errno; + free (sl1); + free (sl2); + result = U_STRCMP (s1, s2); + } + } + else + { + /* s1 could be converted to locale encoding, s2 not. */ + final_errno = errno; + free (sl1); + result = -1; + } + } + else + { + final_errno = errno; + sl2 = U_STRCONV_TO_LOCALE (s2); + if (sl2 != NULL) + { + /* s2 could be converted to locale encoding, s1 not. */ + free (sl2); + result = 1; + } + else + { + /* Neither s1 nor s2 could be converted to locale encoding. */ + result = U_STRCMP (s1, s2); + } + } + + errno = final_errno; + return result; +} diff --git a/lib/unistr/u-strcpy.h b/lib/unistr/u-strcpy.h new file mode 100644 index 00000000..153f60ec --- /dev/null +++ b/lib/unistr/u-strcpy.h @@ -0,0 +1,26 @@ +/* Copy UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (UNIT *dest, const UNIT *src) +{ + UNIT *destptr = dest; + + for (; (*destptr = *src) != 0; src++, destptr++) + ; + return dest; +} diff --git a/lib/unistr/u-strcspn.h b/lib/unistr/u-strcspn.h new file mode 100644 index 00000000..de326563 --- /dev/null +++ b/lib/unistr/u-strcspn.h @@ -0,0 +1,54 @@ +/* Search for some characters in UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +size_t +FUNC (const UNIT *str, const UNIT *reject) +{ + /* Optimize two cases. 
*/ + if (reject[0] == 0) + return U_STRLEN (str); + { + ucs4_t uc; + int count = U_STRMBTOUC (&uc, reject); + if (count >= 0 && reject[count] == 0) + { + const UNIT *found = U_STRCHR (str, uc); + if (found != NULL) + return found - str; + else + return U_STRLEN (str); + } + } + /* General case. */ + { + const UNIT *ptr = str; + + for (;;) + { + ucs4_t uc; + int count = U_STRMBTOUC (&uc, ptr); + if (count == 0) + return ptr - str; + if (count < 0) + break; + if (U_STRCHR (reject, uc)) + return ptr - str; + ptr += count; + } + return U_STRLEN (str); + } +} diff --git a/lib/unistr/u-strdup.h b/lib/unistr/u-strdup.h new file mode 100644 index 00000000..71e527a7 --- /dev/null +++ b/lib/unistr/u-strdup.h @@ -0,0 +1,40 @@ +/* Copy UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include <stdlib.h> +#include <string.h> + +UNIT * +FUNC (const UNIT *s) +{ + size_t n = U_STRLEN (s) + 1; + UNIT *dest; + + dest = (UNIT *) malloc (n * sizeof (UNIT)); + if (dest != NULL) + { +#if 0 + UNIT *destptr = dest; + + for (; n > 0; n--) + *destptr++ = *s++; +#else + memcpy ((char *) dest, (const char *) s, n * sizeof (UNIT)); +#endif + } + return dest; +} diff --git a/lib/unistr/u-strlen.h b/lib/unistr/u-strlen.h new file mode 100644 index 00000000..51dcae00 --- /dev/null +++ b/lib/unistr/u-strlen.h @@ -0,0 +1,26 @@ +/* Determine length of UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +size_t +FUNC (const UNIT *s) +{ + const UNIT *ptr; + + for (ptr = s; *ptr != 0; ptr++) + ; + return ptr - s; +} diff --git a/lib/unistr/u-strncat.h b/lib/unistr/u-strncat.h new file mode 100644 index 00000000..40b442ec --- /dev/null +++ b/lib/unistr/u-strncat.h @@ -0,0 +1,28 @@ +/* Concatenate UTF-8/UTF-16/UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (UNIT *dest, const UNIT *src, size_t n) +{ + UNIT *destptr = dest + U_STRLEN (dest); + + for (; n > 0 && (*destptr = *src) != 0; src++, destptr++, n--) + ; + if (n == 0) + *destptr = 0; + return dest; +} diff --git a/lib/unistr/u-strncpy.h b/lib/unistr/u-strncpy.h new file mode 100644 index 00000000..3d441b51 --- /dev/null +++ b/lib/unistr/u-strncpy.h @@ -0,0 +1,32 @@ +/* Copy UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +UNIT * +FUNC (UNIT *dest, const UNIT *src, size_t n) +{ + UNIT *destptr = dest; + + for (; n > 0 && (*destptr = *src) != 0; src++, destptr++, n--) + ; + + /* This behavior is rarely useful, but it is here for consistency with + strncpy and wcsncpy. */ + for (; n > 0; n--) + *destptr++ = 0; + + return dest; +} diff --git a/lib/unistr/u-strnlen.h b/lib/unistr/u-strnlen.h new file mode 100644 index 00000000..6a1d2ad2 --- /dev/null +++ b/lib/unistr/u-strnlen.h @@ -0,0 +1,26 @@ +/* Determine bounded length of UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +size_t +FUNC (const UNIT *s, size_t maxlen) +{ + const UNIT *ptr; + + for (ptr = s; maxlen > 0 && *ptr != 0; ptr++, maxlen--) + ; + return ptr - s; +} diff --git a/lib/unistr/u-strpbrk.h b/lib/unistr/u-strpbrk.h new file mode 100644 index 00000000..2ff46182 --- /dev/null +++ b/lib/unistr/u-strpbrk.h @@ -0,0 +1,46 @@ +/* Search for some characters in UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (const UNIT *str, const UNIT *accept) +{ + /* Optimize two cases. */ + if (accept[0] == 0) + return NULL; + { + ucs4_t uc; + int count = U_STRMBTOUC (&uc, accept); + if (count >= 0 && accept[count] == 0) + return U_STRCHR (str, uc); + } + /* General case. */ + { + const UNIT *ptr = str; + + for (;;) + { + ucs4_t uc; + int count = U_STRMBTOUC (&uc, ptr); + if (count <= 0) + break; + if (U_STRCHR (accept, uc)) + return (UNIT *) ptr; + ptr += count; + } + return NULL; + } +} diff --git a/lib/unistr/u-strspn.h b/lib/unistr/u-strspn.h new file mode 100644 index 00000000..6502ce4b --- /dev/null +++ b/lib/unistr/u-strspn.h @@ -0,0 +1,54 @@ +/* Search for some characters in UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +size_t +FUNC (const UNIT *str, const UNIT *accept) +{ + /* Optimize two cases. */ + if (accept[0] == 0) + return 0; + { + ucs4_t uc; + int count = U_STRMBTOUC (&uc, accept); + if (count >= 0 && accept[count] == 0) + { + const UNIT *ptr = str; + for (; *ptr != 0; ptr += count) + if (U_CMP (ptr, accept, count) != 0) + break; + return ptr - str; + } + } + /* General case. */ + { + const UNIT *ptr = str; + + for (;;) + { + ucs4_t uc; + int count = U_STRMBTOUC (&uc, ptr); + if (count == 0) + return ptr - str; + if (count < 0) + break; + if (!U_STRCHR (accept, uc)) + return ptr - str; + ptr += count; + } + return U_STRLEN (str); + } +} diff --git a/lib/unistr/u-strstr.h b/lib/unistr/u-strstr.h new file mode 100644 index 00000000..55b5a31d --- /dev/null +++ b/lib/unistr/u-strstr.h @@ -0,0 +1,49 @@ +/* Substring test for UTF-8/UTF-16/UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (const UNIT *haystack, const UNIT *needle) +{ + UNIT first = needle[0]; + + /* Is needle empty? */ + if (first == 0) + return (UNIT *) haystack; + + /* Is needle nearly empty? 
*/ + if (needle[1] == 0) + return U_STRCHR (haystack, first); + + /* Search for needle's first unit. */ + for (; *haystack != 0; haystack++) + if (*haystack == first) + { + /* Compare with needle's remaining units. */ + const UNIT *hptr = haystack + 1; + const UNIT *nptr = needle + 1; + for (;;) + { + if (*hptr != *nptr) + break; + hptr++; nptr++; + if (*nptr == 0) + return (UNIT *) haystack; + } + } + + return NULL; +} diff --git a/lib/unistr/u-strtok.h b/lib/unistr/u-strtok.h new file mode 100644 index 00000000..7ed57d6f --- /dev/null +++ b/lib/unistr/u-strtok.h @@ -0,0 +1,52 @@ +/* Tokenize UTF-8/UTF-16/UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +UNIT * +FUNC (UNIT *str, const UNIT *delim, UNIT **ptr) +{ + if (str == NULL) + { + str = *ptr; + if (str == NULL) + return NULL; /* reminder that end of token sequence has been reached */ + } + + /* Skip leading delimiters. */ + str += U_STRSPN (str, delim); + + /* Found a token? */ + if (*str == 0) + { + *ptr = NULL; + return NULL; + } + + /* Move past the token. */ + { + UNIT *token_end = U_STRPBRK (str, delim); + if (token_end) + { + /* NUL-terminate the token. 
*/ + *token_end = 0; + *ptr = token_end + 1; + } + else + *ptr = NULL; + } + + return str; +} diff --git a/lib/unistr/u16-check.c b/lib/unistr/u16-check.c new file mode 100644 index 00000000..380cec2c --- /dev/null +++ b/lib/unistr/u16-check.c @@ -0,0 +1,51 @@ +/* Check UTF-16 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint16_t * +u16_check (const uint16_t *s, size_t n) +{ + const uint16_t *s_end = s + n; + + while (s < s_end) + { + /* Keep in sync with unistr.h and utf16-ucs4.c. */ + uint16_t c = *s; + + if (c < 0xd800 || c >= 0xe000) + { + s++; + continue; + } + if (c < 0xdc00) + { + if (s + 2 <= s_end + && s[1] >= 0xdc00 && s[1] < 0xe000) + { + s += 2; + continue; + } + } + /* invalid or incomplete multibyte character */ + return s; + } + return NULL; +} diff --git a/lib/unistr/u16-chr.c b/lib/unistr/u16-chr.c new file mode 100644 index 00000000..2d7d797e --- /dev/null +++ b/lib/unistr/u16-chr.c @@ -0,0 +1,56 @@ +/* Search character in piece of UTF-16 string. + Copyright (C) 1999, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint16_t * +u16_chr (const uint16_t *s, size_t n, ucs4_t uc) +{ + uint16_t c[2]; + + if (uc < 0x10000) + { + uint16_t c0 = uc; + + for (; n > 0; s++, n--) + { + if (*s == c0) + return (uint16_t *) s; + } + } + else + switch (u16_uctomb_aux (c, uc, 2)) + { + case 2: + if (n > 1) + { + uint16_t c0 = c[0]; + uint16_t c1 = c[1]; + + for (n--; n > 0; s++, n--) + { + if (*s == c0 && s[1] == c1) + return (uint16_t *) s; + } + } + break; + } + return NULL; +} diff --git a/lib/unistr/u16-cmp.c b/lib/unistr/u16-cmp.c new file mode 100644 index 00000000..0130d271 --- /dev/null +++ b/lib/unistr/u16-cmp.c @@ -0,0 +1,54 @@ +/* Compare pieces of UTF-16 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
/* Compare the first N units of S1 and S2.
   Return 0 if they are equal, a negative value if S1 sorts before S2 and
   a positive value otherwise.  A unit in the surrogate range
   0xd800..0xdfff always sorts after a unit outside that range, because a
   surrogate pair denotes a character above every 16-bit character.  */
int
u16_cmp (const uint16_t *s1, const uint16_t *s2, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    {
      const uint16_t u1 = s1[i];
      const uint16_t u2 = s2[i];
      int u1_is_surrogate;
      int u2_is_surrogate;

      if (u1 == u2)
        continue;

      u1_is_surrogate = (u1 >= 0xd800 && u1 < 0xe000);
      u2_is_surrogate = (u2 >= 0xd800 && u2 < 0xe000);

      /* Exactly one side is a surrogate: that side is the greater one.  */
      if (u1_is_surrogate != u2_is_surrogate)
        return u1_is_surrogate ? 1 : -1;

      /* Same class on both sides: plain numeric order decides.  */
      return (int) u1 - (int) u2;
    }
  return 0;
}
*/ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#include "minmax.h" + +#define FUNC u16_cmp2 +#define UNIT uint16_t +#define U_CMP u16_cmp +#include "u-cmp2.h" diff --git a/lib/unistr/u16-cpy-alloc.c b/lib/unistr/u16-cpy-alloc.c new file mode 100644 index 00000000..047977eb --- /dev/null +++ b/lib/unistr/u16-cpy-alloc.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_cpy_alloc +#define UNIT uint16_t +#include "u-cpy-alloc.h" diff --git a/lib/unistr/u16-cpy.c b/lib/unistr/u16-cpy.c new file mode 100644 index 00000000..13e04b81 --- /dev/null +++ b/lib/unistr/u16-cpy.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_cpy +#define UNIT uint16_t +#include "u-cpy.h" diff --git a/lib/unistr/u16-endswith.c b/lib/unistr/u16-endswith.c new file mode 100644 index 00000000..d9abf464 --- /dev/null +++ b/lib/unistr/u16-endswith.c @@ -0,0 +1,27 @@ +/* Substring test for UTF-16 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_endswith +#define UNIT uint16_t +#define U_STRLEN u16_strlen +#define U_CMP u16_cmp +#include "u-endswith.h" diff --git a/lib/unistr/u16-mblen.c b/lib/unistr/u16-mblen.c new file mode 100644 index 00000000..6bb35ac8 --- /dev/null +++ b/lib/unistr/u16-mblen.c @@ -0,0 +1,49 @@ +/* Look at first character in UTF-16 string. 
/* Return the number of units occupied by the character at the start of
   the N-unit range S: 0 for a NUL unit, 1 for any other unit outside
   0xd800..0xdfff, 2 for a surrogate pair.  Return -1 if N is 0 or the
   sequence is invalid or incomplete.  Without CONFIG_UNICODE_SAFETY any
   unit in 0xd800..0xdfff that is followed by another unit counts as a
   pair; the ranges of both units are then not checked.  */
int
u16_mblen (const uint16_t *s, size_t n)
{
  uint16_t lead;

  if (n == 0)
    return -1;

  /* Keep in sync with unistr.h and utf16-ucs4.c.  */
  lead = s[0];
  if (!(lead >= 0xd800 && lead < 0xe000))
    return lead == 0 ? 0 : 1;

  /* From here on a second unit is required.  */
  if (n < 2)
    return -1;
#if CONFIG_UNICODE_SAFETY
  if (!(lead < 0xdc00 && s[1] >= 0xdc00 && s[1] < 0xe000))
    return -1;
#endif
  return 2;
}
/* Count the characters in the N-unit range S, as delimited by
   u16_mblen.  Whenever u16_mblen does not report a positive length, the
   current unit is counted as one character of length 1, so the scan
   always makes progress.  */
size_t
u16_mbsnlen (const uint16_t *s, size_t n)
{
  size_t total;

  for (total = 0; n > 0; total++)
    {
      int step = u16_mblen (s, n);

      if (step < 1)
        step = 1;
      s += step;
      n -= step;
    }
  return total;
}
/* Decode the surrogate pair at the start of the N-unit range S.
   On success store the character in *PUC and return 2.  Otherwise store
   U+FFFD in *PUC and return 1.
   The lower bound of the first unit is not checked here; presumably the
   caller has already established that it is >= 0xd800 -- confirm against
   unistr.h.  */
#if defined IN_LIBUNISTRING || HAVE_INLINE

int
u16_mbtouc_aux (ucs4_t *puc, const uint16_t *s, size_t n)
{
  const uint16_t lead = s[0];

  /* The second unit is looked at only if it lies inside the range.  */
  if (lead < 0xdc00 && n >= 2)
    {
      const uint16_t trail = s[1];

      if (trail >= 0xdc00 && trail < 0xe000)
        {
          *puc = 0x10000 + ((lead - 0xd800) << 10) + (trail - 0xdc00);
          return 2;
        }
    }

  /* invalid or incomplete multibyte character */
  *puc = 0xfffd;
  return 1;
}

#endif
/* Decode the two-unit sequence at the start of the N-unit range S.
   On success store the character in *PUC and return 2.  Otherwise store
   U+FFFD in *PUC and return 1.
   With CONFIG_UNICODE_SAFETY the first unit must be below 0xdc00 and the
   second one in 0xdc00..0xdfff; without it the only requirement is
   N >= 2.  */
#if defined IN_LIBUNISTRING || HAVE_INLINE

int
u16_mbtouc_unsafe_aux (ucs4_t *puc, const uint16_t *s, size_t n)
{
  const uint16_t lead = s[0];

  /* N is tested first so that s[1] is never read outside the range.  */
  if (n >= 2
#if CONFIG_UNICODE_SAFETY
      && lead < 0xdc00
      && s[1] >= 0xdc00 && s[1] < 0xe000
#endif
     )
    {
      *puc = 0x10000 + ((lead - 0xd800) << 10) + (s[1] - 0xdc00);
      return 2;
    }

  /* invalid or incomplete multibyte character */
  *puc = 0xfffd;
  return 1;
}

#endif
*/ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u16_mbtouc_unsafe (ucs4_t *puc, const uint16_t *s, size_t n) +{ + uint16_t c = *s; + + if (c < 0xd800 || c >= 0xe000) + { + *puc = c; + return 1; + } +#if CONFIG_UNICODE_SAFETY + if (c < 0xdc00) +#endif + { + if (n >= 2) + { +#if CONFIG_UNICODE_SAFETY + if (s[1] >= 0xdc00 && s[1] < 0xe000) +#endif + { + *puc = 0x10000 + ((c - 0xd800) << 10) + (s[1] - 0xdc00); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + } + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u16-mbtouc.c b/lib/unistr/u16-mbtouc.c new file mode 100644 index 00000000..2691db83 --- /dev/null +++ b/lib/unistr/u16-mbtouc.c @@ -0,0 +1,61 @@ +/* Look at first character in UTF-16 string. + Copyright (C) 1999-2002, 2006-2007, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u16_mbtouc as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. 
*/ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u16_mbtouc (ucs4_t *puc, const uint16_t *s, size_t n) +{ + uint16_t c = *s; + + if (c < 0xd800 || c >= 0xe000) + { + *puc = c; + return 1; + } + if (c < 0xdc00) + { + if (n >= 2) + { + if (s[1] >= 0xdc00 && s[1] < 0xe000) + { + *puc = 0x10000 + ((c - 0xd800) << 10) + (s[1] - 0xdc00); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + } + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u16-mbtoucr.c b/lib/unistr/u16-mbtoucr.c new file mode 100644 index 00000000..a1bd8ee9 --- /dev/null +++ b/lib/unistr/u16-mbtoucr.c @@ -0,0 +1,54 @@ +/* Look at first character in UTF-16 string, returning an error code. + Copyright (C) 1999-2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +/* Decode the character at the start of the N-unit range S into *PUC. Return 1 or 2 (the number of units used) on success. On failure store U+FFFD in *PUC and return -2 if the range ends right after a unit in 0xd800..0xdbff, or -1 if the sequence is invalid. *S is read unconditionally, so N is presumably > 0 -- confirm against callers. */ int +u16_mbtoucr (ucs4_t *puc, const uint16_t *s, size_t n) +{ + uint16_t c = *s; + + /* A unit outside 0xd800..0xdfff is the character itself. */ if (c < 0xd800 || c >= 0xe000) + { + *puc = c; + return 1; + } + if (c < 0xdc00) + { + if (n >= 2) + { + if (s[1] >= 0xdc00 && s[1] < 0xe000) + { + *puc = 0x10000 + ((c - 0xd800) << 10) + (s[1] - 0xdc00); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + *puc = 0xfffd; + return -1; +} diff --git a/lib/unistr/u16-move.c b/lib/unistr/u16-move.c new file mode 100644 index 00000000..2bf8c619 --- /dev/null +++ b/lib/unistr/u16-move.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +/* FUNC and UNIT are presumably the parameters consumed by u-move.h, which is not shown here. */ #define FUNC u16_move +#define UNIT uint16_t +#include "u-move.h" diff --git a/lib/unistr/u16-next.c b/lib/unistr/u16-next.c new file mode 100644 index 00000000..7c49f72f --- /dev/null +++ b/lib/unistr/u16-next.c @@ -0,0 +1,37 @@ +/* Iterate over next character in UTF-16 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002.
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint16_t * +u16_next (ucs4_t *puc, const uint16_t *s) +{ + int count; + + count = u16_strmbtouc (puc, s); + if (count > 0) + return s + count; + else + { + if (count < 0) + *puc = 0xfffd; + return NULL; + } +} diff --git a/lib/unistr/u16-prev.c b/lib/unistr/u16-prev.c new file mode 100644 index 00000000..3beecf01 --- /dev/null +++ b/lib/unistr/u16-prev.c @@ -0,0 +1,53 @@ +/* Iterate over previous character in UTF-16 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +/* Step back from S to the character that ends just before it, without reading before START. On success store that character in *PUC and return a pointer to its first unit. Return NULL, leaving *PUC untouched, if S == START or the preceding units are not accepted. With CONFIG_UNICODE_SAFETY a pair must be a unit in 0xd800..0xdbff followed by a unit in 0xdc00..0xdfff; without it those ranges are not checked. */ const uint16_t * +u16_prev (ucs4_t *puc, const uint16_t *s, const uint16_t *start) +{ + /* Keep in sync with unistr.h and utf16-ucs4.c. */ + if (s != start) + { + uint16_t c_1 = s[-1]; + + /* A unit outside 0xd800..0xdfff is a character by itself. */ if (c_1 < 0xd800 || c_1 >= 0xe000) + { + *puc = c_1; + return s - 1; + } +#if CONFIG_UNICODE_SAFETY + if (c_1 >= 0xdc00) +#endif + /* A second unit is read only if it does not lie before START. */ if (s - 1 != start) + { + uint16_t c_2 = s[-2]; + +#if CONFIG_UNICODE_SAFETY + if (c_2 >= 0xd800 && c_2 < 0xdc00) +#endif + { + *puc = 0x10000 + ((c_2 - 0xd800) << 10) + (c_1 - 0xdc00); + return s - 2; + } + } + } + return NULL; +} diff --git a/lib/unistr/u16-set.c b/lib/unistr/u16-set.c new file mode 100644 index 00000000..9ef307f2 --- /dev/null +++ b/lib/unistr/u16-set.c @@ -0,0 +1,26 @@ +/* Fill UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +/* FUNC, UNIT and IS_SINGLE_UNIT are presumably the parameters consumed by u-set.h, which is not shown here. */ #define FUNC u16_set +#define UNIT uint16_t +/* True for values below 0x10000 that lie outside 0xd800..0xdfff. NOTE(review): the argument is expanded unparenthesized, so only simple operands are safe. */ #define IS_SINGLE_UNIT(uc) (uc < 0xd800 || (uc < 0x10000 && uc >= 0xe000)) +#include "u-set.h" diff --git a/lib/unistr/u16-startswith.c b/lib/unistr/u16-startswith.c new file mode 100644 index 00000000..2f39d74f --- /dev/null +++ b/lib/unistr/u16-startswith.c @@ -0,0 +1,25 @@ +/* Substring test for UTF-16 strings.
+ Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_startswith +#define UNIT uint16_t +#include "u-startswith.h" diff --git a/lib/unistr/u16-stpcpy.c b/lib/unistr/u16-stpcpy.c new file mode 100644 index 00000000..9207edc1 --- /dev/null +++ b/lib/unistr/u16-stpcpy.c @@ -0,0 +1,25 @@ +/* Copy UTF-16 string. + Copyright (C) 1999, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u16_stpcpy +#define UNIT uint16_t +#include "u-stpcpy.h" diff --git a/lib/unistr/u16-stpncpy.c b/lib/unistr/u16-stpncpy.c new file mode 100644 index 00000000..30ef7e30 --- /dev/null +++ b/lib/unistr/u16-stpncpy.c @@ -0,0 +1,25 @@ +/* Copy UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_stpncpy +#define UNIT uint16_t +#include "u-stpncpy.h" diff --git a/lib/unistr/u16-strcat.c b/lib/unistr/u16-strcat.c new file mode 100644 index 00000000..bb88f1b5 --- /dev/null +++ b/lib/unistr/u16-strcat.c @@ -0,0 +1,26 @@ +/* Concatenate UTF-16 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strcat +#define UNIT uint16_t +#define U_STRLEN u16_strlen +#include "u-strcat.h" diff --git a/lib/unistr/u16-strchr.c b/lib/unistr/u16-strchr.c new file mode 100644 index 00000000..673152f9 --- /dev/null +++ b/lib/unistr/u16-strchr.c @@ -0,0 +1,63 @@ +/* Search character in UTF-16 string. + Copyright (C) 1999, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +/* Find the first occurrence of UC in the NUL-terminated string S. Return a pointer to it, or NULL if it does not occur. For UC < 0x10000 the match test precedes the end test, so searching for 0 yields the terminator. For UC >= 0x10000 the two units written by u16_uctomb_aux are matched at adjacent positions. */ uint16_t * +u16_strchr (const uint16_t *s, ucs4_t uc) +{ + uint16_t c[2]; + + if (uc < 0x10000) + { + uint16_t c0 = uc; + + for (;; s++) + { + if (*s == c0) + break; + if (*s == 0) + goto notfound; + } + return (uint16_t *) s; + } + else + switch (u16_uctomb_aux (c, uc, 2)) + { + case 2: + /* An empty string cannot contain a pair; this also guarantees that s[1] may be read in the loop below. */ if (*s == 0) + goto notfound; + { + uint16_t c0 = c[0]; + uint16_t c1 = c[1]; + + for (;; s++) + { + /* s[0] is known to be non-NUL here, so s[1] is still inside the string. */ if (s[1] == 0) + goto notfound; + if (*s == c0 && s[1] == c1) + break; + } + return (uint16_t *) s; + } + } +notfound: + return NULL; +} diff --git a/lib/unistr/u16-strcmp.c b/lib/unistr/u16-strcmp.c new file mode 100644 index 00000000..c161c65b --- /dev/null +++ b/lib/unistr/u16-strcmp.c @@ -0,0 +1,50 @@ +/* Compare UTF-16 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +/* Compare the NUL-terminated strings S1 and S2 unit by unit. Return 0 if they are equal, a negative value if S1 sorts before S2, and a positive value otherwise. */ int +u16_strcmp (const uint16_t *s1, const uint16_t *s2) +{ + /* Note that the UTF-16 encoding does NOT preserve lexicographic order. + Namely, if uc1 is a 16-bit character and [uc2a,uc2b] is a surrogate pair, + we must enforce uc1 < [uc2a,uc2b], even if uc1 > uc2a.
*/ + for (;;) + { + uint16_t c1 = *s1++; + uint16_t c2 = *s2++; + /* Keep scanning only while the units match and the terminator has not been reached. */ if (c1 != 0 && c1 == c2) + continue; + if (c1 < 0xd800 || c1 >= 0xe000) + { + if (!(c2 < 0xd800 || c2 >= 0xe000)) + /* c2 is a surrogate, but c1 is not. */ + return -1; + } + else + { + if (c2 < 0xd800 || c2 >= 0xe000) + /* c1 is a surrogate, but c2 is not. */ + return 1; + } + return (int)c1 - (int)c2; + /* > 0 if c1 > c2, < 0 if c1 < c2. */ + } +} diff --git a/lib/unistr/u16-strcoll.c b/lib/unistr/u16-strcoll.c new file mode 100644 index 00000000..280ba83e --- /dev/null +++ b/lib/unistr/u16-strcoll.c @@ -0,0 +1,33 @@ +/* Compare UTF-16 strings using the collation rules of the current locale. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "uniconv.h" + +/* The macros below are presumably the parameters consumed by u-strcoll.h, which is not shown here. */ #define FUNC u16_strcoll +#define UNIT uint16_t +#define U_STRCMP u16_strcmp +#define U_STRCONV_TO_LOCALE u16_strconv_to_locale +#include "u-strcoll.h" diff --git a/lib/unistr/u16-strcpy.c b/lib/unistr/u16-strcpy.c new file mode 100644 index 00000000..92c3e7ec --- /dev/null +++ b/lib/unistr/u16-strcpy.c @@ -0,0 +1,25 @@ +/* Copy UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc.
+ Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strcpy +#define UNIT uint16_t +#include "u-strcpy.h" diff --git a/lib/unistr/u16-strcspn.c b/lib/unistr/u16-strcspn.c new file mode 100644 index 00000000..2f5ba435 --- /dev/null +++ b/lib/unistr/u16-strcspn.c @@ -0,0 +1,28 @@ +/* Search for some characters in UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u16_strcspn +#define UNIT uint16_t +#define U_STRLEN u16_strlen +#define U_STRMBTOUC u16_strmbtouc +#define U_STRCHR u16_strchr +#include "u-strcspn.h" diff --git a/lib/unistr/u16-strdup.c b/lib/unistr/u16-strdup.c new file mode 100644 index 00000000..22242c29 --- /dev/null +++ b/lib/unistr/u16-strdup.c @@ -0,0 +1,26 @@ +/* Copy UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strdup +#define UNIT uint16_t +#define U_STRLEN u16_strlen +#include "u-strdup.h" diff --git a/lib/unistr/u16-strlen.c b/lib/unistr/u16-strlen.c new file mode 100644 index 00000000..9c05541c --- /dev/null +++ b/lib/unistr/u16-strlen.c @@ -0,0 +1,25 @@ +/* Determine length of UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strlen +#define UNIT uint16_t +#include "u-strlen.h" diff --git a/lib/unistr/u16-strmblen.c b/lib/unistr/u16-strmblen.c new file mode 100644 index 00000000..e7f36250 --- /dev/null +++ b/lib/unistr/u16-strmblen.c @@ -0,0 +1,43 @@ +/* Look at first character in UTF-16 string. + Copyright (C) 1999-2000, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u16_strmblen (const uint16_t *s) +{ + /* Keep in sync with unistr.h and utf16-ucs4.c. */ + uint16_t c = *s; + + if (c < 0xd800 || c >= 0xe000) + return (c != 0 ? 
1 : 0); +#if CONFIG_UNICODE_SAFETY + if (c < 0xdc00) + { + if (s[1] >= 0xdc00 && s[1] < 0xe000) + return 2; + } +#else + if (s[1] != 0) + return 2; +#endif + /* invalid or incomplete multibyte character */ + return -1; +} diff --git a/lib/unistr/u16-strmbtouc.c b/lib/unistr/u16-strmbtouc.c new file mode 100644 index 00000000..78d482ee --- /dev/null +++ b/lib/unistr/u16-strmbtouc.c @@ -0,0 +1,50 @@ +/* Look at first character in UTF-16 string. + Copyright (C) 1999-2000, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u16_strmbtouc (ucs4_t *puc, const uint16_t *s) +{ + /* Keep in sync with unistr.h and utf16-ucs4.c. */ + uint16_t c = *s; + + if (c < 0xd800 || c >= 0xe000) + { + *puc = c; + return (c != 0 ? 
1 : 0); + } +#if CONFIG_UNICODE_SAFETY + if (c < 0xdc00) +#endif + { +#if CONFIG_UNICODE_SAFETY + if (s[1] >= 0xdc00 && s[1] < 0xe000) +#else + if (s[1] != 0) +#endif + { + *puc = 0x10000 + ((c - 0xd800) << 10) + (s[1] - 0xdc00); + return 2; + } + } + /* invalid or incomplete multibyte character */ + return -1; +} diff --git a/lib/unistr/u16-strncat.c b/lib/unistr/u16-strncat.c new file mode 100644 index 00000000..9e14388c --- /dev/null +++ b/lib/unistr/u16-strncat.c @@ -0,0 +1,26 @@ +/* Concatenate UTF-16 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strncat +#define UNIT uint16_t +#define U_STRLEN u16_strlen +#include "u-strncat.h" diff --git a/lib/unistr/u16-strncmp.c b/lib/unistr/u16-strncmp.c new file mode 100644 index 00000000..7278b6f0 --- /dev/null +++ b/lib/unistr/u16-strncmp.c @@ -0,0 +1,54 @@ +/* Compare UTF-16 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
/* Compare at most N units of the NUL-terminated UTF-16 strings S1 and S2.
   Return a negative value if S1 sorts first, a positive value if S2 sorts
   first, and 0 if they are equal over the compared range.

   UTF-16 does NOT preserve code point order: a surrogate pair encodes a
   character >= 0x10000 and therefore must compare greater than any
   single-unit character, even one whose unit value exceeds 0xD800.  */
int
u16_strncmp (const uint16_t *s1, const uint16_t *s2, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    {
      const uint16_t a = s1[i];
      const uint16_t b = s2[i];
      int a_is_surrogate;
      int b_is_surrogate;

      /* Equal, non-terminating units: keep scanning.  */
      if (a != 0 && a == b)
        continue;

      a_is_surrogate = (a >= 0xd800 && a < 0xe000);
      b_is_surrogate = (b >= 0xd800 && b < 0xe000);

      /* Exactly one side is a surrogate: that side is the larger one.  */
      if (a_is_surrogate != b_is_surrogate)
        return a_is_surrogate ? 1 : -1;

      /* Same class on both sides: plain unit difference.
         > 0 if a > b, < 0 if a < b, = 0 if both are the terminating NUL.  */
      return (int) a - (int) b;
    }

  return 0;
}
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strncpy +#define UNIT uint16_t +#include "u-strncpy.h" diff --git a/lib/unistr/u16-strnlen.c b/lib/unistr/u16-strnlen.c new file mode 100644 index 00000000..1cd80c41 --- /dev/null +++ b/lib/unistr/u16-strnlen.c @@ -0,0 +1,25 @@ +/* Determine bounded length of UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u16_strnlen +#define UNIT uint16_t +#include "u-strnlen.h" diff --git a/lib/unistr/u16-strpbrk.c b/lib/unistr/u16-strpbrk.c new file mode 100644 index 00000000..63605819 --- /dev/null +++ b/lib/unistr/u16-strpbrk.c @@ -0,0 +1,27 @@ +/* Search for some characters in UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strpbrk +#define UNIT uint16_t +#define U_STRMBTOUC u16_strmbtouc +#define U_STRCHR u16_strchr +#include "u-strpbrk.h" diff --git a/lib/unistr/u16-strrchr.c b/lib/unistr/u16-strrchr.c new file mode 100644 index 00000000..b3c7ab03 --- /dev/null +++ b/lib/unistr/u16-strrchr.c @@ -0,0 +1,64 @@ +/* Search character in UTF-16 string. + Copyright (C) 1999, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint16_t * +u16_strrchr (const uint16_t *s, ucs4_t uc) +{ + /* Calling u16_strlen and then searching from the other end would cause more + memory accesses. Avoid that, at the cost of a few more comparisons. */ + uint16_t *result = NULL; + uint16_t c[2]; + + if (uc < 0x10000) + { + uint16_t c0 = uc; + + for (;; s++) + { + if (*s == c0) + result = (uint16_t *) s; + if (*s == 0) + break; + } + } + else + switch (u16_uctomb_aux (c, uc, 2)) + { + case 2: + if (*s) + { + uint16_t c0 = c[0]; + uint16_t c1 = c[1]; + + /* FIXME: Maybe walking the string via u16_mblen is a win? */ + for (;; s++) + { + if (s[1] == 0) + break; + if (*s == c0 && s[1] == c1) + result = (uint16_t *) s; + } + } + break; + } + return result; +} diff --git a/lib/unistr/u16-strspn.c b/lib/unistr/u16-strspn.c new file mode 100644 index 00000000..04ba8e69 --- /dev/null +++ b/lib/unistr/u16-strspn.c @@ -0,0 +1,29 @@ +/* Search for some characters in UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strspn +#define UNIT uint16_t +#define U_STRLEN u16_strlen +#define U_STRMBTOUC u16_strmbtouc +#define U_CMP u16_cmp +#define U_STRCHR u16_strchr +#include "u-strspn.h" diff --git a/lib/unistr/u16-strstr.c b/lib/unistr/u16-strstr.c new file mode 100644 index 00000000..bc61233d --- /dev/null +++ b/lib/unistr/u16-strstr.c @@ -0,0 +1,28 @@ +/* Substring test for UTF-16 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +/* FIXME: Maybe walking the string via u16_mblen is a win? */ + +#define FUNC u16_strstr +#define UNIT uint16_t +#define U_STRCHR u16_strchr +#include "u-strstr.h" diff --git a/lib/unistr/u16-strtok.c b/lib/unistr/u16-strtok.c new file mode 100644 index 00000000..dd9b247d --- /dev/null +++ b/lib/unistr/u16-strtok.c @@ -0,0 +1,27 @@ +/* Tokenize UTF-16 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_strtok +#define UNIT uint16_t +#define U_STRSPN u16_strspn +#define U_STRPBRK u16_strpbrk +#include "u-strtok.h" diff --git a/lib/unistr/u16-to-u32.c b/lib/unistr/u16-to-u32.c new file mode 100644 index 00000000..2f39dcd7 --- /dev/null +++ b/lib/unistr/u16-to-u32.c @@ -0,0 +1,125 @@ +/* Convert UTF-16 string to UTF-32 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u16_to_u32 +#define SRC_UNIT uint16_t +#define DST_UNIT uint32_t + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +DST_UNIT * +FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp) +{ + const SRC_UNIT *s_end = s + n; + /* Output string accumulator. */ + DST_UNIT *result; + size_t allocated; + size_t length; + + if (resultbuf != NULL) + { + result = resultbuf; + allocated = *lengthp; + } + else + { + result = NULL; + allocated = 0; + } + length = 0; + /* Invariants: + result is either == resultbuf or == NULL or malloc-allocated. + If length > 0, then result != NULL. */ + + while (s < s_end) + { + ucs4_t uc; + int count; + + /* Fetch a Unicode character from the input string. */ + count = u16_mbtouc (&uc, s, s_end - s); + if (count < 0) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + s += count; + + /* Store it in the output string. */ + if (length + 1 > allocated) + { + DST_UNIT *memory; + + allocated = (allocated > 0 ? 2 * allocated : 12); + if (length + 1 > allocated) + allocated = length + 1; + if (result == resultbuf || result == NULL) + memory = (DST_UNIT *) malloc (allocated * sizeof (DST_UNIT)); + else + memory = + (DST_UNIT *) realloc (result, allocated * sizeof (DST_UNIT)); + + if (memory == NULL) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = ENOMEM; + return NULL; + } + if (result == resultbuf && length > 0) + memcpy ((char *) memory, (char *) result, + length * sizeof (DST_UNIT)); + result = memory; + } + result[length++] = uc; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (DST_UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + return NULL; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. 
*/ + DST_UNIT *memory; + + memory = (DST_UNIT *) realloc (result, length * sizeof (DST_UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; +} diff --git a/lib/unistr/u16-to-u8.c b/lib/unistr/u16-to-u8.c new file mode 100644 index 00000000..38b27ec0 --- /dev/null +++ b/lib/unistr/u16-to-u8.c @@ -0,0 +1,136 @@ +/* Convert UTF-16 string to UTF-8 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u16_to_u8 +#define SRC_UNIT uint16_t +#define DST_UNIT uint8_t + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +DST_UNIT * +FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp) +{ + const SRC_UNIT *s_end = s + n; + /* Output string accumulator. */ + DST_UNIT *result; + size_t allocated; + size_t length; + + if (resultbuf != NULL) + { + result = resultbuf; + allocated = *lengthp; + } + else + { + result = NULL; + allocated = 0; + } + length = 0; + /* Invariants: + result is either == resultbuf or == NULL or malloc-allocated. + If length > 0, then result != NULL. */ + + while (s < s_end) + { + ucs4_t uc; + int count; + + /* Fetch a Unicode character from the input string. 
*/ + count = u16_mbtouc (&uc, s, s_end - s); + if (count < 0) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + s += count; + + /* Store it in the output string. */ + count = u8_uctomb (result + length, uc, allocated - length); + if (count == -1) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + if (count == -2) + { + DST_UNIT *memory; + + allocated = (allocated > 0 ? 2 * allocated : 12); + if (length + 6 > allocated) + allocated = length + 6; + if (result == resultbuf || result == NULL) + memory = (DST_UNIT *) malloc (allocated * sizeof (DST_UNIT)); + else + memory = + (DST_UNIT *) realloc (result, allocated * sizeof (DST_UNIT)); + + if (memory == NULL) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = ENOMEM; + return NULL; + } + if (result == resultbuf && length > 0) + memcpy ((char *) memory, (char *) result, + length * sizeof (DST_UNIT)); + result = memory; + count = u8_uctomb (result + length, uc, allocated - length); + if (count < 0) + abort (); + } + length += count; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (DST_UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + return NULL; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + DST_UNIT *memory; + + memory = (DST_UNIT *) realloc (result, length * sizeof (DST_UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; +} diff --git a/lib/unistr/u16-uctomb-aux.c b/lib/unistr/u16-uctomb-aux.c new file mode 100644 index 00000000..384452ba --- /dev/null +++ b/lib/unistr/u16-uctomb-aux.c @@ -0,0 +1,58 @@ +/* Conversion UCS-4 to UTF-16. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u16_uctomb_aux (uint16_t *s, ucs4_t uc, int n) +{ + if (uc < 0xd800) + { + /* The case n >= 1 is already handled by the caller. */ + } + else if (uc < 0x10000) + { + if (uc >= 0xe000) + { + if (n >= 1) + { + s[0] = uc; + return 1; + } + } + else + return -1; + } + else + { + if (uc < 0x110000) + { + if (n >= 2) + { + s[0] = 0xd800 + ((uc - 0x10000) >> 10); + s[1] = 0xdc00 + ((uc - 0x10000) & 0x3ff); + return 2; + } + } + else + return -1; + } + return -2; +} diff --git a/lib/unistr/u16-uctomb.c b/lib/unistr/u16-uctomb.c new file mode 100644 index 00000000..6ac5ada5 --- /dev/null +++ b/lib/unistr/u16-uctomb.c @@ -0,0 +1,72 @@ +/* Store a character in UTF-16 string. + Copyright (C) 2002, 2005-2006, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u16_uctomb as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u16_uctomb (uint16_t *s, ucs4_t uc, int n) +{ + if (uc < 0xd800) + { + if (n > 0) + { + s[0] = uc; + return 1; + } + /* else return -2, below. */ + } + else if (uc < 0x10000) + { + if (uc >= 0xe000) + { + if (n >= 1) + { + s[0] = uc; + return 1; + } + } + else + return -1; + } + else + { + if (uc < 0x110000) + { + if (n >= 2) + { + s[0] = 0xd800 + ((uc - 0x10000) >> 10); + s[1] = 0xdc00 + ((uc - 0x10000) & 0x3ff); + return 2; + } + } + else + return -1; + } + return -2; +} + +#endif diff --git a/lib/unistr/u32-check.c b/lib/unistr/u32-check.c new file mode 100644 index 00000000..87e91271 --- /dev/null +++ b/lib/unistr/u32-check.c @@ -0,0 +1,39 @@ +/* Check UTF-32 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +const uint32_t * +u32_check (const uint32_t *s, size_t n) +{ + const uint32_t *s_end = s + n; + + while (s < s_end) + { + uint32_t c = *s; + + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) + s++; + else + /* invalid Unicode character */ + return s; + } + return NULL; +} diff --git a/lib/unistr/u32-chr.c b/lib/unistr/u32-chr.c new file mode 100644 index 00000000..5a594b25 --- /dev/null +++ b/lib/unistr/u32-chr.c @@ -0,0 +1,32 @@ +/* Search character in piece of UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint32_t * +u32_chr (const uint32_t *s, size_t n, ucs4_t uc) +{ + for (; n > 0; s++, n--) + { + if (*s == uc) + return (uint32_t *) s; + } + return NULL; +} diff --git a/lib/unistr/u32-cmp.c b/lib/unistr/u32-cmp.c new file mode 100644 index 00000000..a273b10b --- /dev/null +++ b/lib/unistr/u32-cmp.c @@ -0,0 +1,40 @@ +/* Compare pieces of UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
/* Compare the first N units of the UTF-32 strings S1 and S2
   lexicographically.  Return -1 if S1 sorts first, 1 if S2 sorts first,
   and 0 if the two ranges are equal (including when N is 0).

   The units are compared as unsigned values instead of subtracting them
   as ints: the subtraction is only correct when both units fit in 31
   bits, and yields a wrong sign or signed overflow for unvalidated input
   with a unit >= 0x80000000.  Callers must rely on the sign of the result
   only, not its magnitude.  */
int
u32_cmp (const uint32_t *s1, const uint32_t *s2, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    {
      const uint32_t uc1 = s1[i];
      const uint32_t uc2 = s2[i];

      if (uc1 != uc2)
        return (uc1 < uc2 ? -1 : 1);
    }
  return 0;
}
If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#include "minmax.h" + +#define FUNC u32_cmp2 +#define UNIT uint32_t +#define U_CMP u32_cmp +#include "u-cmp2.h" diff --git a/lib/unistr/u32-cpy-alloc.c b/lib/unistr/u32-cpy-alloc.c new file mode 100644 index 00000000..f9c69608 --- /dev/null +++ b/lib/unistr/u32-cpy-alloc.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_cpy_alloc +#define UNIT uint32_t +#include "u-cpy-alloc.h" diff --git a/lib/unistr/u32-cpy.c b/lib/unistr/u32-cpy.c new file mode 100644 index 00000000..2493b65a --- /dev/null +++ b/lib/unistr/u32-cpy.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_cpy +#define UNIT uint32_t +#include "u-cpy.h" diff --git a/lib/unistr/u32-endswith.c b/lib/unistr/u32-endswith.c new file mode 100644 index 00000000..d77fa8cf --- /dev/null +++ b/lib/unistr/u32-endswith.c @@ -0,0 +1,27 @@ +/* Substring test for UTF-32 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_endswith +#define UNIT uint32_t +#define U_STRLEN u32_strlen +#define U_CMP u32_cmp +#include "u-endswith.h" diff --git a/lib/unistr/u32-mblen.c b/lib/unistr/u32-mblen.c new file mode 100644 index 00000000..a48c8b21 --- /dev/null +++ b/lib/unistr/u32-mblen.c @@ -0,0 +1,37 @@ +/* Look at first character in UTF-32 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. 
/* Look at the first character of the UTF-32 piece S of N units.
   Return 1 if it is a non-NUL unit, 0 if it is NUL, and -1 if N is 0.
   When CONFIG_UNICODE_SAFETY is enabled, -1 is also returned for a unit
   in the range 0xd800..0xdfff or at or above 0x110000.  */
int
u32_mblen (const uint32_t *s, size_t n)
{
  /* incomplete multibyte character */
  if (n == 0)
    return -1;

  {
    const uint32_t unit = s[0];

#if CONFIG_UNICODE_SAFETY
    /* invalid multibyte character */
    if (unit >= 0x110000 || (unit >= 0xd800 && unit < 0xe000))
      return -1;
#endif
    return unit != 0;
  }
}
If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +size_t +u32_mbsnlen (const uint32_t *s, size_t n) +{ + return n; +} diff --git a/lib/unistr/u32-mbtouc-unsafe.c b/lib/unistr/u32-mbtouc-unsafe.c new file mode 100644 index 00000000..4bd9e817 --- /dev/null +++ b/lib/unistr/u32-mbtouc-unsafe.c @@ -0,0 +1,48 @@ +/* Look at first character in UTF-32 string. + Copyright (C) 2002, 2006-2007, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u32_mbtouc_unsafe as 'extern', not + 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u32_mbtouc_unsafe (ucs4_t *puc, const uint32_t *s, size_t n) +{ + uint32_t c = *s; + +#if CONFIG_UNICODE_SAFETY + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) +#endif + *puc = c; +#if CONFIG_UNICODE_SAFETY + else + /* invalid multibyte character */ + *puc = 0xfffd; +#endif + return 1; +} + +#endif diff --git a/lib/unistr/u32-mbtouc.c b/lib/unistr/u32-mbtouc.c new file mode 100644 index 00000000..4eeef58c --- /dev/null +++ b/lib/unistr/u32-mbtouc.c @@ -0,0 +1,43 @@ +/* Look at first character in UTF-32 string. 
+ Copyright (C) 2002, 2006-2007, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u32_mbtouc as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u32_mbtouc (ucs4_t *puc, const uint32_t *s, size_t n) +{ + uint32_t c = *s; + + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) + *puc = c; + else + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u32-mbtoucr.c b/lib/unistr/u32-mbtoucr.c new file mode 100644 index 00000000..7d7993cc --- /dev/null +++ b/lib/unistr/u32-mbtoucr.c @@ -0,0 +1,39 @@ +/* Look at first character in UTF-32 string, returning an error code. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n) +{ + uint32_t c = *s; + + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) + { + *puc = c; + return 1; + } + else + { + /* invalid multibyte character */ + *puc = 0xfffd; + return -1; + } +} diff --git a/lib/unistr/u32-move.c b/lib/unistr/u32-move.c new file mode 100644 index 00000000..d6c08f4b --- /dev/null +++ b/lib/unistr/u32-move.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u32_move +#define UNIT uint32_t +#include "u-move.h" diff --git a/lib/unistr/u32-next.c b/lib/unistr/u32-next.c new file mode 100644 index 00000000..c529295c --- /dev/null +++ b/lib/unistr/u32-next.c @@ -0,0 +1,39 @@ +/* Iterate over next character in UTF-32 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint32_t * +u32_next (ucs4_t *puc, const uint32_t *s) +{ + int count; + + count = u32_strmbtouc (puc, s); + if (count > 0) + return s + count; + else + { +#if CONFIG_UNICODE_SAFETY + if (count < 0) + *puc = 0xfffd; +#endif + return NULL; + } +} diff --git a/lib/unistr/u32-prev.c b/lib/unistr/u32-prev.c new file mode 100644 index 00000000..bc20c93f --- /dev/null +++ b/lib/unistr/u32-prev.c @@ -0,0 +1,39 @@ +/* Iterate over previous character in UTF-32 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint32_t * +u32_prev (ucs4_t *puc, const uint32_t *s, const uint32_t *start) +{ + if (s != start) + { + uint32_t c_1 = s[-1]; + +#if CONFIG_UNICODE_SAFETY + if (c_1 < 0xd800 || (c_1 >= 0xe000 && c_1 < 0x110000)) +#endif + { + *puc = c_1; + return s - 1; + } + } + return NULL; +} diff --git a/lib/unistr/u32-set.c b/lib/unistr/u32-set.c new file mode 100644 index 00000000..de10e09e --- /dev/null +++ b/lib/unistr/u32-set.c @@ -0,0 +1,26 @@ +/* Fill UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u32_set +#define UNIT uint32_t +#define IS_SINGLE_UNIT(uc) (uc < 0xd800 || (uc >= 0xe000 && uc < 0x110000)) +#include "u-set.h" diff --git a/lib/unistr/u32-startswith.c b/lib/unistr/u32-startswith.c new file mode 100644 index 00000000..0f9a1859 --- /dev/null +++ b/lib/unistr/u32-startswith.c @@ -0,0 +1,25 @@ +/* Substring test for UTF-32 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_startswith +#define UNIT uint32_t +#include "u-startswith.h" diff --git a/lib/unistr/u32-stpcpy.c b/lib/unistr/u32-stpcpy.c new file mode 100644 index 00000000..5df82339 --- /dev/null +++ b/lib/unistr/u32-stpcpy.c @@ -0,0 +1,25 @@ +/* Copy UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_stpcpy +#define UNIT uint32_t +#include "u-stpcpy.h" diff --git a/lib/unistr/u32-stpncpy.c b/lib/unistr/u32-stpncpy.c new file mode 100644 index 00000000..19a08943 --- /dev/null +++ b/lib/unistr/u32-stpncpy.c @@ -0,0 +1,25 @@ +/* Copy UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_stpncpy +#define UNIT uint32_t +#include "u-stpncpy.h" diff --git a/lib/unistr/u32-strcat.c b/lib/unistr/u32-strcat.c new file mode 100644 index 00000000..cd4737fd --- /dev/null +++ b/lib/unistr/u32-strcat.c @@ -0,0 +1,26 @@ +/* Concatenate UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strcat +#define UNIT uint32_t +#define U_STRLEN u32_strlen +#include "u-strcat.h" diff --git a/lib/unistr/u32-strchr.c b/lib/unistr/u32-strchr.c new file mode 100644 index 00000000..57a3a3cf --- /dev/null +++ b/lib/unistr/u32-strchr.c @@ -0,0 +1,36 @@ +/* Search character in UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +uint32_t * +u32_strchr (const uint32_t *s, ucs4_t uc) +{ + for (;; s++) + { + if (*s == uc) + break; + if (*s == 0) + goto notfound; + } + return (uint32_t *) s; +notfound: + return NULL; +} diff --git a/lib/unistr/u32-strcmp.c b/lib/unistr/u32-strcmp.c new file mode 100644 index 00000000..b82d5ede --- /dev/null +++ b/lib/unistr/u32-strcmp.c @@ -0,0 +1,36 @@ +/* Compare UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u32_strcmp (const uint32_t *s1, const uint32_t *s2) +{ + for (;;) + { + uint32_t uc1 = *s1++; + uint32_t uc2 = *s2++; + if (uc1 != 0 && uc1 == uc2) + continue; + /* Note that uc1 and uc2 each have at most 31 bits. */ + return (int)uc1 - (int)uc2; + /* > 0 if uc1 > uc2, < 0 if uc1 < uc2. */ + } +} diff --git a/lib/unistr/u32-strcoll.c b/lib/unistr/u32-strcoll.c new file mode 100644 index 00000000..d5e1482c --- /dev/null +++ b/lib/unistr/u32-strcoll.c @@ -0,0 +1,33 @@ +/* Compare UTF-32 strings using the collation rules of the current locale. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "uniconv.h" + +#define FUNC u32_strcoll +#define UNIT uint32_t +#define U_STRCMP u32_strcmp +#define U_STRCONV_TO_LOCALE u32_strconv_to_locale +#include "u-strcoll.h" diff --git a/lib/unistr/u32-strcpy.c b/lib/unistr/u32-strcpy.c new file mode 100644 index 00000000..e5001005 --- /dev/null +++ b/lib/unistr/u32-strcpy.c @@ -0,0 +1,25 @@ +/* Copy UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u32_strcpy +#define UNIT uint32_t +#include "u-strcpy.h" diff --git a/lib/unistr/u32-strcspn.c b/lib/unistr/u32-strcspn.c new file mode 100644 index 00000000..f93ab2ef --- /dev/null +++ b/lib/unistr/u32-strcspn.c @@ -0,0 +1,51 @@ +/* Search for some characters in UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strcspn +#define UNIT uint32_t +#define U_STRLEN u32_strlen +#define U_STRCHR u32_strchr + +size_t +FUNC (const UNIT *str, const UNIT *reject) +{ + /* Optimize two cases. */ + if (reject[0] == 0) + return U_STRLEN (str); + if (reject[1] == 0) + { + ucs4_t uc = reject[0]; + const UNIT *ptr = str; + for (; *ptr != 0; ptr++) + if (*ptr == uc) + break; + return ptr - str; + } + /* General case. */ + { + const UNIT *ptr = str; + for (; *ptr != 0; ptr++) + if (U_STRCHR (reject, *ptr)) + break; + return ptr - str; + } +} diff --git a/lib/unistr/u32-strdup.c b/lib/unistr/u32-strdup.c new file mode 100644 index 00000000..66e9393e --- /dev/null +++ b/lib/unistr/u32-strdup.c @@ -0,0 +1,26 @@ +/* Copy UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. 
+ Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strdup +#define UNIT uint32_t +#define U_STRLEN u32_strlen +#include "u-strdup.h" diff --git a/lib/unistr/u32-strlen.c b/lib/unistr/u32-strlen.c new file mode 100644 index 00000000..eb64cc2a --- /dev/null +++ b/lib/unistr/u32-strlen.c @@ -0,0 +1,25 @@ +/* Determine length of UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
/* Look at the first character of the UTF-32 string S.
   Return 1 if it is a non-NUL unit and 0 if it is NUL.
   When CONFIG_UNICODE_SAFETY is enabled, return -1 for a unit in the
   range 0xd800..0xdfff or at or above 0x110000.  */
int
u32_strmblen (const uint32_t *s)
{
  const uint32_t unit = s[0];

#if CONFIG_UNICODE_SAFETY
  /* invalid multibyte character */
  if (unit >= 0x110000 || (unit >= 0xd800 && unit < 0xe000))
    return -1;
#endif
  return unit != 0;
}
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u32_strmbtouc (ucs4_t *puc, const uint32_t *s) +{ + uint32_t c = *s; + +#if CONFIG_UNICODE_SAFETY + if (c < 0xd800 || (c >= 0xe000 && c < 0x110000)) +#endif + { + *puc = c; + return (c != 0 ? 1 : 0); + } +#if CONFIG_UNICODE_SAFETY + /* invalid multibyte character */ + return -1; +#endif +} diff --git a/lib/unistr/u32-strncat.c b/lib/unistr/u32-strncat.c new file mode 100644 index 00000000..86f895b5 --- /dev/null +++ b/lib/unistr/u32-strncat.c @@ -0,0 +1,26 @@ +/* Concatenate UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
/* Compare at most N units of the NUL-terminated UTF-32 strings S1 and S2.
   Return 0 if they are equal up to their terminators or up to N units,
   whichever comes first.  Otherwise return a negative value if the first
   differing unit of S1 is smaller than the corresponding unit of S2, and
   a positive value if it is larger.
   Only the sign of a non-zero result is meaningful.  */
int
u32_strncmp (const uint32_t *s1, const uint32_t *s2, size_t n)
{
  for (; n > 0; s1++, s2++, n--)
    {
      uint32_t uc1 = *s1;
      uint32_t uc2 = *s2;

      if (uc1 != uc2)
        /* Compare the units as unsigned values.  Subtracting them after a
           cast to 'int' gives the wrong sign, or overflows, as soon as
           one of them does not fit in 31 bits, which can happen with
           ill-formed input.  */
        return (uc1 < uc2 ? -1 : 1);
      if (uc1 == 0)
        /* Both strings ended here; units after the NUL are ignored.  */
        break;
    }
  return 0;
}
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strncpy +#define UNIT uint32_t +#include "u-strncpy.h" diff --git a/lib/unistr/u32-strnlen.c b/lib/unistr/u32-strnlen.c new file mode 100644 index 00000000..2ead8f26 --- /dev/null +++ b/lib/unistr/u32-strnlen.c @@ -0,0 +1,25 @@ +/* Determine bounded length of UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u32_strnlen +#define UNIT uint32_t +#include "u-strnlen.h" diff --git a/lib/unistr/u32-strpbrk.c b/lib/unistr/u32-strpbrk.c new file mode 100644 index 00000000..9579a1b0 --- /dev/null +++ b/lib/unistr/u32-strpbrk.c @@ -0,0 +1,50 @@ +/* Search for some characters in UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strpbrk +#define UNIT uint32_t +#define U_STRCHR u32_strchr + +UNIT * +FUNC (const UNIT *str, const UNIT *accept) +{ + /* Optimize two cases. */ + if (accept[0] == 0) + return NULL; + if (accept[1] == 0) + { + ucs4_t uc = accept[0]; + const UNIT *ptr = str; + for (; *ptr != 0; ptr++) + if (*ptr == uc) + return (UNIT *) ptr; + return NULL; + } + /* General case. */ + { + const UNIT *ptr = str; + for (; *ptr != 0; ptr++) + if (U_STRCHR (accept, *ptr)) + return (UNIT *) ptr; + return NULL; + } +} diff --git a/lib/unistr/u32-strrchr.c b/lib/unistr/u32-strrchr.c new file mode 100644 index 00000000..b4641f33 --- /dev/null +++ b/lib/unistr/u32-strrchr.c @@ -0,0 +1,38 @@ +/* Search character in UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. 
+ Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint32_t * +u32_strrchr (const uint32_t *s, ucs4_t uc) +{ + /* Calling u32_strlen and then searching from the other end would cause more + memory accesses. Avoid that, at the cost of a few more comparisons. */ + uint32_t *result = NULL; + + for (;; s++) + { + if (*s == uc) + result = (uint32_t *) s; + if (*s == 0) + break; + } + return result; +} diff --git a/lib/unistr/u32-strspn.c b/lib/unistr/u32-strspn.c new file mode 100644 index 00000000..6f802029 --- /dev/null +++ b/lib/unistr/u32-strspn.c @@ -0,0 +1,50 @@ +/* Search for some characters in UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strspn +#define UNIT uint32_t +#define U_STRCHR u32_strchr + +size_t +FUNC (const UNIT *str, const UNIT *accept) +{ + /* Optimize two cases. */ + if (accept[0] == 0) + return 0; + if (accept[1] == 0) + { + ucs4_t uc = accept[0]; + const UNIT *ptr = str; + for (; *ptr != 0; ptr++) + if (*ptr != uc) + break; + return ptr - str; + } + /* General case. */ + { + const UNIT *ptr = str; + for (; *ptr != 0; ptr++) + if (!U_STRCHR (accept, *ptr)) + break; + return ptr - str; + } +} diff --git a/lib/unistr/u32-strstr.c b/lib/unistr/u32-strstr.c new file mode 100644 index 00000000..285746de --- /dev/null +++ b/lib/unistr/u32-strstr.c @@ -0,0 +1,26 @@ +/* Substring test for UTF-32 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u32_strstr +#define UNIT uint32_t +#define U_STRCHR u32_strchr +#include "u-strstr.h" diff --git a/lib/unistr/u32-strtok.c b/lib/unistr/u32-strtok.c new file mode 100644 index 00000000..3f219192 --- /dev/null +++ b/lib/unistr/u32-strtok.c @@ -0,0 +1,27 @@ +/* Tokenize UTF-32 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_strtok +#define UNIT uint32_t +#define U_STRSPN u32_strspn +#define U_STRPBRK u32_strpbrk +#include "u-strtok.h" diff --git a/lib/unistr/u32-to-u16.c b/lib/unistr/u32-to-u16.c new file mode 100644 index 00000000..3a32162c --- /dev/null +++ b/lib/unistr/u32-to-u16.c @@ -0,0 +1,130 @@ +/* Convert UTF-32 string to UTF-16 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_to_u16 +#define SRC_UNIT uint32_t +#define DST_UNIT uint16_t + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +DST_UNIT * +FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp) +{ + const SRC_UNIT *s_end = s + n; + /* Output string accumulator. */ + DST_UNIT *result; + size_t allocated; + size_t length; + + if (resultbuf != NULL) + { + result = resultbuf; + allocated = *lengthp; + } + else + { + result = NULL; + allocated = 0; + } + length = 0; + /* Invariants: + result is either == resultbuf or == NULL or malloc-allocated. + If length > 0, then result != NULL. */ + + while (s < s_end) + { + ucs4_t uc; + int count; + + /* Fetch a Unicode character from the input string. */ + uc = *s++; + /* No need to call the safe variant u32_mbtouc, because + u16_uctomb will verify uc anyway. */ + + /* Store it in the output string. */ + count = u16_uctomb (result + length, uc, allocated - length); + if (count == -1) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + if (count == -2) + { + DST_UNIT *memory; + + allocated = (allocated > 0 ? 
2 * allocated : 12); + if (length + 2 > allocated) + allocated = length + 2; + if (result == resultbuf || result == NULL) + memory = (DST_UNIT *) malloc (allocated * sizeof (DST_UNIT)); + else + memory = + (DST_UNIT *) realloc (result, allocated * sizeof (DST_UNIT)); + + if (memory == NULL) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = ENOMEM; + return NULL; + } + if (result == resultbuf && length > 0) + memcpy ((char *) memory, (char *) result, + length * sizeof (DST_UNIT)); + result = memory; + count = u16_uctomb (result + length, uc, allocated - length); + if (count < 0) + abort (); + } + length += count; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (DST_UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + return NULL; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + DST_UNIT *memory; + + memory = (DST_UNIT *) realloc (result, length * sizeof (DST_UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; +} diff --git a/lib/unistr/u32-to-u8.c b/lib/unistr/u32-to-u8.c new file mode 100644 index 00000000..4dce896b --- /dev/null +++ b/lib/unistr/u32-to-u8.c @@ -0,0 +1,130 @@ +/* Convert UTF-32 string to UTF-8 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u32_to_u8 +#define SRC_UNIT uint32_t +#define DST_UNIT uint8_t + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +DST_UNIT * +FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp) +{ + const SRC_UNIT *s_end = s + n; + /* Output string accumulator. */ + DST_UNIT *result; + size_t allocated; + size_t length; + + if (resultbuf != NULL) + { + result = resultbuf; + allocated = *lengthp; + } + else + { + result = NULL; + allocated = 0; + } + length = 0; + /* Invariants: + result is either == resultbuf or == NULL or malloc-allocated. + If length > 0, then result != NULL. */ + + while (s < s_end) + { + ucs4_t uc; + int count; + + /* Fetch a Unicode character from the input string. */ + uc = *s++; + /* No need to call the safe variant u32_mbtouc, because + u8_uctomb will verify uc anyway. */ + + /* Store it in the output string. */ + count = u8_uctomb (result + length, uc, allocated - length); + if (count == -1) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + if (count == -2) + { + DST_UNIT *memory; + + allocated = (allocated > 0 ? 
2 * allocated : 12); + if (length + 6 > allocated) + allocated = length + 6; + if (result == resultbuf || result == NULL) + memory = (DST_UNIT *) malloc (allocated * sizeof (DST_UNIT)); + else + memory = + (DST_UNIT *) realloc (result, allocated * sizeof (DST_UNIT)); + + if (memory == NULL) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = ENOMEM; + return NULL; + } + if (result == resultbuf && length > 0) + memcpy ((char *) memory, (char *) result, + length * sizeof (DST_UNIT)); + result = memory; + count = u8_uctomb (result + length, uc, allocated - length); + if (count < 0) + abort (); + } + length += count; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (DST_UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + return NULL; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + DST_UNIT *memory; + + memory = (DST_UNIT *) realloc (result, length * sizeof (DST_UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; +} diff --git a/lib/unistr/u32-uctomb.c b/lib/unistr/u32-uctomb.c new file mode 100644 index 00000000..583b3b66 --- /dev/null +++ b/lib/unistr/u32-uctomb.c @@ -0,0 +1,47 @@ +/* Store a character in UTF-32 string. + Copyright (C) 2002, 2005-2006, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u32_uctomb as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u32_uctomb (uint32_t *s, ucs4_t uc, int n) +{ + if (uc < 0xd800 || (uc >= 0xe000 && uc < 0x110000)) + { + if (n > 0) + { + *s = uc; + return 1; + } + else + return -2; + } + else + return -1; +} + +#endif diff --git a/lib/unistr/u8-check.c b/lib/unistr/u8-check.c new file mode 100644 index 00000000..53897fc5 --- /dev/null +++ b/lib/unistr/u8-check.c @@ -0,0 +1,105 @@ +/* Check UTF-8 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint8_t * +u8_check (const uint8_t *s, size_t n) +{ + const uint8_t *s_end = s + n; + + while (s < s_end) + { + /* Keep in sync with unistr.h and utf8-ucs4.c. 
/* Check whether the N bytes at S form well-formed UTF-8.
   Returns NULL if they do, otherwise a pointer to the first byte of the
   first invalid or incomplete sequence.  Overlong forms, encoded
   surrogates, values above 0x10FFFF and 5- or 6-byte forms are all
   rejected.  */
const uint8_t *
u8_check (const uint8_t *s, size_t n)
{
  const uint8_t *s_end = s + n;

  while (s < s_end)
    {
      /* Keep in sync with unistr.h and utf8-ucs4.c.  */
      const uint8_t c = *s;
      /* Bytes left, including C.  Comparing this count instead of forming
         s + k keeps every pointer inside the buffer; s + 4 with fewer than
         four bytes left would point beyond one-past-the-end.  */
      const size_t avail = (size_t) (s_end - s);

      if (c < 0x80)
        {
          s++;
          continue;
        }
      if (c >= 0xc2)
        {
          if (c < 0xe0)
            {
              if (avail >= 2
                  && (s[1] ^ 0x80) < 0x40)
                {
                  s += 2;
                  continue;
                }
            }
          else if (c < 0xf0)
            {
              if (avail >= 3
                  && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
                  /* 0xe0 must be followed by 0xa0 or more (no overlong).  */
                  && (c >= 0xe1 || s[1] >= 0xa0)
                  /* 0xed must be followed by less than 0xa0 (no
                     surrogates).  */
                  && (c != 0xed || s[1] < 0xa0))
                {
                  s += 3;
                  continue;
                }
            }
          else if (c < 0xf8)
            {
              if (avail >= 4
                  && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
                  && (s[3] ^ 0x80) < 0x40
                  /* 0xf0 must be followed by 0x90 or more (no overlong).  */
                  && (c >= 0xf1 || s[1] >= 0x90)
                  /* Nothing above 0x10FFFF.  */
                  && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
                {
                  s += 4;
                  continue;
                }
            }
        }
      /* invalid or incomplete multibyte character */
      return s;
    }
  return NULL;
}
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint8_t * +u8_chr (const uint8_t *s, size_t n, ucs4_t uc) +{ + uint8_t c[6]; + + if (uc < 0x80) + { + uint8_t c0 = uc; + + for (; n > 0; s++, n--) + { + if (*s == c0) + return (uint8_t *) s; + } + } + else + switch (u8_uctomb_aux (c, uc, 6)) + { + case 2: + if (n > 1) + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + + for (n--; n > 0; s++, n--) + { + if (*s == c0 && s[1] == c1) + return (uint8_t *) s; + } + } + break; + + case 3: + if (n > 2) + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + uint8_t c2 = c[2]; + + for (n -= 2; n > 0; s++, n--) + { + if (*s == c0 && s[1] == c1 && s[2] == c2) + return (uint8_t *) s; + } + } + break; + + case 4: + if (n > 3) + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + uint8_t c2 = c[2]; + uint8_t c3 = c[3]; + + for (n -= 3; n > 0; s++, n--) + { + if (*s == c0 && s[1] == c1 && s[2] == c2 && s[3] == c3) + return (uint8_t *) s; + } + } + break; + } + return NULL; +} diff --git a/lib/unistr/u8-cmp.c b/lib/unistr/u8-cmp.c new file mode 100644 index 00000000..77b2402d --- /dev/null +++ b/lib/unistr/u8-cmp.c @@ -0,0 +1,30 @@ +/* Compare pieces of UTF-8 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
#include <string.h>

/* Compare the first N bytes of S1 and S2.
   Returns the result of memcmp: negative, zero or positive.  */
int
u8_cmp (const uint8_t *s1, const uint8_t *s2, size_t n)
{
  /* Use the fact that the UTF-8 encoding preserves lexicographic order, so
     a plain bytewise comparison is all that is needed.  */
  const void *lhs = s1;
  const void *rhs = s2;

  return memcmp (lhs, rhs, n);
}
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_cpy_alloc +#define UNIT uint8_t +#include "u-cpy-alloc.h" diff --git a/lib/unistr/u8-cpy.c b/lib/unistr/u8-cpy.c new file mode 100644 index 00000000..bf3a55bc --- /dev/null +++ b/lib/unistr/u8-cpy.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-8 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u8_cpy +#define UNIT uint8_t +#include "u-cpy.h" diff --git a/lib/unistr/u8-endswith.c b/lib/unistr/u8-endswith.c new file mode 100644 index 00000000..e40613aa --- /dev/null +++ b/lib/unistr/u8-endswith.c @@ -0,0 +1,27 @@ +/* Substring test for UTF-8 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_endswith +#define UNIT uint8_t +#define U_STRLEN u8_strlen +#define U_CMP u8_cmp +#include "u-endswith.h" diff --git a/lib/unistr/u8-mblen.c b/lib/unistr/u8-mblen.c new file mode 100644 index 00000000..1981befe --- /dev/null +++ b/lib/unistr/u8-mblen.c @@ -0,0 +1,98 @@ +/* Look at first character in UTF-8 string. + Copyright (C) 1999-2000, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
/* Look at the first character of the N bytes at S.
   Returns its length in bytes (2, 3 or 4 for multibyte forms, 1 for a
   non-NUL single byte below 0x80), 0 if it is the NUL byte, or -1 if the
   sequence is invalid or incomplete (including N == 0).
   The continuation-byte, overlong, surrogate and range checks are compiled
   in only when CONFIG_UNICODE_SAFETY is enabled; otherwise only the lead
   byte and N are examined.  5- and 6-byte forms are always rejected.  */
int
u8_mblen (const uint8_t *s, size_t n)
{
  uint8_t lead;

  if (n == 0)
    return -1;

  /* Keep in sync with unistr.h and utf8-ucs4.c.  */
  lead = s[0];

  if (lead < 0x80)
    return lead == 0 ? 0 : 1;

  /* 0x80..0xc1 cannot start a character; 0xf8 and above are not accepted.  */
  if (lead < 0xc2 || lead >= 0xf8)
    return -1;

  if (lead < 0xe0)
    {
      if (n < 2)
        return -1;
#if CONFIG_UNICODE_SAFETY
      if ((s[1] ^ 0x80) >= 0x40)
        return -1;
#endif
      return 2;
    }

  if (lead < 0xf0)
    {
      if (n < 3)
        return -1;
#if CONFIG_UNICODE_SAFETY
      if ((s[1] ^ 0x80) >= 0x40 || (s[2] ^ 0x80) >= 0x40)
        return -1;
      /* Overlong form.  */
      if (lead == 0xe0 && s[1] < 0xa0)
        return -1;
      /* Encoded surrogate.  */
      if (lead == 0xed && s[1] >= 0xa0)
        return -1;
#endif
      return 3;
    }

  /* 0xf0 <= lead < 0xf8 */
  if (n < 4)
    return -1;
#if CONFIG_UNICODE_SAFETY
  if ((s[1] ^ 0x80) >= 0x40 || (s[2] ^ 0x80) >= 0x40
      || (s[3] ^ 0x80) >= 0x40)
    return -1;
  /* Overlong form.  */
  if (lead == 0xf0 && s[1] < 0x90)
    return -1;
  /* Beyond 0x10FFFF.  */
  if (lead > 0xf4 || (lead == 0xf4 && s[1] >= 0x90))
    return -1;
#endif
  return 4;
}
/* Count the characters in the N bytes at S.
   Each step asks u8_mblen for the length of the next character; whenever it
   reports 0 or a negative value, one byte is consumed and still counted as
   one character.  */
size_t
u8_mbsnlen (const uint8_t *s, size_t n)
{
  size_t total;

  for (total = 0; n > 0; total++)
    {
      int step = u8_mblen (s, n);

      if (step < 1)
        step = 1;
      s += step;
      n -= step;
    }
  return total;
}
#if defined IN_LIBUNISTRING || HAVE_INLINE

/* Decode the multibyte UTF-8 character at the start of the N bytes at S.
   On success the character is stored in *PUC and its length (2, 3 or 4) is
   returned.  If the lead byte announces more bytes than N provides, *PUC is
   set to 0xfffd and N is returned.  For every other invalid input *PUC is
   set to 0xfffd and 1 is returned; that includes every lead byte below
   0xc2, so single-byte characters are not decoded here (presumably the
   caller handles them itself -- confirm against unistr.h).  */
int
u8_mbtouc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
{
  const uint8_t lead = s[0];
  size_t need;

  /* Continuation bytes, overlong 2-byte leads and 5-/6-byte leads.  */
  if (lead < 0xc2 || lead >= 0xf8)
    goto invalid;

  need = (lead < 0xe0 ? 2 : lead < 0xf0 ? 3 : 4);

  if (n < need)
    {
      /* incomplete multibyte character */
      *puc = 0xfffd;
      return n;
    }

  if (need == 2)
    {
      if ((s[1] ^ 0x80) < 0x40)
        {
          *puc = ((unsigned int) (lead & 0x1f) << 6)
                 | (unsigned int) (s[1] ^ 0x80);
          return 2;
        }
    }
  else if (need == 3)
    {
      if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
          /* Reject overlong forms and encoded surrogates.  */
          && !(lead == 0xe0 && s[1] < 0xa0)
          && !(lead == 0xed && s[1] >= 0xa0))
        {
          *puc = ((unsigned int) (lead & 0x0f) << 12)
                 | ((unsigned int) (s[1] ^ 0x80) << 6)
                 | (unsigned int) (s[2] ^ 0x80);
          return 3;
        }
    }
  else
    {
      if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
          && (s[3] ^ 0x80) < 0x40
          /* Reject overlong forms and values above 0x10FFFF.  */
          && !(lead == 0xf0 && s[1] < 0x90)
          && (lead < 0xf4 || (lead == 0xf4 && s[1] < 0x90)))
        {
          *puc = ((unsigned int) (lead & 0x07) << 18)
                 | ((unsigned int) (s[1] ^ 0x80) << 12)
                 | ((unsigned int) (s[2] ^ 0x80) << 6)
                 | (unsigned int) (s[3] ^ 0x80);
          return 4;
        }
    }

 invalid:
  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}

#endif
/* lib/unistr/u8-mbtouc-unsafe-aux.c -- conversion UTF-8 to UCS-4.  */

#if defined IN_LIBUNISTRING || HAVE_INLINE

/* Decode the multi-unit UTF-8 sequence starting at S, looking at no more
   than N units, and store the resulting code point in *PUC.

   The first unit selects the expected length (0xc2..0xdf: 2,
   0xe0..0xef: 3, 0xf0..0xf7: 4).  Any other first unit, ASCII included,
   is reported as invalid.

   The units after the first are validated only when
   CONFIG_UNICODE_SAFETY is true.  Otherwise they are merged into the
   result unchecked as soon as enough units are available.

   Return value:
     2, 3 or 4  a sequence of that length was decoded;
     N          fewer units are available than the first unit announces;
                *PUC is set to U+FFFD;
     1          invalid first unit, or (CONFIG_UNICODE_SAFETY only) a
                malformed sequence; *PUC is set to U+FFFD.

   S[0] is read unconditionally.  The translation unit is expected to
   include <config.h> and "unistr.h" ahead of this definition.  */
int
u8_mbtouc_unsafe_aux (ucs4_t *puc, const uint8_t *s, size_t n)
{
  const uint8_t lead = s[0];
  size_t need;

  if (lead < 0xc2 || lead >= 0xf8)
    goto invalid;

  need = (lead < 0xe0 ? 2 : lead < 0xf0 ? 3 : 4);
  if (n < need)
    {
      /* incomplete multibyte character */
      *puc = 0xfffd;
      return n;
    }

  switch (need)
    {
    case 2:
#if CONFIG_UNICODE_SAFETY
      if (!((s[1] ^ 0x80) < 0x40))
        break;
#endif
      *puc = ((unsigned int) (lead & 0x1f) << 6)
             | (unsigned int) (s[1] ^ 0x80);
      return 2;

    case 3:
#if CONFIG_UNICODE_SAFETY
      if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
            && (lead >= 0xe1 || s[1] >= 0xa0)
            && (lead != 0xed || s[1] < 0xa0)))
        break;
#endif
      *puc = ((unsigned int) (lead & 0x0f) << 12)
             | ((unsigned int) (s[1] ^ 0x80) << 6)
             | (unsigned int) (s[2] ^ 0x80);
      return 3;

    default:
#if CONFIG_UNICODE_SAFETY
      if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
            && (s[3] ^ 0x80) < 0x40
            && (lead >= 0xf1 || s[1] >= 0x90)
            && (lead < 0xf4 || (lead == 0xf4 && s[1] < 0x90))))
        break;
#endif
      *puc = ((unsigned int) (lead & 0x07) << 18)
             | ((unsigned int) (s[1] ^ 0x80) << 12)
             | ((unsigned int) (s[2] ^ 0x80) << 6)
             | (unsigned int) (s[3] ^ 0x80);
      return 4;
    }

 invalid:
  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}

#endif
*/ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u8_mbtouc_unsafe (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c = *s; + + if (c < 0x80) + { + *puc = c; + return 1; + } + else if (c >= 0xc2) + { + if (c < 0xe0) + { + if (n >= 2) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40) +#endif + { + *puc = ((unsigned int) (c & 0x1f) << 6) + | (unsigned int) (s[1] ^ 0x80); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xf0) + { + if (n >= 3) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) +#endif + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xf8) + { + if (n >= 4) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) +#if 1 + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) +#endif + ) +#endif + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } +#if 0 + else if (c < 0xfc) + { + if (n >= 5) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (c >= 0xf9 || s[1] >= 0x88)) +#endif + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) 
<< 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xfe) + { + if (n >= 6) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (s[5] ^ 0x80) < 0x40 + && (c >= 0xfd || s[1] >= 0x84)) +#endif + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } +#endif + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u8-mbtouc.c b/lib/unistr/u8-mbtouc.c new file mode 100644 index 00000000..ff624f17 --- /dev/null +++ b/lib/unistr/u8-mbtouc.c @@ -0,0 +1,168 @@ +/* Look at first character in UTF-8 string. + Copyright (C) 1999-2002, 2006-2007, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u8_mbtouc as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u8_mbtouc (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c = *s; + + if (c < 0x80) + { + *puc = c; + return 1; + } + else if (c >= 0xc2) + { + if (c < 0xe0) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x1f) << 6) + | (unsigned int) (s[1] ^ 0x80); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xf0) + { + if (n >= 3) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xf8) + { + if (n >= 4) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) +#if 1 + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) +#endif + ) + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } +#if 0 + else if (c < 0xfc) + { + if (n >= 5) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (c >= 0xf9 || s[1] >= 0x88)) + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | 
(unsigned int) (s[4] ^ 0x80); + return 5; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xfe) + { + if (n >= 6) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (s[5] ^ 0x80) < 0x40 + && (c >= 0xfd || s[1] >= 0x84)) + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } +#endif + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +} + +#endif diff --git a/lib/unistr/u8-mbtoucr.c b/lib/unistr/u8-mbtoucr.c new file mode 100644 index 00000000..dd833524 --- /dev/null +++ b/lib/unistr/u8-mbtoucr.c @@ -0,0 +1,285 @@ +/* Look at first character in UTF-8 string, returning an error code. + Copyright (C) 1999-2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2001. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +int +u8_mbtoucr (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c = *s; + + if (c < 0x80) + { + *puc = c; + return 1; + } + else if (c >= 0xc2) + { + if (c < 0xe0) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x1f) << 6) + | (unsigned int) (s[1] ^ 0x80); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + else if (c < 0xf0) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + if (n >= 3) + { + if ((s[2] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + else if (c < 0xf8) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) +#if 1 + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) +#endif + ) + { + if (n >= 3) + { + if ((s[2] ^ 0x80) < 0x40) + { + if (n >= 4) + { + if ((s[3] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } +#if 0 + else if (c < 0xfc) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xf9 || s[1] >= 0x88)) + { + 
if (n >= 3) + { + if ((s[2] ^ 0x80) < 0x40) + { + if (n >= 4) + { + if ((s[3] ^ 0x80) < 0x40) + { + if (n >= 5) + { + if ((s[4] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + else if (c < 0xfe) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40 + && (c >= 0xfd || s[1] >= 0x84)) + { + if (n >= 3) + { + if ((s[2] ^ 0x80) < 0x40) + { + if (n >= 4) + { + if ((s[3] ^ 0x80) < 0x40) + { + if (n >= 5) + { + if ((s[4] ^ 0x80) < 0x40) + { + if (n >= 6) + { + if ((s[5] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } + /* invalid multibyte character */ + } + else + 
{ + /* incomplete multibyte character */ + *puc = 0xfffd; + return -2; + } + } +#endif + } + /* invalid multibyte character */ + *puc = 0xfffd; + return -1; +} diff --git a/lib/unistr/u8-move.c b/lib/unistr/u8-move.c new file mode 100644 index 00000000..5a30be8c --- /dev/null +++ b/lib/unistr/u8-move.c @@ -0,0 +1,25 @@ +/* Copy piece of UTF-8 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_move +#define UNIT uint8_t +#include "u-move.h" diff --git a/lib/unistr/u8-next.c b/lib/unistr/u8-next.c new file mode 100644 index 00000000..ecf4f80a --- /dev/null +++ b/lib/unistr/u8-next.c @@ -0,0 +1,37 @@ +/* Iterate over next character in UTF-8 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint8_t * +u8_next (ucs4_t *puc, const uint8_t *s) +{ + int count; + + count = u8_strmbtouc (puc, s); + if (count > 0) + return s + count; + else + { + if (count < 0) + *puc = 0xfffd; + return NULL; + } +} diff --git a/lib/unistr/u8-prev.c b/lib/unistr/u8-prev.c new file mode 100644 index 00000000..245d22ff --- /dev/null +++ b/lib/unistr/u8-prev.c @@ -0,0 +1,93 @@ +/* Iterate over previous character in UTF-8 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +const uint8_t * +u8_prev (ucs4_t *puc, const uint8_t *s, const uint8_t *start) +{ + /* Keep in sync with unistr.h and utf8-ucs4.c. 
*/ + if (s != start) + { + uint8_t c_1 = s[-1]; + + if (c_1 < 0x80) + { + *puc = c_1; + return s - 1; + } +#if CONFIG_UNICODE_SAFETY + if ((c_1 ^ 0x80) < 0x40) +#endif + if (s - 1 != start) + { + uint8_t c_2 = s[-2]; + + if (c_2 >= 0xc2 && c_2 < 0xe0) + { + *puc = ((unsigned int) (c_2 & 0x1f) << 6) + | (unsigned int) (c_1 ^ 0x80); + return s - 2; + } +#if CONFIG_UNICODE_SAFETY + if ((c_2 ^ 0x80) < 0x40) +#endif + if (s - 2 != start) + { + uint8_t c_3 = s[-3]; + + if (c_3 >= 0xe0 && c_3 < 0xf0 +#if CONFIG_UNICODE_SAFETY + && (c_3 >= 0xe1 || c_2 >= 0xa0) + && (c_3 != 0xed || c_2 < 0xa0) +#endif + ) + { + *puc = ((unsigned int) (c_3 & 0x0f) << 12) + | ((unsigned int) (c_2 ^ 0x80) << 6) + | (unsigned int) (c_1 ^ 0x80); + return s - 3; + } +#if CONFIG_UNICODE_SAFETY + if ((c_3 ^ 0x80) < 0x40) +#endif + if (s - 3 != start) + { + uint8_t c_4 = s[-4]; + + if (c_4 >= 0xf0 && c_4 < 0xf8 +#if CONFIG_UNICODE_SAFETY + && (c_4 >= 0xf1 || c_3 >= 0x90) + && (c_4 < 0xf4 || (c_4 == 0xf4 && c_3 < 0x90)) +#endif + ) + { + *puc = ((unsigned int) (c_4 & 0x07) << 18) + | ((unsigned int) (c_3 ^ 0x80) << 12) + | ((unsigned int) (c_2 ^ 0x80) << 6) + | (unsigned int) (c_1 ^ 0x80); + return s - 4; + } + } + } + } + } + return NULL; +} diff --git a/lib/unistr/u8-set.c b/lib/unistr/u8-set.c new file mode 100644 index 00000000..5bb4fd7a --- /dev/null +++ b/lib/unistr/u8-set.c @@ -0,0 +1,44 @@ +/* Fill UTF-8 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_set +#define UNIT uint8_t +#define IS_SINGLE_UNIT(uc) (uc < 0x80) + +#include <errno.h> +#include <string.h> + +UNIT * +FUNC (UNIT *s, ucs4_t uc, size_t n) +{ + if (n > 0) + { + if (IS_SINGLE_UNIT (uc)) + memset ((char *) s, uc, n); + else + { + errno = EILSEQ; + return NULL; + } + } + return s; +} diff --git a/lib/unistr/u8-startswith.c b/lib/unistr/u8-startswith.c new file mode 100644 index 00000000..671e3d9f --- /dev/null +++ b/lib/unistr/u8-startswith.c @@ -0,0 +1,25 @@ +/* Substring test for UTF-8 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_startswith +#define UNIT uint8_t +#include "u-startswith.h" diff --git a/lib/unistr/u8-stpcpy.c b/lib/unistr/u8-stpcpy.c new file mode 100644 index 00000000..a5c15c5d --- /dev/null +++ b/lib/unistr/u8-stpcpy.c @@ -0,0 +1,44 @@ +/* Copy UTF-8 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. 
+ Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Ensure stpcpy() gets declared. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#if HAVE_STPCPY + +# include <string.h> + +uint8_t * +u8_stpcpy (uint8_t *dest, const uint8_t *src) +{ + return (uint8_t *) stpcpy ((char *) dest, (const char *) src); +} + +#else + +# define FUNC u8_stpcpy +# define UNIT uint8_t +# include "u-stpcpy.h" + +#endif diff --git a/lib/unistr/u8-stpncpy.c b/lib/unistr/u8-stpncpy.c new file mode 100644 index 00000000..87f254d7 --- /dev/null +++ b/lib/unistr/u8-stpncpy.c @@ -0,0 +1,44 @@ +/* Copy UTF-8 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
/* lib/unistr/u8-stpncpy.c -- copy UTF-8 string.

   The translation unit is expected to include <config.h> and "unistr.h"
   after the _GNU_SOURCE definition below and ahead of the code.  */

/* Ensure stpncpy() gets declared.  */
#ifndef _GNU_SOURCE
# define _GNU_SOURCE 1
#endif

#if __GLIBC__ >= 2

# include <string.h>

/* Copy at most N units of the NUL-terminated string SRC to DEST by
   handing the arguments to glibc's stpncpy, and return its result
   converted back to a unit pointer.  */
uint8_t *
u8_stpncpy (uint8_t *dest, const uint8_t *src, size_t n)
{
  char *end = stpncpy ((char *) dest, (const char *) src, n);

  return (uint8_t *) end;
}

#else

/* Not glibc 2 or newer: instantiate the generic template instead.  */
# define FUNC u8_stpncpy
# define UNIT uint8_t
# include "u-stpncpy.h"

#endif

/* lib/unistr/u8-strcat.c -- concatenate UTF-8 strings.

   The translation unit is expected to include <config.h> and "unistr.h"
   ahead of this definition.  */

#include <string.h>

/* Append the NUL-terminated string SRC to the NUL-terminated string in
   DEST by handing both to strcat, and return its result (DEST)
   converted back to a unit pointer.  */
uint8_t *
u8_strcat (uint8_t *dest, const uint8_t *src)
{
  char *joined = strcat ((char *) dest, (const char *) src);

  return (uint8_t *) joined;
}
+ Copyright (C) 1999, 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint8_t * +u8_strchr (const uint8_t *s, ucs4_t uc) +{ + uint8_t c[6]; + + if (uc < 0x80) + { + uint8_t c0 = uc; + + for (;; s++) + { + if (*s == c0) + break; + if (*s == 0) + goto notfound; + } + return (uint8_t *) s; + } + else + switch (u8_uctomb_aux (c, uc, 6)) + { + case 2: + if (*s == 0) + goto notfound; + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + + for (;; s++) + { + if (s[1] == 0) + goto notfound; + if (*s == c0 && s[1] == c1) + break; + } + return (uint8_t *) s; + } + + case 3: + if (*s == 0 || s[1] == 0) + goto notfound; + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + uint8_t c2 = c[2]; + + for (;; s++) + { + if (s[2] == 0) + goto notfound; + if (*s == c0 && s[1] == c1 && s[2] == c2) + break; + } + return (uint8_t *) s; + } + + case 4: + if (*s == 0 || s[1] == 0 || s[2] == 0) + goto notfound; + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + uint8_t c2 = c[2]; + uint8_t c3 = c[3]; + + for (;; s++) + { + if (s[3] == 0) + goto notfound; + if (*s == c0 && s[1] == c1 && s[2] == c2 && s[3] == c3) + break; + } + return (uint8_t *) s; + } + } +notfound: + return NULL; +} diff --git a/lib/unistr/u8-strcmp.c 
/* lib/unistr/u8-strcmp.c -- compare UTF-8 strings.

   The translation unit is expected to include <config.h> and "unistr.h"
   ahead of this definition.  */

#include <string.h>

/* Compare the NUL-terminated strings S1 and S2 and return the result of
   strcmp on them: negative, zero or positive.

   Comparing units is sufficient because the UTF-8 encoding preserves
   lexicographic order.  */
int
u8_strcmp (const uint8_t *s1, const uint8_t *s2)
{
  const char *left = (const char *) s1;
  const char *right = (const char *) s2;

  return strcmp (left, right);
}
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include "uniconv.h" + +#define FUNC u8_strcoll +#define UNIT uint8_t +#define U_STRCMP u8_strcmp +#define U_STRCONV_TO_LOCALE u8_strconv_to_locale +#include "u-strcoll.h" diff --git a/lib/unistr/u8-strcpy.c b/lib/unistr/u8-strcpy.c new file mode 100644 index 00000000..b93c8cb4 --- /dev/null +++ b/lib/unistr/u8-strcpy.c @@ -0,0 +1,29 @@ +/* Copy UTF-8 string. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
#include <string.h>

/* Copy the NUL-terminated UTF-8 string SRC, including its terminator, to
   DEST and return DEST.  This is a byte-wise strcpy: the caller must make
   sure DEST is large enough.  */
uint8_t *
u8_strcpy (uint8_t *dest, const uint8_t *src)
{
  return (uint8_t *) strcpy ((char *) dest, (const char *) src);
}
#include <stdlib.h>
#include <string.h>

/* Return a freshly allocated copy of the NUL-terminated UTF-8 string S, or
   NULL if memory allocation fails.  The caller owns the result and must
   release it with free().  */
uint8_t *
u8_strdup (const uint8_t *s)
{
#if HAVE_STRDUP
  return (uint8_t *) strdup ((const char *) s);
#else
  /* Portable fallback: copy the bytes together with the terminating NUL.  */
  size_t size = strlen ((const char *) s) + 1;
  uint8_t *copy = (uint8_t *) malloc (size);

  if (copy != NULL)
    memcpy (copy, s, size);
  return copy;
#endif
}
#include <string.h>

/* Return the number of bytes in the NUL-terminated UTF-8 string S, not
   counting the terminator.  */
size_t
u8_strlen (const uint8_t *s)
{
  return strlen ((const char *) s);
}

/* Return the length in bytes (1 to 4) of the first character of the
   NUL-terminated UTF-8 string S, 0 if S is empty, or -1 if the first
   character is invalid or cut short by the terminator.

   With CONFIG_UNICODE_SAFETY the continuation bytes are fully validated
   (overlong forms, surrogates and values above U+10FFFF are rejected);
   without it they are only required to be non-NUL.  In both modes the
   checks short-circuit, so no byte after the terminator is read.  */
int
u8_strmblen (const uint8_t *s)
{
  /* Keep in sync with unistr.h and utf8-ucs4.c.  */
  uint8_t c = *s;

  if (c < 0x80)
    return (c != 0 ? 1 : 0);
  if (c >= 0xc2)
    {
      if (c < 0xe0)
        {
#if CONFIG_UNICODE_SAFETY
          if ((s[1] ^ 0x80) < 0x40)
#else
          if (s[1] != 0)
#endif
            return 2;
        }
      else if (c < 0xf0)
        {
#if CONFIG_UNICODE_SAFETY
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              && (c >= 0xe1 || s[1] >= 0xa0)
              && (c != 0xed || s[1] < 0xa0))
#else
          if (s[1] != 0 && s[2] != 0)
#endif
            return 3;
        }
      else if (c < 0xf8)
        {
#if CONFIG_UNICODE_SAFETY
          if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
              && (s[3] ^ 0x80) < 0x40
              && (c >= 0xf1 || s[1] >= 0x90)
              && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
#else
          if (s[1] != 0 && s[2] != 0 && s[3] != 0)
#endif
            return 4;
        }
      /* Lead bytes 0xf8 and above (the 5- and 6-byte forms) are rejected.  */
    }
  /* invalid or incomplete multibyte character */
  return -1;
}
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u8_strmbtouc (ucs4_t *puc, const uint8_t *s) +{ + /* Keep in sync with unistr.h and utf8-ucs4.c. */ + uint8_t c = *s; + + if (c < 0x80) + { + *puc = c; + return (c != 0 ? 1 : 0); + } + if (c >= 0xc2) + { + if (c < 0xe0) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40) +#else + if (s[1] != 0) +#endif + { + *puc = ((unsigned int) (c & 0x1f) << 6) + | (unsigned int) (s[1] ^ 0x80); + return 2; + } + } + else if (c < 0xf0) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) +#else + if (s[1] != 0 && s[2] != 0) +#endif + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + } + else if (c < 0xf8) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) +#if 1 + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) +#endif + ) +#else + if (s[1] != 0 && s[2] != 0 && s[3] != 0) +#endif + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + } +#if 0 + else if (c < 0xfc) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (c >= 0xf9 || s[1] >= 0x88)) +#else + if (s[1] != 0 && s[2] != 0 && s[3] != 0 && s[4] != 0) +#endif + { + *puc = ((unsigned 
int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } + } + else if (c < 0xfe) + { +#if CONFIG_UNICODE_SAFETY + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (s[5] ^ 0x80) < 0x40 + && (c >= 0xfd || s[1] >= 0x84)) +#else + if (s[1] != 0 && s[2] != 0 && s[3] != 0 && s[4] != 0 && s[5] != 0) +#endif + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + } +#endif + } + /* invalid or incomplete multibyte character */ + return -1; +} diff --git a/lib/unistr/u8-strncat.c b/lib/unistr/u8-strncat.c new file mode 100644 index 00000000..d9654602 --- /dev/null +++ b/lib/unistr/u8-strncat.c @@ -0,0 +1,29 @@ +/* Concatenate UTF-8 strings. + Copyright (C) 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
#include <string.h>

/* Append at most N bytes of the UTF-8 string SRC to the NUL-terminated
   string DEST, then add a terminating NUL, and return DEST.  N counts
   bytes, exactly as in strncat.  */
uint8_t *
u8_strncat (uint8_t *dest, const uint8_t *src, size_t n)
{
  return (uint8_t *) strncat ((char *) dest, (const char *) src, n);
}

/* Compare at most the first N bytes of the UTF-8 strings S1 and S2.
   Returns a negative, zero or positive value, exactly as strncmp does for
   the same bytes.  */
int
u8_strncmp (const uint8_t *s1, const uint8_t *s2, size_t n)
{
  /* Use the fact that the UTF-8 encoding preserves lexicographic order.  */
  return strncmp ((const char *) s1, (const char *) s2, n);
}
#include <string.h>

/* Copy at most N bytes of the UTF-8 string SRC to DEST and return DEST.
   This has strncpy semantics: if SRC is shorter than N bytes the rest of
   DEST is filled with NUL bytes, and if SRC has N or more bytes DEST is
   NOT NUL-terminated.  */
uint8_t *
u8_strncpy (uint8_t *dest, const uint8_t *src, size_t n)
{
  return (uint8_t *) strncpy ((char *) dest, (const char *) src, n);
}
#include <string.h>

/* Return the number of bytes in the UTF-8 string S before the first NUL,
   but at most MAXLEN.  At most MAXLEN bytes of S are examined, so S need
   not be NUL-terminated if it holds at least MAXLEN bytes.  */
size_t
u8_strnlen (const uint8_t *s, size_t maxlen)
{
  /* memchr is ISO C, so a single code path works on every platform and no
     feature-test macro is needed to obtain a strnlen declaration.  */
  const uint8_t *nul = (const uint8_t *) memchr (s, 0, maxlen);

  return (nul != NULL ? (size_t) (nul - s) : maxlen);
}
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +uint8_t * +u8_strrchr (const uint8_t *s, ucs4_t uc) +{ + /* Calling u8_strlen and then searching from the other end would cause more + memory accesses. Avoid that, at the cost of a few more comparisons. */ + uint8_t *result = NULL; + uint8_t c[6]; + + if (uc < 0x80) + { + uint8_t c0 = uc; + + for (;; s++) + { + if (*s == c0) + result = (uint8_t *) s; + if (*s == 0) + break; + } + } + else + switch (u8_uctomb_aux (c, uc, 6)) + { + case 2: + if (*s) + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + + /* FIXME: Maybe walking the string via u8_mblen is a win? */ + for (;; s++) + { + if (s[1] == 0) + break; + if (*s == c0 && s[1] == c1) + result = (uint8_t *) s; + } + } + break; + + case 3: + if (*s && s[1]) + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + uint8_t c2 = c[2]; + + /* FIXME: Maybe walking the string via u8_mblen is a win? */ + for (;; s++) + { + if (s[2] == 0) + break; + if (*s == c0 && s[1] == c1 && s[2] == c2) + result = (uint8_t *) s; + } + } + break; + + case 4: + if (*s && s[1] && s[2]) + { + uint8_t c0 = c[0]; + uint8_t c1 = c[1]; + uint8_t c2 = c[2]; + uint8_t c3 = c[3]; + + /* FIXME: Maybe walking the string via u8_mblen is a win? 
*/ + for (;; s++) + { + if (s[3] == 0) + break; + if (*s == c0 && s[1] == c1 && s[2] == c2 && s[3] == c3) + result = (uint8_t *) s; + } + } + break; + } + return result; +} diff --git a/lib/unistr/u8-strspn.c b/lib/unistr/u8-strspn.c new file mode 100644 index 00000000..582c8259 --- /dev/null +++ b/lib/unistr/u8-strspn.c @@ -0,0 +1,29 @@ +/* Search for some characters in UTF-8 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_strspn +#define UNIT uint8_t +#define U_STRLEN u8_strlen +#define U_STRMBTOUC u8_strmbtouc +#define U_CMP u8_cmp +#define U_STRCHR u8_strchr +#include "u-strspn.h" diff --git a/lib/unistr/u8-strstr.c b/lib/unistr/u8-strstr.c new file mode 100644 index 00000000..b5cbf249 --- /dev/null +++ b/lib/unistr/u8-strstr.c @@ -0,0 +1,28 @@ +/* Substring test for UTF-8 strings. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +/* FIXME: Maybe walking the string via u8_mblen is a win? */ + +#define FUNC u8_strstr +#define UNIT uint8_t +#define U_STRCHR u8_strchr +#include "u-strstr.h" diff --git a/lib/unistr/u8-strtok.c b/lib/unistr/u8-strtok.c new file mode 100644 index 00000000..a8d3821d --- /dev/null +++ b/lib/unistr/u8-strtok.c @@ -0,0 +1,27 @@ +/* Tokenize UTF-8 string. + Copyright (C) 1999, 2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. 
*/ +#include "unistr.h" + +#define FUNC u8_strtok +#define UNIT uint8_t +#define U_STRSPN u8_strspn +#define U_STRPBRK u8_strpbrk +#include "u-strtok.h" diff --git a/lib/unistr/u8-to-u16.c b/lib/unistr/u8-to-u16.c new file mode 100644 index 00000000..03c66000 --- /dev/null +++ b/lib/unistr/u8-to-u16.c @@ -0,0 +1,136 @@ +/* Convert UTF-8 string to UTF-16 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_to_u16 +#define SRC_UNIT uint8_t +#define DST_UNIT uint16_t + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +DST_UNIT * +FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp) +{ + const SRC_UNIT *s_end = s + n; + /* Output string accumulator. */ + DST_UNIT *result; + size_t allocated; + size_t length; + + if (resultbuf != NULL) + { + result = resultbuf; + allocated = *lengthp; + } + else + { + result = NULL; + allocated = 0; + } + length = 0; + /* Invariants: + result is either == resultbuf or == NULL or malloc-allocated. + If length > 0, then result != NULL. */ + + while (s < s_end) + { + ucs4_t uc; + int count; + + /* Fetch a Unicode character from the input string. 
*/ + count = u8_mbtouc (&uc, s, s_end - s); + if (count < 0) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + s += count; + + /* Store it in the output string. */ + count = u16_uctomb (result + length, uc, allocated - length); + if (count == -1) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + if (count == -2) + { + DST_UNIT *memory; + + allocated = (allocated > 0 ? 2 * allocated : 12); + if (length + 2 > allocated) + allocated = length + 2; + if (result == resultbuf || result == NULL) + memory = (DST_UNIT *) malloc (allocated * sizeof (DST_UNIT)); + else + memory = + (DST_UNIT *) realloc (result, allocated * sizeof (DST_UNIT)); + + if (memory == NULL) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = ENOMEM; + return NULL; + } + if (result == resultbuf && length > 0) + memcpy ((char *) memory, (char *) result, + length * sizeof (DST_UNIT)); + result = memory; + count = u16_uctomb (result + length, uc, allocated - length); + if (count < 0) + abort (); + } + length += count; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (DST_UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + return NULL; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + DST_UNIT *memory; + + memory = (DST_UNIT *) realloc (result, length * sizeof (DST_UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; +} diff --git a/lib/unistr/u8-to-u32.c b/lib/unistr/u8-to-u32.c new file mode 100644 index 00000000..2a0ad39e --- /dev/null +++ b/lib/unistr/u8-to-u32.c @@ -0,0 +1,125 @@ +/* Convert UTF-8 string to UTF-32 string. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. 
+ + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +#define FUNC u8_to_u32 +#define SRC_UNIT uint8_t +#define DST_UNIT uint32_t + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +DST_UNIT * +FUNC (const SRC_UNIT *s, size_t n, DST_UNIT *resultbuf, size_t *lengthp) +{ + const SRC_UNIT *s_end = s + n; + /* Output string accumulator. */ + DST_UNIT *result; + size_t allocated; + size_t length; + + if (resultbuf != NULL) + { + result = resultbuf; + allocated = *lengthp; + } + else + { + result = NULL; + allocated = 0; + } + length = 0; + /* Invariants: + result is either == resultbuf or == NULL or malloc-allocated. + If length > 0, then result != NULL. */ + + while (s < s_end) + { + ucs4_t uc; + int count; + + /* Fetch a Unicode character from the input string. */ + count = u8_mbtouc (&uc, s, s_end - s); + if (count < 0) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = EILSEQ; + return NULL; + } + s += count; + + /* Store it in the output string. */ + if (length + 1 > allocated) + { + DST_UNIT *memory; + + allocated = (allocated > 0 ? 
2 * allocated : 12); + if (length + 1 > allocated) + allocated = length + 1; + if (result == resultbuf || result == NULL) + memory = (DST_UNIT *) malloc (allocated * sizeof (DST_UNIT)); + else + memory = + (DST_UNIT *) realloc (result, allocated * sizeof (DST_UNIT)); + + if (memory == NULL) + { + if (!(result == resultbuf || result == NULL)) + free (result); + errno = ENOMEM; + return NULL; + } + if (result == resultbuf && length > 0) + memcpy ((char *) memory, (char *) result, + length * sizeof (DST_UNIT)); + result = memory; + } + result[length++] = uc; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. */ + result = (DST_UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + return NULL; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + DST_UNIT *memory; + + memory = (DST_UNIT *) realloc (result, length * sizeof (DST_UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; +} diff --git a/lib/unistr/u8-uctomb-aux.c b/lib/unistr/u8-uctomb-aux.c new file mode 100644 index 00000000..c42fa501 --- /dev/null +++ b/lib/unistr/u8-uctomb-aux.c @@ -0,0 +1,69 @@ +/* Conversion UCS-4 to UTF-8. + Copyright (C) 2002, 2006-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. 
If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +/* Specification. */ +#include "unistr.h" + +int +u8_uctomb_aux (uint8_t *s, ucs4_t uc, int n) +{ + int count; + + if (uc < 0x80) + /* The case n >= 1 is already handled by the caller. */ + return -2; + else if (uc < 0x800) + count = 2; + else if (uc < 0x10000) + { + if (uc < 0xd800 || uc >= 0xe000) + count = 3; + else + return -1; + } +#if 0 + else if (uc < 0x200000) + count = 4; + else if (uc < 0x4000000) + count = 5; + else if (uc <= 0x7fffffff) + count = 6; +#else + else if (uc < 0x110000) + count = 4; +#endif + else + return -1; + + if (n < count) + return -2; + + switch (count) /* note: code falls through cases! */ + { +#if 0 + case 6: s[5] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x4000000; + case 5: s[4] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x200000; +#endif + case 4: s[3] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x10000; + case 3: s[2] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x800; + case 2: s[1] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0xc0; + /*case 1:*/ s[0] = uc; + } + return count; +} diff --git a/lib/unistr/u8-uctomb.c b/lib/unistr/u8-uctomb.c new file mode 100644 index 00000000..33921669 --- /dev/null +++ b/lib/unistr/u8-uctomb.c @@ -0,0 +1,88 @@ +/* Store a character in UTF-8 string. + Copyright (C) 2002, 2005-2006, 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2002. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#if defined IN_LIBUNISTRING +/* Tell unistr.h to declare u8_uctomb as 'extern', not 'static inline'. */ +# include "unistring-notinline.h" +#endif + +/* Specification. */ +#include "unistr.h" + +#if !HAVE_INLINE + +int +u8_uctomb (uint8_t *s, ucs4_t uc, int n) +{ + if (uc < 0x80) + { + if (n > 0) + { + s[0] = uc; + return 1; + } + /* else return -2, below. */ + } + else + { + int count; + + if (uc < 0x800) + count = 2; + else if (uc < 0x10000) + { + if (uc < 0xd800 || uc >= 0xe000) + count = 3; + else + return -1; + } +#if 0 + else if (uc < 0x200000) + count = 4; + else if (uc < 0x4000000) + count = 5; + else if (uc <= 0x7fffffff) + count = 6; +#else + else if (uc < 0x110000) + count = 4; +#endif + else + return -1; + + if (n >= count) + { + switch (count) /* note: code falls through cases! */ + { +#if 0 + case 6: s[5] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x4000000; + case 5: s[4] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x200000; +#endif + case 4: s[3] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x10000; + case 3: s[2] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x800; + case 2: s[1] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0xc0; + /*case 1:*/ s[0] = uc; + } + return count; + } + } + return -2; +} + +#endif |