diff options
Diffstat (limited to 'lib/mbrtoc32.c')
| -rw-r--r-- | lib/mbrtoc32.c | 288 | 
1 files changed, 288 insertions, 0 deletions
| diff --git a/lib/mbrtoc32.c b/lib/mbrtoc32.c new file mode 100644 index 00000000..56e4a860 --- /dev/null +++ b/lib/mbrtoc32.c @@ -0,0 +1,288 @@ +/* Convert multibyte character to 32-bit wide character. +   Copyright (C) 2020-2024 Free Software Foundation, Inc. + +   This file is free software: you can redistribute it and/or modify +   it under the terms of the GNU Lesser General Public License as +   published by the Free Software Foundation; either version 2.1 of the +   License, or (at your option) any later version. + +   This file is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +   GNU Lesser General Public License for more details. + +   You should have received a copy of the GNU Lesser General Public License +   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2020.  */ + +#include <config.h> + +/* Specification.  */ +#include <uchar.h> + +#include "attribute.h" /* NOTE(review): presumably supplies the FALLTHROUGH marker used in the switch further down; confirm */ + +#include <errno.h> /* errno, EINVAL, EILSEQ */ +#include <stdlib.h> /* abort */ + +#if GL_CHAR32_T_IS_UNICODE +# include "lc-charset-unicode.h" +#endif + +#if GNULIB_defined_mbstate_t /* AIX, IRIX */ +/* Implement mbrtoc32() on top of mbtowc() for the non-UTF-8 locales +   and directly for the UTF-8 locales.  */ + +/* Note: On AIX (64-bit) we can implement mbrtoc32 in two equivalent ways: +   - in a way that parallels the override of mbrtowc; this is the code branch +     here; +   - in a way that invokes the overridden mbrtowc; this would be the #else +     branch below. +   They are equivalent.  */ + +# if AVOID_ANY_THREADS + +/* The option '--disable-threads' explicitly requests no locking.  
*/ + +# elif defined _WIN32 && !defined __CYGWIN__ + +#  define WIN32_LEAN_AND_MEAN  /* avoid including junk */ +#  include <windows.h> + +# elif HAVE_PTHREAD_API + +#  include <pthread.h> +#  if HAVE_THREADS_H && HAVE_WEAK_SYMBOLS +#   include <threads.h> +#   pragma weak thrd_exit +#   define c11_threads_in_use() (thrd_exit != NULL) /* nonzero only when the weakly referenced thrd_exit resolved to a real address */ +#  else +#   define c11_threads_in_use() 0 +#  endif + +# elif HAVE_THREADS_H + +#  include <threads.h> + +# endif + +# include "lc-charset-dispatch.h" +# include "mbtowc-lock.h" + +static_assert (sizeof (mbstate_t) >= 4); /* NOTE(review): presumably mbrtowc-impl.h keeps up to 4 bytes of state in *ps; confirm there */ +static char internal_state[4]; /* NOTE(review): presumably the fallback state used by mbrtowc-impl.h when ps is NULL; confirm there */ +/* Replacement mbrtoc32 for platforms where GNULIB_defined_mbstate_t is set.  The entire function body is the text of mbrtowc-impl.h, included with FITS_IN_CHAR_TYPE defined as the constant 1.  */ +size_t +mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) +{ +# define FITS_IN_CHAR_TYPE(wc)  1 /* parameter for the included implementation; always true in this file */ +# include "mbrtowc-impl.h" +} + +#else /* glibc, macOS, FreeBSD, NetBSD, OpenBSD, HP-UX, Solaris, Cygwin, mingw, MSVC, Minix, Android */ + +/* Implement mbrtoc32() based on the original mbrtoc32() or on mbrtowc().  */ + +# include <wchar.h> + +# include "localcharset.h" +# include "streq.h" + +# if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ +#  include "hard-locale.h" +#  include <locale.h> +# endif + +static mbstate_t internal_state; /* conversion state used when the caller passes ps == NULL */ +/* Convert the multibyte character starting at S (at most N bytes are examined) to a char32_t and store it in *PWC when PWC is non-null.  PS is the conversion state; when it is NULL the file-static internal_state is used.  Depending on the configuration macros, the work is delegated to the system's mbrtoc32, done by the included UTF-8 decoder, or delegated to mbrtowc.  Return values visible here: a byte count on success, (size_t) -2 for an incomplete character, (size_t) -1 with errno set to EILSEQ or EINVAL on failure.  */ +size_t +mbrtoc32 (char32_t *pwc, const char *s, size_t n, mbstate_t *ps) +# undef mbrtoc32 /* NOTE(review): presumably uchar.h defines mbrtoc32 as a macro naming this replacement, so that calls below reach the original function after the undef; confirm in the uchar.h override */ +{ +  /* It's simpler to handle the case s == NULL upfront, than to worry about +     this case later, before every test of pwc and n.  */ +  if (s == NULL) +    { +      pwc = NULL; +      s = ""; +      n = 1; /* from here on the call is processed as (NULL, "", 1, ps) */ +    } + +# if MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T +  if (n == 0) +    return (size_t) -2; /* no bytes to examine: report an incomplete character without calling the underlying function */ +# endif + +  if (ps == NULL) +    ps = &internal_state; /* caller supplied no state: use the file-static one */ + +# if HAVE_WORKING_MBRTOC32 +  /* mbrtoc32() may produce different values for wc than mbrtowc().  Therefore +     use mbrtoc32().  
*/ + +#  if defined _WIN32 && !defined __CYGWIN__ +  char32_t wc; +  size_t ret = mbrtoc32 (&wc, s, n, ps); /* convert into a local so that *pwc is written only on success */ +  if (ret < (size_t) -2 && pwc != NULL) +    *pwc = wc; +#  else +  size_t ret = mbrtoc32 (pwc, s, n, ps); +#  endif + +#  if GNULIB_MBRTOC32_REGULAR +  /* Verify that mbrtoc32 is regular.  */ +  if (ret < (size_t) -3 && ! mbsinit (ps)) +    /* This occurs on glibc 2.36.  */ +    mbszero (ps); /* a byte count was returned but the state is not initial: reset it */ +  if (ret == (size_t) -3) +    abort (); /* a (size_t) -3 result is treated as impossible in this configuration */ +#  endif + +#  if MBRTOC32_IN_C_LOCALE_MAYBE_EILSEQ +  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE)) +    { +      if (pwc != NULL) +        *pwc = (unsigned char) *s; +      return 1; /* when hard_locale is false, a failing or incomplete result is replaced by the single byte *s taken as one character */ +    } +#  endif + +  return ret; + +# elif _GL_SMALL_WCHAR_T + +  /* Special-case all encodings that may produce wide character values +     > WCHAR_MAX.  */ +  const char *encoding = locale_charset (); +  if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)) +    { +      /* Special-case the UTF-8 encoding.  Assume that the wide-character +         encoding in a UTF-8 locale is UCS-2 or, equivalently, UTF-16.  */ +      /* Here n > 0.  */ +      char *pstate = (char *)ps; +      size_t nstate = pstate[0]; /* pstate[0] is the count of bytes buffered by earlier calls; pstate[1..3] hold those bytes */ +      char buf[4]; +      const char *p; +      size_t m; +      int res; + +      switch (nstate) +        { +        case 0: +          p = s; /* nothing buffered: decode directly from the caller's bytes */ +          m = n; +          break; +        case 3: +          buf[2] = pstate[3]; +          FALLTHROUGH; +        case 2: +          buf[1] = pstate[2]; +          FALLTHROUGH; +        case 1: +          buf[0] = pstate[1]; /* buffered prefix is now in buf[0..nstate-1]; next append up to 3 bytes from s, never exceeding 4 in total */ +          p = buf; +          m = nstate; +          buf[m++] = s[0]; +          if (n >= 2 && m < 4) +            { +              buf[m++] = s[1]; +              if (n >= 3 && m < 4) +                buf[m++] = s[2]; +            } +          break; +        default: +          errno = EINVAL; /* a count byte outside 0..3 is rejected as an invalid state */ +          return (size_t)(-1); +        } + +      /* Here m > 0.  
*/ + +      { +#  define FITS_IN_CHAR_TYPE(wc)  1 +#  include "mbrtowc-impl-utf8.h" /* NOTE(review): presumably decodes p[0..m-1], sets res, and jumps to the success, incomplete or invalid label below; confirm in that header */ +      } + +     success: +      if (nstate >= (res > 0 ? res : 1)) +        abort (); +      res -= nstate; /* NOTE(review): presumably res counted the buffered bytes too, so this leaves only the bytes taken from s in this call; confirm */ +      /* Set *ps to an initial state.  */ +#  if defined _WIN32 && !defined __CYGWIN__ +      /* Native Windows.  */ +      /* MSVC defines 'mbstate_t' as an 8-byte struct; the first 4 bytes matter. +         On mingw, 'mbstate_t' is sometimes defined as 'int', sometimes defined +         as an 8-byte struct, of which the first 4 bytes matter.  */ +      *(unsigned int *)pstate = 0; +#  elif defined __CYGWIN__ +      /* Cygwin defines 'mbstate_t' as an 8-byte struct; the first 4 bytes +         matter.  */ +      ps->__count = 0; +#  else +      pstate[0] = 0; +#  endif +      return res; + +     incomplete: +      { +        size_t k = nstate; +        /* Here 0 <= k < m < 4.  */ +        pstate[++k] = s[0]; /* append the newly seen bytes after the ones already buffered */ +        if (k < m) +          { +            pstate[++k] = s[1]; +            if (k < m) +              pstate[++k] = s[2]; +          } +        if (k != m) +          abort (); +      } +      pstate[0] = m; /* record that pstate[1..m] now hold the pending bytes */ +      return (size_t)(-2); + +     invalid: +      errno = EILSEQ; +      /* The conversion state is undefined, says POSIX.  */ +      return (size_t)(-1); +    } +  else +    { +      wchar_t wc; +      size_t ret = mbrtowc (&wc, s, n, ps); /* any other encoding: delegate to mbrtowc and copy the result out only on success */ +      if (ret < (size_t) -2 && pwc != NULL) +        *pwc = wc; +      return ret; +    } + +# else + +  /* char32_t and wchar_t are equivalent.  Use mbrtowc().  */ +  wchar_t wc; +  size_t ret = mbrtowc (&wc, s, n, ps); + +#  if GNULIB_MBRTOC32_REGULAR +  /* Ensure that mbrtoc32 is regular.  */ +  if (ret < (size_t) -2 && ! mbsinit (ps)) +    /* This occurs on glibc 2.12.  
*/ +    mbszero (ps); +#  endif + +#  if GL_CHAR32_T_IS_UNICODE && GL_CHAR32_T_VS_WCHAR_T_NEEDS_CONVERSION +  if (ret < (size_t) -2 && wc != 0) +    { +      wc = locale_encoding_to_unicode (wc); /* a zero result for a nonzero input is reported below as EILSEQ */ +      if (wc == 0) +        { +          ret = (size_t) -1; +          errno = EILSEQ; +        } +    } +#  endif +  if (ret < (size_t) -2 && pwc != NULL) +    *pwc = wc; /* written only on success and only when the caller asked for the character */ +  return ret; + +# endif +} + +#endif | 
