diff options
Diffstat (limited to 'lib/mbiterf.h')
-rw-r--r-- | lib/mbiterf.h | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/lib/mbiterf.h b/lib/mbiterf.h new file mode 100644 index 00000000..28d2f8ce --- /dev/null +++ b/lib/mbiterf.h @@ -0,0 +1,204 @@ +/* Iterating through multibyte strings, faster: macros for multi-byte encodings. + Copyright (C) 2001, 2005, 2007, 2009-2024 Free Software Foundation, Inc. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, + with insights from Paul Eggert. */ + +/* The macros in this file implement forward iteration through a + multi-byte string. + + With these macros, an iteration loop that looks like + + char *iter; + for (iter = buf; iter < buf + buflen; iter++) + { + do_something (*iter); + } + + becomes + + const char *buf_end = buf + buflen; + mbif_state_t state; + [const] char *iter; + for (mbif_init (state), iter = buf; mbif_avail (state, iter, buf_end); ) + { + mbchar_t cur = mbif_next (state, iter, buf_end); + // Note: Here always mb_ptr (cur) == iter. + do_something (iter, mb_len (cur)); + iter += mb_len (cur); + } + + The benefit of these macros over plain use of mbrtowc or mbrtoc32 is: + - Handling of invalid multibyte sequences is possible without + making the code more complicated, while still preserving the + invalid multibyte sequences. + + The benefit of these macros over those from mbiter.h is that it + produces faster code with today's optimizing compilers (because mbif_next + returns its result by value). + + mbif_state_t + is a type usable for variable declarations. + + mbif_init (state) + initializes the state. + + mbif_avail (state, iter, endptr) + returns true if another loop round is needed. + + mbif_next (state, iter, endptr) + returns the next multibyte character. + It asssumes that the state is initialized and that iter < endptr. + + Here are the function prototypes of the macros. + + extern void mbif_init (mbif_state_t state); + extern bool mbif_avail (mbif_state_t state, const char *iter, const char *endptr); + extern mbchar_t mbif_next (mbif_state_t state, const char *iter, const char *endptr); + */ + +#ifndef _MBITERF_H +#define _MBITERF_H 1 + +/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE, + _GL_ATTRIBUTE_ALWAYS_INLINE. */ +#if !_GL_CONFIG_H_INCLUDED + #error "Please include config.h first." +#endif + +#include <assert.h> +#include <stddef.h> +#include <string.h> +#include <uchar.h> +#include <wchar.h> + +#include "mbchar.h" + +_GL_INLINE_HEADER_BEGIN +#ifndef MBITERF_INLINE +# define MBITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE +#endif + +struct mbif_state +{ + #if !GNULIB_MBRTOC32_REGULAR + bool in_shift; /* true if next byte may not be interpreted as ASCII */ + /* If GNULIB_MBRTOC32_REGULAR, it is always false, + so optimize it away. */ + #endif + mbstate_t state; /* if in_shift: current shift state */ + /* If GNULIB_MBRTOC32_REGULAR, it is in an initial state + before and after every mbiterf_next invocation. + */ +}; + +MBITERF_INLINE mbchar_t +mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr) +{ + #if !GNULIB_MBRTOC32_REGULAR + if (ps->in_shift) + goto with_shift; + #endif + /* Handle most ASCII characters quickly, without calling mbrtowc(). */ + if (is_basic (*iter)) + { + /* These characters are part of the POSIX portable character set. + For most of them, namely those in the ISO C basic character set, + ISO C 99 guarantees that their wide character code is identical to + their char code. For the few other ones, this is the case as well, + in all locale encodings that are in use. The 32-bit wide character + code is the same as well. */ + return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter }; + } + else + { + assert (mbsinit (&ps->state)); + #if !GNULIB_MBRTOC32_REGULAR + ps->in_shift = true; + with_shift:; + #endif + size_t bytes; + char32_t wc; + bytes = mbrtoc32 (&wc, iter, endptr - iter, &ps->state); + if (bytes == (size_t) -1) + { + /* An invalid multibyte sequence was encountered. */ + /* Allow the next invocation to continue from a sane state. */ + #if !GNULIB_MBRTOC32_REGULAR + ps->in_shift = false; + #endif + mbszero (&ps->state); + return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false }; + } + else if (bytes == (size_t) -2) + { + /* An incomplete multibyte character at the end. */ + #if !GNULIB_MBRTOC32_REGULAR + ps->in_shift = false; + #endif + /* Whether to reset ps->state or not is not important; the string end + is reached anyway. */ + return (mbchar_t) { .ptr = iter, .bytes = endptr - iter, .wc_valid = false }; + } + else + { + if (bytes == 0) + { + /* A null wide character was encountered. */ + bytes = 1; + assert (*iter == '\0'); + assert (wc == 0); + } + #if !GNULIB_MBRTOC32_REGULAR + else if (bytes == (size_t) -3) + /* The previous multibyte sequence produced an additional 32-bit + wide character. */ + bytes = 0; + #endif + + /* When in an initial state, we can go back treating ASCII + characters more quickly. */ + #if !GNULIB_MBRTOC32_REGULAR + if (mbsinit (&ps->state)) + ps->in_shift = false; + #endif + return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc }; + } + } +} + +/* Iteration macros. */ +typedef struct mbif_state mbif_state_t; +#if !GNULIB_MBRTOC32_REGULAR +#define mbif_init(st) \ + ((st).in_shift = false, mbszero (&(st).state)) +#else +/* Optimized: no in_shift. */ +#define mbif_init(st) \ + (mbszero (&(st).state)) +#endif +#if !GNULIB_MBRTOC32_REGULAR +#define mbif_avail(st, iter, endptr) ((st).in_shift || ((iter) < (endptr))) +#else +/* Optimized: no in_shift. */ +#define mbif_avail(st, iter, endptr) ((iter) < (endptr)) +#endif +#define mbif_next(st, iter, endptr) \ + mbiterf_next (&(st), (iter), (endptr)) + +_GL_INLINE_HEADER_END + +#endif /* _MBITERF_H */ |