summaryrefslogtreecommitdiff
path: root/lib/mbiterf.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/mbiterf.h')
-rw-r--r--lib/mbiterf.h204
1 files changed, 204 insertions, 0 deletions
diff --git a/lib/mbiterf.h b/lib/mbiterf.h
new file mode 100644
index 00000000..28d2f8ce
--- /dev/null
+++ b/lib/mbiterf.h
@@ -0,0 +1,204 @@
+/* Iterating through multibyte strings, faster: macros for multi-byte encodings.
+ Copyright (C) 2001, 2005, 2007, 2009-2024 Free Software Foundation, Inc.
+
+ This file is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ This file is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>,
+ with insights from Paul Eggert. */
+
+/* The macros in this file implement forward iteration through a
+ multi-byte string.
+
+ With these macros, an iteration loop that looks like
+
+ char *iter;
+ for (iter = buf; iter < buf + buflen; iter++)
+ {
+ do_something (*iter);
+ }
+
+ becomes
+
+ const char *buf_end = buf + buflen;
+ mbif_state_t state;
+ [const] char *iter;
+ for (mbif_init (state), iter = buf; mbif_avail (state, iter, buf_end); )
+ {
+ mbchar_t cur = mbif_next (state, iter, buf_end);
+ // Note: Here always mb_ptr (cur) == iter.
+ do_something (iter, mb_len (cur));
+ iter += mb_len (cur);
+ }
+
+ The benefit of these macros over plain use of mbrtowc or mbrtoc32 is:
+ - Handling of invalid multibyte sequences is possible without
+ making the code more complicated, while still preserving the
+ invalid multibyte sequences.
+
+ The benefit of these macros over those from mbiter.h is that they
+ produce faster code with today's optimizing compilers (because mbif_next
+ returns its result by value).
+
+ mbif_state_t
+ is a type usable for variable declarations.
+
+ mbif_init (state)
+ initializes the state.
+
+ mbif_avail (state, iter, endptr)
+ returns true if another loop round is needed.
+
+ mbif_next (state, iter, endptr)
+ returns the next multibyte character.
+ It assumes that the state is initialized and that iter < endptr.
+
+ Here are the function prototypes of the macros.
+
+ extern void mbif_init (mbif_state_t state);
+ extern bool mbif_avail (mbif_state_t state, const char *iter, const char *endptr);
+ extern mbchar_t mbif_next (mbif_state_t state, const char *iter, const char *endptr);
+ */
+
+#ifndef _MBITERF_H
+#define _MBITERF_H 1
+
+/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
+ _GL_ATTRIBUTE_ALWAYS_INLINE. */
+#if !_GL_CONFIG_H_INCLUDED
+ #error "Please include config.h first."
+#endif
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <uchar.h>
+#include <wchar.h>
+
+#include "mbchar.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef MBITERF_INLINE
+# define MBITERF_INLINE _GL_INLINE _GL_ATTRIBUTE_ALWAYS_INLINE
+#endif
+
+struct mbif_state
+{
+ #if !GNULIB_MBRTOC32_REGULAR
+ bool in_shift; /* true if next byte may not be interpreted as ASCII */
+ /* If GNULIB_MBRTOC32_REGULAR, it would always be false,
+ so the member is compiled out entirely. */
+ #endif
+ mbstate_t state; /* conversion state passed to mbrtoc32; if in_shift: current shift state */
+ /* If GNULIB_MBRTOC32_REGULAR, it is in an initial state
+ before and after every mbiterf_next invocation.
+ */
+};
+
+MBITERF_INLINE mbchar_t
+mbiterf_next (struct mbif_state *ps, const char *iter, const char *endptr)
+{
+ #if !GNULIB_MBRTOC32_REGULAR
+ if (ps->in_shift)
+ goto with_shift;
+ #endif
+ /* Handle most ASCII characters quickly, without calling mbrtoc32(). */
+ if (is_basic (*iter))
+ {
+ /* These characters are part of the POSIX portable character set.
+ For most of them, namely those in the ISO C basic character set,
+ ISO C 99 guarantees that their wide character code is identical to
+ their char code. For the few other ones, this is the case as well,
+ in all locale encodings that are in use. The 32-bit wide character
+ code is the same as well. */
+ return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = true, .wc = *iter };
+ }
+ else
+ {
+ assert (mbsinit (&ps->state));
+ #if !GNULIB_MBRTOC32_REGULAR
+ ps->in_shift = true;
+ with_shift:;
+ #endif
+ size_t bytes;
+ char32_t wc;
+ bytes = mbrtoc32 (&wc, iter, endptr - iter, &ps->state);
+ if (bytes == (size_t) -1)
+ {
+ /* An invalid multibyte sequence was encountered; report one invalid byte. */
+ /* Allow the next invocation to continue from a sane state. */
+ #if !GNULIB_MBRTOC32_REGULAR
+ ps->in_shift = false;
+ #endif
+ mbszero (&ps->state);
+ return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
+ }
+ else if (bytes == (size_t) -2)
+ {
+ /* An incomplete multibyte character at the end; report the remaining bytes as one invalid character. */
+ #if !GNULIB_MBRTOC32_REGULAR
+ ps->in_shift = false;
+ #endif
+ /* Whether to reset ps->state or not is not important; the string end
+ is reached anyway. */
+ return (mbchar_t) { .ptr = iter, .bytes = endptr - iter, .wc_valid = false };
+ }
+ else
+ {
+ if (bytes == 0)
+ {
+ /* A null wide character was encountered; it occupies one byte. */
+ bytes = 1;
+ assert (*iter == '\0');
+ assert (wc == 0);
+ }
+ #if !GNULIB_MBRTOC32_REGULAR
+ else if (bytes == (size_t) -3)
+ /* The previous multibyte sequence produced an additional 32-bit
+ wide character; no input bytes are consumed for it. */
+ bytes = 0;
+ #endif
+
+ /* When in an initial state, we can go back treating ASCII
+ characters more quickly. */
+ #if !GNULIB_MBRTOC32_REGULAR
+ if (mbsinit (&ps->state))
+ ps->in_shift = false;
+ #endif
+ return (mbchar_t) { .ptr = iter, .bytes = bytes, .wc_valid = true, .wc = wc };
+ }
+ }
+}
+
+/* Iteration macros: mbif_state_t, mbif_init, mbif_avail, mbif_next. */
+typedef struct mbif_state mbif_state_t;
+#if !GNULIB_MBRTOC32_REGULAR
+#define mbif_init(st) \
+ ((st).in_shift = false, mbszero (&(st).state))
+#else
+/* Optimized: no in_shift member, so only the mbstate_t is reset. */
+#define mbif_init(st) \
+ (mbszero (&(st).state))
+#endif
+#if !GNULIB_MBRTOC32_REGULAR
+#define mbif_avail(st, iter, endptr) ((st).in_shift || ((iter) < (endptr)))
+#else
+/* Optimized: no in_shift member, so only the pointer comparison remains. */
+#define mbif_avail(st, iter, endptr) ((iter) < (endptr))
+#endif
+#define mbif_next(st, iter, endptr) \
+ mbiterf_next (&(st), (iter), (endptr))
+
+_GL_INLINE_HEADER_END
+
+#endif /* _MBITERF_H */