diff options
author | Jörg Frings-Fürst <debian@jff.email> | 2022-01-08 11:53:52 +0100 |
---|---|---|
committer | Jörg Frings-Fürst <debian@jff.email> | 2022-01-08 11:53:52 +0100 |
commit | fa838e76139763f902c7d27cb9e1d393ed6a15e4 (patch) | |
tree | 7d0ae09775ea950056193eaa2ca93844299d46f1 /lib/striconveh.c | |
parent | c78359d9542c86b972aac373efcf7bc7a8a560e5 (diff) | |
parent | 2959e59fab3bab834368adefd90bd4b1b094366b (diff) |
Merge branch 'feature/upstream' into develop
Diffstat (limited to 'lib/striconveh.c')
-rw-r--r-- | lib/striconveh.c | 178 |
1 files changed, 103 insertions, 75 deletions
diff --git a/lib/striconveh.c b/lib/striconveh.c index 45d76f88..5b60a7e0 100644 --- a/lib/striconveh.c +++ b/lib/striconveh.c @@ -1,27 +1,18 @@ /* Character set conversion with error handling. - Copyright (C) 2001-2018 Free Software Foundation, Inc. + Copyright (C) 2001-2022 Free Software Foundation, Inc. Written by Bruno Haible and Simon Josefsson. - This program is free software: you can redistribute it and/or - modify it under the terms of either: + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - - or - - * the GNU General Public License as published by the Free - Software Foundation; either version 2 of the License, or (at your - option) any later version. - - or both in parallel, as here. - This program is distributed in the hope that it will be useful, + This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + GNU Lesser General Public License for more details. - You should have received a copy of the GNU General Public License + You should have received a copy of the GNU Lesser General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ #include <config.h> @@ -82,7 +73,7 @@ iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp) { int saved_errno = errno; if (cd != (iconv_t)(-1)) - iconv_close (cdp->cd); + iconv_close (cd); errno = saved_errno; return -1; } @@ -466,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, if (cd2 == (iconv_t)(-1)) { /* TO_CODESET is UTF-8. */ - /* Error handling can produce up to 1 byte of output. */ - if (length + 1 + extra_alloc > allocated) + /* Error handling can produce up to 1 or 3 bytes of + output. */ + size_t extra_need = + (handler == iconveh_replacement_character ? 3 : 1); + if (length + extra_need + extra_alloc > allocated) { char *memory; allocated = 2 * allocated; - if (length + 1 + extra_alloc > allocated) + if (length + extra_need + extra_alloc > allocated) + allocated = 2 * allocated; + if (length + extra_need + extra_alloc > allocated) abort (); if (result == initial_result) memory = (char *) malloc (allocated); @@ -491,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, grow = false; } /* The input is invalid in FROM_CODESET. Eat up one byte - and emit a question mark. */ + and emit a replacement character or a question mark. */ if (!incremented) { if (insize == 0) @@ -499,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, inptr++; insize--; } - result[length] = '?'; - length++; + if (handler == iconveh_replacement_character) + { + /* U+FFFD in UTF-8 encoding. */ + result[length+0] = '\357'; + result[length+1] = '\277'; + result[length+2] = '\275'; + length += 3; + } + else + { + result[length] = '?'; + length++; + } } else goto indirectly; @@ -508,11 +515,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, else { if (result != initial_result) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } + free (result); return -1; } } @@ -579,11 +582,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, else { if (result != initial_result) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } + free (result); return -1; } } @@ -611,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, { const bool slowly = (offsets != NULL || handler == iconveh_error); # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */ - char utf8buf[utf8bufsize + 1]; + char utf8buf[utf8bufsize + 3]; size_t utf8len = 0; const char *in1ptr = src; size_t in1size = srclen; @@ -692,19 +691,15 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ)) { if (result != initial_result) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } + free (result); return -1; } if (res1 == (size_t)(-1) && errno == EILSEQ && handler != iconveh_error) { /* The input is invalid in FROM_CODESET. Eat up one byte and - emit a question mark. Room for the question mark was allocated - at the end of utf8buf. */ + emit a U+FFFD character or a question mark. Room for this + character was allocated at the end of utf8buf. */ if (!incremented1) { if (in1size == 0) @@ -712,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, in1ptr++; in1size--; } - *out1ptr++ = '?'; + if (handler == iconveh_replacement_character) + { + /* U+FFFD in UTF-8 encoding. */ + out1ptr[0] = '\357'; + out1ptr[1] = '\277'; + out1ptr[2] = '\275'; + out1ptr += 3; + } + else + *out1ptr++ = '?'; res1 = 0; } errno1 = errno; @@ -777,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, break; else if (errno == EILSEQ && handler != iconveh_error) { - /* Error handling can produce up to 10 bytes of ASCII + /* Error handling can produce up to 10 bytes of UTF-8 output. But TO_CODESET may be UCS-2, UTF-16 or UCS-4, so use CD2 here as well. */ char scratchbuf[10]; @@ -825,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, scratchbuf[scratchlen++] = hex[(uc>>4) & 15]; scratchbuf[scratchlen++] = hex[uc & 15]; } + else if (handler == iconveh_replacement_character) + { + /* U+FFFD in UTF-8 encoding. */ + scratchbuf[0] = '\357'; + scratchbuf[1] = '\277'; + scratchbuf[2] = '\275'; + scratchlen = 3; + } else { scratchbuf[0] = '?'; @@ -834,9 +846,45 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, inptr = scratchbuf; insize = scratchlen; if (cd2 != (iconv_t)(-1)) - res = iconv (cd2, - (ICONV_CONST char **) &inptr, &insize, - &out2ptr, &out2size); + { + char *out2ptr_try = out2ptr; + size_t out2size_try = out2size; + res = iconv (cd2, + (ICONV_CONST char **) &inptr, &insize, + &out2ptr_try, &out2size_try); + if (handler == iconveh_replacement_character + && (res == (size_t)(-1) + ? errno == EILSEQ + /* FreeBSD iconv(), NetBSD iconv(), and + Solaris 11 iconv() insert a '?' if they + cannot convert. This is what we want. + But IRIX iconv() inserts a NUL byte if it + cannot convert. + And musl libc iconv() inserts a '*' if it + cannot convert. */ + : (res > 0 + && !(out2ptr_try - out2ptr == 1 + && *out2ptr == '?')))) + { + /* The iconv() call failed. + U+FFFD can't be converted to TO_CODESET. + Use '?' instead. */ + scratchbuf[0] = '?'; + scratchlen = 1; + inptr = scratchbuf; + insize = scratchlen; + res = iconv (cd2, + (ICONV_CONST char **) &inptr, &insize, + &out2ptr, &out2size); + } + else + { + /* Accept the results of the iconv() call. */ + out2ptr = out2ptr_try; + out2size = out2size_try; + res = 0; + } + } else { /* TO_CODESET is UTF-8. */ @@ -901,9 +949,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, length = out2ptr - result; } # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__) - /* Irix iconv() inserts a NUL byte if it cannot convert. - NetBSD iconv() inserts a question mark if it cannot - convert. + /* IRIX iconv() inserts a NUL byte if it cannot convert. + FreeBSD iconv(), NetBSD iconv(), and Solaris 11 + iconv() insert a '?' if they cannot convert. + musl libc iconv() inserts a '*' if it cannot convert. Only GNU libiconv and GNU libc are known to prefer to fail rather than doing a lossy conversion. */ if (res != (size_t)(-1) && res > 0) @@ -916,22 +965,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen, { /* Failure converting the ASCII replacement. */ if (result != initial_result) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } + free (result); return -1; } } else { if (result != initial_result) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } + free (result); return -1; } } @@ -1050,12 +1091,7 @@ str_cd_iconveh (const char *src, if (retval < 0) { - if (result != NULL) - { - int saved_errno = errno; - free (result); - errno = saved_errno; - } + free (result); return NULL; } @@ -1127,12 +1163,8 @@ mem_iconveh (const char *src, size_t srclen, { if (iconveh_close (&cd) < 0) { - /* Return -1, but free the allocated memory, and while doing - that, preserve the errno from iconveh_close. */ - int saved_errno = errno; - if (result != *resultp && result != NULL) + if (result != *resultp) free (result); - errno = saved_errno; return -1; } *resultp = result; @@ -1186,11 +1218,7 @@ str_iconveh (const char *src, { if (iconveh_close (&cd) < 0) { - /* Return NULL, but free the allocated memory, and while doing - that, preserve the errno from iconveh_close. */ - int saved_errno = errno; free (result); - errno = saved_errno; return NULL; } } |