Imported Upstream version 0.9.1 [upstream/0.9.1]

author: Andreas Rottmann <a.rottmann@gmx.at> 2009-09-14 12:32:44 +0200
committer: Andreas Rottmann <a.rottmann@gmx.at> 2009-09-14 12:32:44 +0200
commit: fa095a4504cbe668e4244547e2c141597bea4ecf (patch)
tree: 06135820a286ffec47804e75fbf8a147e92acd2e /lib/striconveh.c
1 files changed, 1251 insertions, 0 deletions
diff --git a/lib/striconveh.c b/lib/striconveh.c
new file mode 100644
index 00000000..b39a01f1
--- /dev/null
+++ b/lib/striconveh.c
@@ -0,0 +1,1251 @@
+/* Character set conversion with error handling.
+   Copyright (C) 2001-2008 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "striconveh.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_ICONV
+# include <iconv.h>
+# include "unistr.h"
+#endif
+
+#include "c-strcase.h"
+#include "c-strcaseeq.h"
+
+#ifndef SIZE_MAX
+# define SIZE_MAX ((size_t) -1)
+#endif
+
+
+#if HAVE_ICONV
+
+/* The caller must provide CD, CD1, CD2, not just CD, because when a conversion
+   error occurs, we may have to determine the Unicode representation of the
+   inconvertible character.  */
+
+/* iconv_carefully is like iconv, except that it stops as soon as it encounters
+   a conversion error, and it returns in *INCREMENTED a boolean telling whether
+   it has incremented the input pointers past the error location.  */
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+/* Irix iconv() inserts a NUL byte and NetBSD iconv() a question mark if they
+   cannot convert; only GNU libiconv and GNU libc are known to prefer to fail
+   rather than doing a lossy conversion.  This variant therefore turns a
+   positive iconv() return value into a failure with errno = EILSEQ.  */
+static size_t
+iconv_carefully (iconv_t cd,
+		 const char **inbuf, size_t *inbytesleft,
+		 char **outbuf, size_t *outbytesleft,
+		 bool *incremented)
+{
+  const char *inptr = *inbuf;
+  const char *inptr_end = inptr + *inbytesleft;
+  char *outptr = *outbuf;
+  size_t outsize = *outbytesleft;
+  const char *inptr_before; /* input position at the start of the current round */
+  size_t res;
+
+  do
+    {
+      size_t insize;
+
+      inptr_before = inptr;
+      res = (size_t)(-1); /* stays -1 if the for loop below never calls iconv */
+
+      /* Offer iconv a window that grows by one byte after each EINVAL.  */
+      for (insize = 1; inptr + insize <= inptr_end; insize++)
+	{
+	  res = iconv (cd,
+		       (ICONV_CONST char **) &inptr, &insize,
+		       &outptr, &outsize);
+	  if (!(res == (size_t)(-1) && errno == EINVAL))
+	    break; /* success, lossy conversion, or an error other than EINVAL */
+	  /* iconv can eat up a shift sequence but give EINVAL while attempting
+	     to convert the first character.  E.g. libiconv does this.  */
+	  if (inptr > inptr_before)
+	    {
+	      res = 0; /* input was consumed, so count this round as progress */
+	      break;
+	    }
+	}
+
+      if (res == 0) /* publish the output position only after a clean round */
+	{
+	  *outbuf = outptr;
+	  *outbytesleft = outsize;
+	}
+    }
+  while (res == 0 && inptr < inptr_end);
+
+  *inbuf = inptr; /* the input position is always reported back */
+  *inbytesleft = inptr_end - inptr;
+  if (res != (size_t)(-1) && res > 0)
+    {
+      /* iconv() has already incremented INPTR.  We cannot go back to a
+	 previous INPTR, otherwise the state inside CD would become invalid,
+	 if FROM_CODESET is a stateful encoding.  So, tell the caller that
+	 *INBUF has already been incremented.  */
+      *incremented = (inptr > inptr_before);
+      errno = EILSEQ;
+      return (size_t)(-1);
+    }
+  else
+    {
+      *incremented = false;
+      return res; /* 0, or (size_t)(-1) with errno as left by iconv */
+    }
+}
+# else /* GNU libiconv or GNU libc: plain iconv, *INCREMENTED always false */
+#  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
+     (*(incremented) = false, \
+      iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
+# endif
+
+/* iconv_carefully_1 is like iconv_carefully, except that it stops after
+   converting one character or one shift sequence.  */
+static size_t
+iconv_carefully_1 (iconv_t cd,
+		   const char **inbuf, size_t *inbytesleft,
+		   char **outbuf, size_t *outbytesleft,
+		   bool *incremented)
+{
+  const char *inptr_before = *inbuf; /* fixed start of every attempt */
+  const char *inptr = inptr_before;
+  const char *inptr_end = inptr_before + *inbytesleft;
+  char *outptr = *outbuf;
+  size_t outsize = *outbytesleft;
+  size_t res = (size_t)(-1); /* result if *inbytesleft is 0 and iconv is never called */
+  size_t insize;
+
+  /* Retry from the same start with a window one byte longer after each EINVAL.  */
+  for (insize = 1; inptr_before + insize <= inptr_end; insize++)
+    {
+      inptr = inptr_before;
+      res = iconv (cd,
+		   (ICONV_CONST char **) &inptr, &insize,
+		   &outptr, &outsize);
+      if (!(res == (size_t)(-1) && errno == EINVAL))
+	break; /* converted something, or failed for a reason other than EINVAL */
+      /* iconv can eat up a shift sequence but give EINVAL while attempting
+	 to convert the first character.  E.g. libiconv does this.  */
+      if (inptr > inptr_before)
+	{
+	  res = 0; /* the consumed shift sequence counts as one successful step */
+	  break;
+	}
+    }
+
+  *inbuf = inptr; /* the input position is reported back on every path */
+  *inbytesleft = inptr_end - inptr;
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+  /* Irix iconv() inserts a NUL byte if it cannot convert.
+     NetBSD iconv() inserts a question mark if it cannot convert.
+     Only GNU libiconv and GNU libc are known to prefer to fail rather
+     than doing a lossy conversion.  */
+  if (res != (size_t)(-1) && res > 0)
+    {
+      /* iconv() has already incremented INPTR.  We cannot go back to a
+	 previous INPTR, otherwise the state inside CD would become invalid,
+	 if FROM_CODESET is a stateful encoding.  So, tell the caller that
+	 *INBUF has already been incremented.  */
+      *incremented = (inptr > inptr_before);
+      errno = EILSEQ;
+      return (size_t)(-1); /* *outbuf and *outbytesleft are left untouched here */
+    }
+# endif
+
+  if (res != (size_t)(-1)) /* publish the output position only on success */
+    {
+      *outbuf = outptr;
+      *outbytesleft = outsize;
+    }
+  *incremented = false;
+  return res;
+}
+
+/* utf8conv_carefully is like iconv, except that
+     - it converts from UTF-8 to UTF-8,
+     - it stops as soon as it encounters a conversion error, and it returns
+       in *INCREMENTED a boolean telling whether it has incremented the input
+       pointers past the error location (it is not assigned on success),
+     - if one_character_only is true, it stops after converting one
+       character.  */
+static size_t
+utf8conv_carefully (bool one_character_only,
+		    const char **inbuf, size_t *inbytesleft,
+		    char **outbuf, size_t *outbytesleft,
+		    bool *incremented)
+{
+  const char *inptr = *inbuf;
+  size_t insize = *inbytesleft;
+  char *outptr = *outbuf;
+  size_t outsize = *outbytesleft;
+  size_t res;
+
+  res = 0;
+  do
+    {
+      ucs4_t uc;
+      int n; /* result of decoding at inptr */
+      int m; /* result of encoding into outptr */
+
+      n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
+      if (n < 0)
+	{
+	  errno = (n == -2 ? EINVAL : EILSEQ); /* -2 maps to EINVAL, other negatives to EILSEQ */
+	  n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize); /* decode again to get the count to skip */
+	  inptr += n;
+	  insize -= n;
+	  res = (size_t)(-1);
+	  *incremented = true; /* the offending bytes have been skipped */
+	  break;
+	}
+      if (outsize == 0)
+	{
+	  errno = E2BIG; /* no output room at all; the input is not consumed */
+	  res = (size_t)(-1);
+	  *incremented = false;
+	  break;
+	}
+      m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
+      if (m == -2)
+	{
+	  errno = E2BIG; /* -2 is treated as "output too small"; the input is not consumed */
+	  res = (size_t)(-1);
+	  *incremented = false;
+	  break;
+	}
+      inptr += n; /* consume the input before the m == -1 check, hence incremented = true below */
+      insize -= n;
+      if (m == -1)
+	{
+	  errno = EILSEQ;
+	  res = (size_t)(-1);
+	  *incremented = true;
+	  break;
+	}
+      outptr += m;
+      outsize -= m;
+    }
+  while (!one_character_only && insize > 0);
+
+  /* Unlike the iconv wrappers, all four positions are written back on every path.  */
+  *inbuf = inptr;
+  *inbytesleft = insize;
+  *outbuf = outptr;
+  *outbytesleft = outsize;
+  return res;
+}
+
+static int /* returns 0 on success, -1 with errno set on failure */
+mem_cd_iconveh_internal (const char *src, size_t srclen, /* input bytes */
+			 iconv_t cd, iconv_t cd1, iconv_t cd2, /* direct; FROM->UTF-8; UTF-8->TO; (iconv_t)(-1) if unused */
+			 enum iconv_ilseq_handler handler, /* policy for EILSEQ */
+			 size_t extra_alloc, /* bytes kept spare after the result */
+			 size_t *offsets, /* NULL, or srclen slots: input offset -> output offset */
+			 char **resultp, size_t *lengthp) /* in: optional buffer + size; out: result + length */
+{
+  /* When a conversion error occurs, we cannot start using CD1 and CD2 at
+     this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
+     Instead, we have to start afresh from the beginning of SRC.  */
+  /* Use a temporary buffer, so that for small strings, a single malloc()
+     call will be sufficient.  */
+# define tmpbufsize 4096
+  /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
+     libiconv's UCS-4-INTERNAL encoding.  */
+  union { unsigned int align; char buf[tmpbufsize]; } tmp;
+# define tmpbuf tmp.buf
+
+  char *initial_result; /* tmpbuf or the caller's buffer; never freed here */
+  char *result;
+  size_t allocated;
+  size_t length;
+  size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
+
+  /* The caller's buffer is used only if it is at least as large as tmpbuf.  */
+  if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
+    {
+      initial_result = *resultp;
+      allocated = *lengthp;
+    }
+  else
+    {
+      initial_result = tmpbuf;
+      allocated = sizeof (tmpbuf);
+    }
+  result = initial_result;
+
+  /* Test whether a direct conversion is possible at all.  */
+  if (cd == (iconv_t)(-1))
+    goto indirectly;
+
+  if (offsets != NULL)
+    {
+      size_t i;
+
+      for (i = 0; i < srclen; i++)
+	offsets[i] = (size_t)(-1);
+
+      last_length = (size_t)(-1);
+    }
+  length = 0;
+
+  /* First, try a direct conversion, and see whether a conversion error
+     occurs at all.  */
+  {
+    const char *inptr = src;
+    size_t insize = srclen;
+
+    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    /* Set to the initial state.  */
+    iconv (cd, NULL, NULL, NULL, NULL);
+# endif
+
+    while (insize > 0)
+      {
+	char *outptr = result + length;
+	size_t outsize = allocated - extra_alloc - length;
+	bool incremented;
+	size_t res;
+	bool grow;
+
+	if (offsets != NULL)
+	  {
+	    if (length != last_length) /* ensure that offset[] be increasing */
+	      {
+		offsets[inptr - src] = length;
+		last_length = length;
+	      }
+	    res = iconv_carefully_1 (cd,
+				     &inptr, &insize,
+				     &outptr, &outsize,
+				     &incremented);
+	  }
+	else
+	  /* Use iconv_carefully instead of iconv here, because:
+	     - If TO_CODESET is UTF-8, we can do the error handling in this
+	       loop, no need for a second loop,
+	     - With iconv() implementations other than GNU libiconv and GNU
+	       libc, if we use iconv() in a big swoop, checking for an E2BIG
+	       return, we lose the number of irreversible conversions.  */
+	  res = iconv_carefully (cd,
+				 &inptr, &insize,
+				 &outptr, &outsize,
+				 &incremented);
+
+	length = outptr - result;
+	grow = (length + extra_alloc > allocated / 2);
+	if (res == (size_t)(-1))
+	  {
+	    if (errno == E2BIG)
+	      grow = true;
+	    else if (errno == EINVAL)
+	      break;
+	    else if (errno == EILSEQ && handler != iconveh_error)
+	      {
+		if (cd2 == (iconv_t)(-1))
+		  {
+		    /* TO_CODESET is UTF-8.  */
+		    /* Error handling can produce up to 1 byte of output.  */
+		    if (length + 1 + extra_alloc > allocated)
+		      {
+			char *memory;
+
+			allocated = 2 * allocated;
+			if (length + 1 + extra_alloc > allocated)
+			  abort ();
+			if (result == initial_result)
+			  memory = (char *) malloc (allocated);
+			else
+			  memory = (char *) realloc (result, allocated);
+			if (memory == NULL)
+			  {
+			    if (result != initial_result)
+			      free (result);
+			    errno = ENOMEM;
+			    return -1;
+			  }
+			if (result == initial_result)
+			  memcpy (memory, initial_result, length);
+			result = memory;
+			grow = false;
+		      }
+		    /* The input is invalid in FROM_CODESET.  Eat up one byte
+		       and emit a question mark.  */
+		    if (!incremented)
+		      {
+			if (insize == 0)
+			  abort ();
+			inptr++;
+			insize--;
+		      }
+		    result[length] = '?';
+		    length++;
+		  }
+		else
+		  goto indirectly;
+	      }
+	    else
+	      {
+		if (result != initial_result)
+		  {
+		    int saved_errno = errno;
+		    free (result);
+		    errno = saved_errno;
+		  }
+		return -1;
+	      }
+	  }
+	if (insize == 0)
+	  break;
+	if (grow)
+	  {
+	    char *memory;
+
+	    allocated = 2 * allocated;
+	    if (result == initial_result)
+	      memory = (char *) malloc (allocated);
+	    else
+	      memory = (char *) realloc (result, allocated);
+	    if (memory == NULL)
+	      {
+		if (result != initial_result)
+		  free (result);
+		errno = ENOMEM;
+		return -1;
+	      }
+	    if (result == initial_result)
+	      memcpy (memory, initial_result, length);
+	    result = memory;
+	  }
+      }
+  }
+
+  /* Now get the conversion state back to the initial state.
+     But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+#if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+  for (;;)
+    {
+      char *outptr = result + length;
+      size_t outsize = allocated - extra_alloc - length;
+      size_t res;
+
+      res = iconv (cd, NULL, NULL, &outptr, &outsize);
+      length = outptr - result;
+      if (res == (size_t)(-1))
+	{
+	  if (errno == E2BIG)
+	    {
+	      char *memory;
+
+	      allocated = 2 * allocated;
+	      if (result == initial_result)
+		memory = (char *) malloc (allocated);
+	      else
+		memory = (char *) realloc (result, allocated);
+	      if (memory == NULL)
+		{
+		  if (result != initial_result)
+		    free (result);
+		  errno = ENOMEM;
+		  return -1;
+		}
+	      if (result == initial_result)
+		memcpy (memory, initial_result, length);
+	      result = memory;
+	    }
+	  else
+	    {
+	      if (result != initial_result)
+		{
+		  int saved_errno = errno;
+		  free (result);
+		  errno = saved_errno;
+		}
+	      return -1;
+	    }
+	}
+      else
+	break;
+    }
+#endif
+
+  /* The direct conversion succeeded.  */
+  goto done;
+
+ indirectly:
+  /* The direct conversion failed.
+     Use a conversion through UTF-8.  */
+  if (offsets != NULL)
+    {
+      size_t i;
+
+      for (i = 0; i < srclen; i++)
+	offsets[i] = (size_t)(-1);
+
+      last_length = (size_t)(-1);
+    }
+  length = 0;
+  {
+    const bool slowly = (offsets != NULL || handler == iconveh_error);
+# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
+    char utf8buf[utf8bufsize + 1]; /* the extra byte holds a '?' replacement */
+    size_t utf8len = 0;
+    const char *in1ptr = src;
+    size_t in1size = srclen;
+    bool do_final_flush1 = true;
+    bool do_final_flush2 = true;
+
+    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    /* Set to the initial state.  */
+    if (cd1 != (iconv_t)(-1))
+      iconv (cd1, NULL, NULL, NULL, NULL);
+    if (cd2 != (iconv_t)(-1))
+      iconv (cd2, NULL, NULL, NULL, NULL);
+# endif
+
+    while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
+      {
+	char *out1ptr = utf8buf + utf8len;
+	size_t out1size = utf8bufsize - utf8len;
+	bool incremented1;
+	size_t res1;
+	int errno1;
+
+	/* Conversion step 1: from FROM_CODESET to UTF-8.  */
+	if (in1size > 0)
+	  {
+	    if (offsets != NULL
+		&& length != last_length) /* ensure that offset[] be increasing */
+	      {
+		offsets[in1ptr - src] = length;
+		last_length = length;
+	      }
+	    if (cd1 != (iconv_t)(-1))
+	      {
+		if (slowly)
+		  res1 = iconv_carefully_1 (cd1,
+					    &in1ptr, &in1size,
+					    &out1ptr, &out1size,
+					    &incremented1);
+		else
+		  res1 = iconv_carefully (cd1,
+					  &in1ptr, &in1size,
+					  &out1ptr, &out1size,
+					  &incremented1);
+	      }
+	    else
+	      {
+		/* FROM_CODESET is UTF-8.  */
+		res1 = utf8conv_carefully (slowly,
+					   &in1ptr, &in1size,
+					   &out1ptr, &out1size,
+					   &incremented1);
+	      }
+	  }
+	else if (do_final_flush1)
+	  {
+	    /* Now get the conversion state of CD1 back to the initial state.
+	       But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+	    if (cd1 != (iconv_t)(-1))
+	      res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
+	    else
+# endif
+	      res1 = 0;
+	    do_final_flush1 = false;
+	    incremented1 = true;
+	  }
+	else
+	  {
+	    res1 = 0;
+	    incremented1 = true;
+	  }
+	if (res1 == (size_t)(-1)
+	    && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
+	  {
+	    if (result != initial_result)
+	      {
+		int saved_errno = errno;
+		free (result);
+		errno = saved_errno;
+	      }
+	    return -1;
+	  }
+	if (res1 == (size_t)(-1)
+	    && errno == EILSEQ && handler != iconveh_error)
+	  {
+	    /* The input is invalid in FROM_CODESET.  Eat up one byte and emit a
+	       question mark.  Room for it was allocated at the end of utf8buf.  */
+	    if (!incremented1)
+	      {
+		if (in1size == 0)
+		  abort ();
+		in1ptr++;
+		in1size--;
+	      }
+	    *out1ptr++ = '?'; /* append via out1ptr: utf8len is recomputed from it below */
+	    res1 = 0; /* handled here; must not be reported as EILSEQ after step 2 */
+	  }
+	errno1 = errno;
+	utf8len = out1ptr - utf8buf;
+
+	if (offsets != NULL
+	    || in1size == 0
+	    || utf8len > utf8bufsize / 2
+	    || (res1 == (size_t)(-1) && errno1 == E2BIG))
+	  {
+	    /* Conversion step 2: from UTF-8 to TO_CODESET.  */
+	    const char *in2ptr = utf8buf;
+	    size_t in2size = utf8len;
+
+	    while (in2size > 0
+		   || (in1size == 0 && !do_final_flush1 && do_final_flush2))
+	      {
+		char *out2ptr = result + length;
+		size_t out2size = allocated - extra_alloc - length;
+		bool incremented2;
+		size_t res2;
+		bool grow;
+
+		if (in2size > 0)
+		  {
+		    if (cd2 != (iconv_t)(-1))
+		      res2 = iconv_carefully (cd2,
+					      &in2ptr, &in2size,
+					      &out2ptr, &out2size,
+					      &incremented2);
+		    else
+		      /* TO_CODESET is UTF-8.  */
+		      res2 = utf8conv_carefully (false,
+						 &in2ptr, &in2size,
+						 &out2ptr, &out2size,
+						 &incremented2);
+		  }
+		else /* in1size == 0 && !do_final_flush1
+			&& in2size == 0 && do_final_flush2 */
+		  {
+		    /* Now get the conversion state of CD2 back to the initial
+		       state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+     || !((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) || defined __sun)
+		    if (cd2 != (iconv_t)(-1))
+		      res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
+		    else
+# endif
+		      res2 = 0;
+		    do_final_flush2 = false;
+		    incremented2 = true;
+		  }
+
+		length = out2ptr - result;
+		grow = (length + extra_alloc > allocated / 2);
+		if (res2 == (size_t)(-1))
+		  {
+		    if (errno == E2BIG)
+		      grow = true;
+		    else if (errno == EINVAL)
+		      break;
+		    else if (errno == EILSEQ && handler != iconveh_error)
+		      {
+			/* Error handling can produce up to 10 bytes of ASCII
+			   output.  But TO_CODESET may be UCS-2, UTF-16 or
+			   UCS-4, so use CD2 here as well.  */
+			char scratchbuf[10];
+			size_t scratchlen;
+			ucs4_t uc;
+			const char *inptr;
+			size_t insize;
+			size_t res;
+
+			if (incremented2)
+			  {
+			    if (u8_prev (&uc, (const uint8_t *) in2ptr,
+					 (const uint8_t *) utf8buf)
+				== NULL)
+			      abort ();
+			  }
+			else
+			  {
+			    int n;
+			    if (in2size == 0)
+			      abort ();
+			    n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
+						  in2size);
+			    in2ptr += n;
+			    in2size -= n;
+			  }
+
+			if (handler == iconveh_escape_sequence)
+			  {
+			    static char hex[16] = "0123456789ABCDEF";
+			    scratchlen = 0;
+			    scratchbuf[scratchlen++] = '\\';
+			    if (uc < 0x10000)
+			      scratchbuf[scratchlen++] = 'u';
+			    else
+			      {
+				scratchbuf[scratchlen++] = 'U';
+				scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
+				scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
+				scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
+				scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
+			      }
+			    scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
+			    scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
+			    scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
+			    scratchbuf[scratchlen++] = hex[uc & 15];
+			  }
+			else
+			  {
+			    scratchbuf[0] = '?';
+			    scratchlen = 1;
+			  }
+
+			inptr = scratchbuf;
+			insize = scratchlen;
+			if (cd2 != (iconv_t)(-1))
+			  res = iconv (cd2,
+				       (ICONV_CONST char **) &inptr, &insize,
+				       &out2ptr, &out2size);
+			else
+			  {
+			    /* TO_CODESET is UTF-8.  */
+			    if (out2size >= insize)
+			      {
+				memcpy (out2ptr, inptr, insize);
+				out2ptr += insize;
+				out2size -= insize;
+				inptr += insize;
+				insize = 0;
+				res = 0;
+			      }
+			    else
+			      {
+				errno = E2BIG;
+				res = (size_t)(-1);
+			      }
+			  }
+			length = out2ptr - result;
+			if (res == (size_t)(-1) && errno == E2BIG)
+			  {
+			    char *memory;
+
+			    allocated = 2 * allocated;
+			    if (length + 1 + extra_alloc > allocated)
+			      abort ();
+			    if (result == initial_result)
+			      memory = (char *) malloc (allocated);
+			    else
+			      memory = (char *) realloc (result, allocated);
+			    if (memory == NULL)
+			      {
+				if (result != initial_result)
+				  free (result);
+				errno = ENOMEM;
+				return -1;
+			      }
+			    if (result == initial_result)
+			      memcpy (memory, initial_result, length);
+			    result = memory;
+			    grow = false;
+
+			    out2ptr = result + length;
+			    out2size = allocated - extra_alloc - length;
+			    if (cd2 != (iconv_t)(-1))
+			      res = iconv (cd2,
+					   (ICONV_CONST char **) &inptr,
+					   &insize,
+					   &out2ptr, &out2size);
+			    else
+			      {
+				/* TO_CODESET is UTF-8.  */
+				if (!(out2size >= insize))
+				  abort ();
+				memcpy (out2ptr, inptr, insize);
+				out2ptr += insize;
+				out2size -= insize;
+				inptr += insize;
+				insize = 0;
+				res = 0;
+			      }
+			    length = out2ptr - result;
+			  }
+# if !defined _LIBICONV_VERSION && !defined __GLIBC__
+			/* Irix iconv() inserts a NUL byte if it cannot convert.
+			   NetBSD iconv() inserts a question mark if it cannot
+			   convert.
+			   Only GNU libiconv and GNU libc are known to prefer
+			   to fail rather than doing a lossy conversion.  */
+			if (res != (size_t)(-1) && res > 0)
+			  {
+			    errno = EILSEQ;
+			    res = (size_t)(-1);
+			  }
+# endif
+			if (res == (size_t)(-1))
+			  {
+			    /* Failure converting the ASCII replacement.  */
+			    if (result != initial_result)
+			      {
+				int saved_errno = errno;
+				free (result);
+				errno = saved_errno;
+			      }
+			    return -1;
+			  }
+		      }
+		    else
+		      {
+			if (result != initial_result)
+			  {
+			    int saved_errno = errno;
+			    free (result);
+			    errno = saved_errno;
+			  }
+			return -1;
+		      }
+		  }
+		if (!(in2size > 0
+		      || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
+		  break;
+		if (grow)
+		  {
+		    char *memory;
+
+		    allocated = 2 * allocated;
+		    if (result == initial_result)
+		      memory = (char *) malloc (allocated);
+		    else
+		      memory = (char *) realloc (result, allocated);
+		    if (memory == NULL)
+		      {
+			if (result != initial_result)
+			  free (result);
+			errno = ENOMEM;
+			return -1;
+		      }
+		    if (result == initial_result)
+		      memcpy (memory, initial_result, length);
+		    result = memory;
+		  }
+	      }
+
+	    /* Move the remaining bytes to the beginning of utf8buf.  */
+	    if (in2size > 0)
+	      memmove (utf8buf, in2ptr, in2size);
+	    utf8len = in2size;
+	  }
+
+	if (res1 == (size_t)(-1))
+	  {
+	    if (errno1 == EINVAL)
+	      in1size = 0; /* incomplete input at the end: stop reading, still flush */
+	    else if (errno1 == EILSEQ) /* only reachable when handler == iconveh_error */
+	      {
+		if (result != initial_result)
+		  free (result);
+		errno = errno1;
+		return -1;
+	      }
+	  }
+      }
+# undef utf8bufsize
+  }
+
+ done:
+  /* Now the final memory allocation.  */
+  if (result == tmpbuf)
+    {
+      size_t memsize = length + extra_alloc;
+      char *memory;
+
+      memory = (char *) malloc (memsize > 0 ? memsize : 1);
+      if (memory != NULL)
+	{
+	  memcpy (memory, tmpbuf, length);
+	  result = memory;
+	}
+      else
+	{
+	  errno = ENOMEM;
+	  return -1;
+        }
+    }
+  else if (result != *resultp && length + extra_alloc < allocated)
+    {
+      /* Shrink the allocated memory if possible.  */
+      size_t memsize = length + extra_alloc;
+      char *memory;
+
+      memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
+      if (memory != NULL)
+	result = memory;
+    }
+  *resultp = result; /* both outputs are written only on success */
+  *lengthp = length;
+  return 0;
+# undef tmpbuf
+# undef tmpbufsize
+}
+
+int
+mem_cd_iconveh (const char *src, size_t srclen,
+		iconv_t cd, iconv_t cd1, iconv_t cd2,
+		enum iconv_ilseq_handler handler, size_t *offsets,
+		char **resultp, size_t *lengthp)
+{
+  const size_t no_extra_alloc = 0; /* reserve no spare bytes after the result */
+  return mem_cd_iconveh_internal (src, srclen, cd, cd1, cd2, handler,
+				  no_extra_alloc, offsets, resultp, lengthp);
+}
+
+char *
+str_cd_iconveh (const char *src,
+		iconv_t cd, iconv_t cd1, iconv_t cd2,
+		enum iconv_ilseq_handler handler)
+{
+  /* UTF-7 does not turn a trailing input NUL into a trailing output NUL, as
+     most encodings do.  To stay usable for UTF-7, convert only the strlen (src)
+     bytes in front of the NUL, ask for one spare byte, and store the
+     terminator by hand afterwards.  */
+  char *converted = NULL;
+  size_t nbytes = 0;
+  int status;
+
+  status = mem_cd_iconveh_internal (src, strlen (src), cd, cd1, cd2, handler,
+				    1, NULL, &converted, &nbytes);
+  if (status >= 0)
+    {
+      /* Add the terminating NUL byte.  */
+      converted[nbytes] = '\0';
+      return converted;
+    }
+
+  /* Failure: release whatever was handed back, keeping errno intact.  */
+  if (converted != NULL)
+    {
+      int err = errno;
+      free (converted);
+      errno = err;
+    }
+  return NULL;
+}
+
+#endif
+
+int /* returns 0 on success, -1 with errno set on failure */
+mem_iconveh (const char *src, size_t srclen, /* input bytes */
+	     const char *from_codeset, const char *to_codeset,
+	     enum iconv_ilseq_handler handler,
+	     size_t *offsets, /* passed through to mem_cd_iconveh; may be NULL */
+	     char **resultp, size_t *lengthp) /* in: optional buffer + size; out: result + length */
+{
+  if (srclen == 0)
+    {
+      /* Nothing to convert.  */
+      *lengthp = 0;
+      return 0; /* *resultp is left untouched on this path */
+    }
+  else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
+    {
+      char *result; /* same codeset name and no offsets wanted: plain copy */
+
+      if (*resultp != NULL && *lengthp >= srclen)
+	result = *resultp; /* the caller's buffer is large enough */
+      else
+	{
+	  result = (char *) malloc (srclen);
+	  if (result == NULL)
+	    {
+	      errno = ENOMEM;
+	      return -1;
+	    }
+	}
+      memcpy (result, src, srclen);
+      *resultp = result;
+      *lengthp = srclen;
+      return 0;
+    }
+  else
+    {
+#if HAVE_ICONV
+      iconv_t cd;  /* FROM_CODESET -> TO_CODESET */
+      iconv_t cd1; /* FROM_CODESET -> UTF-8, or (iconv_t)(-1) if FROM is UTF-8 */
+      iconv_t cd2; /* UTF-8 -> TO_CODESET, or (iconv_t)(-1) if TO is UTF-8 */
+      char *result;
+      size_t length;
+      int retval;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (c_strcasecmp (from_codeset, "EUC-KR") == 0
+	  || c_strcasecmp (to_codeset, "EUC-KR") == 0)
+	{
+	  errno = EINVAL;
+	  return -1;
+	}
+# endif
+
+      cd = iconv_open (to_codeset, from_codeset); /* failure is not checked here; cd may be (iconv_t)(-1) */
+
+      if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
+	cd1 = (iconv_t)(-1);
+      else
+	{
+	  cd1 = iconv_open ("UTF-8", from_codeset);
+	  if (cd1 == (iconv_t)(-1))
+	    {
+	      int saved_errno = errno; /* keep iconv_open's errno across iconv_close */
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      errno = saved_errno;
+	      return -1;
+	    }
+	}
+
+      if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
+# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
+	  || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
+# endif
+	 )
+	cd2 = (iconv_t)(-1);
+      else
+	{
+	  cd2 = iconv_open (to_codeset, "UTF-8");
+	  if (cd2 == (iconv_t)(-1))
+	    {
+	      int saved_errno = errno; /* keep iconv_open's errno across iconv_close */
+	      if (cd1 != (iconv_t)(-1))
+		iconv_close (cd1);
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      errno = saved_errno;
+	      return -1;
+	    }
+	}
+
+      result = *resultp; /* work on copies; the caller's values change only on full success */
+      length = *lengthp;
+      retval = mem_cd_iconveh (src, srclen, cd, cd1, cd2, handler, offsets,
+			       &result, &length);
+
+      if (retval < 0)
+	{
+	  /* Close cd, cd1, cd2, but preserve the errno from mem_cd_iconveh.  */
+	  int saved_errno = errno;
+	  if (cd2 != (iconv_t)(-1))
+	    iconv_close (cd2);
+	  if (cd1 != (iconv_t)(-1))
+	    iconv_close (cd1);
+	  if (cd != (iconv_t)(-1))
+	    iconv_close (cd);
+	  errno = saved_errno;
+	}
+      else
+	{
+	  if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
+	    {
+	      /* Return -1, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      if (cd1 != (iconv_t)(-1))
+		iconv_close (cd1);
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      if (result != *resultp && result != NULL) /* never free the caller's own buffer */
+		free (result);
+	      errno = saved_errno;
+	      return -1;
+	    }
+	  if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
+	    {
+	      /* Return -1, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      if (result != *resultp && result != NULL)
+		free (result);
+	      errno = saved_errno;
+	      return -1;
+	    }
+	  if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
+	    {
+	      /* Return -1, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      if (result != *resultp && result != NULL)
+		free (result);
+	      errno = saved_errno;
+	      return -1;
+	    }
+	  *resultp = result;
+	  *lengthp = length;
+	}
+      return retval;
+#else
+      /* This is a different error code than if iconv_open existed but didn't
+	 support from_codeset and to_codeset, so that the caller can emit
+	 an error message such as
+	   "iconv() is not supported. Installing GNU libiconv and
+	    then reinstalling this package would fix this."  */
+      errno = ENOSYS;
+      return -1;
+#endif
+    }
+}
+
+char * /* returns a freshly allocated string, or NULL with errno set */
+str_iconveh (const char *src, /* NUL-terminated input */
+	     const char *from_codeset, const char *to_codeset,
+	     enum iconv_ilseq_handler handler)
+{
+  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
+    {
+      char *result = strdup (src); /* empty input or same codeset name: plain copy */
+
+      if (result == NULL)
+	errno = ENOMEM;
+      return result;
+    }
+  else
+    {
+#if HAVE_ICONV
+      iconv_t cd;  /* FROM_CODESET -> TO_CODESET */
+      iconv_t cd1; /* FROM_CODESET -> UTF-8, or (iconv_t)(-1) if FROM is UTF-8 */
+      iconv_t cd2; /* UTF-8 -> TO_CODESET, or (iconv_t)(-1) if TO is UTF-8 */
+      char *result;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (c_strcasecmp (from_codeset, "EUC-KR") == 0
+	  || c_strcasecmp (to_codeset, "EUC-KR") == 0)
+	{
+	  errno = EINVAL;
+	  return NULL;
+	}
+# endif
+
+      cd = iconv_open (to_codeset, from_codeset); /* failure is not checked here; cd may be (iconv_t)(-1) */
+
+      if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
+	cd1 = (iconv_t)(-1);
+      else
+	{
+	  cd1 = iconv_open ("UTF-8", from_codeset);
+	  if (cd1 == (iconv_t)(-1))
+	    {
+	      int saved_errno = errno; /* keep iconv_open's errno across iconv_close */
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      errno = saved_errno;
+	      return NULL;
+	    }
+	}
+
+      if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
+# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 || _LIBICONV_VERSION >= 0x0105
+	  || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
+# endif
+	 )
+	cd2 = (iconv_t)(-1);
+      else
+	{
+	  cd2 = iconv_open (to_codeset, "UTF-8");
+	  if (cd2 == (iconv_t)(-1))
+	    {
+	      int saved_errno = errno; /* keep iconv_open's errno across iconv_close */
+	      if (cd1 != (iconv_t)(-1))
+		iconv_close (cd1);
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      errno = saved_errno;
+	      return NULL;
+	    }
+	}
+
+      result = str_cd_iconveh (src, cd, cd1, cd2, handler);
+
+      if (result == NULL)
+	{
+	  /* Close cd, cd1, cd2, but preserve the errno from str_cd_iconveh.  */
+	  int saved_errno = errno;
+	  if (cd2 != (iconv_t)(-1))
+	    iconv_close (cd2);
+	  if (cd1 != (iconv_t)(-1))
+	    iconv_close (cd1);
+	  if (cd != (iconv_t)(-1))
+	    iconv_close (cd);
+	  errno = saved_errno;
+	}
+      else
+	{
+	  if (cd2 != (iconv_t)(-1) && iconv_close (cd2) < 0)
+	    {
+	      /* Return NULL, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      if (cd1 != (iconv_t)(-1))
+		iconv_close (cd1);
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      free (result);
+	      errno = saved_errno;
+	      return NULL;
+	    }
+	  if (cd1 != (iconv_t)(-1) && iconv_close (cd1) < 0)
+	    {
+	      /* Return NULL, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      if (cd != (iconv_t)(-1))
+		iconv_close (cd);
+	      free (result);
+	      errno = saved_errno;
+	      return NULL;
+	    }
+	  if (cd != (iconv_t)(-1) && iconv_close (cd) < 0)
+	    {
+	      /* Return NULL, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      free (result);
+	      errno = saved_errno;
+	      return NULL;
+	    }
+	}
+      return result; /* NULL here means str_cd_iconveh failed; errno was restored above */
+#else
+      /* This is a different error code than if iconv_open existed but didn't
+	 support from_codeset and to_codeset, so that the caller can emit
+	 an error message such as
+	   "iconv() is not supported. Installing GNU libiconv and
+	    then reinstalling this package would fix this."  */
+      errno = ENOSYS;
+      return NULL;
+#endif
+    }
+}
author	Andreas Rottmann <a.rottmann@gmx.at>	2009-09-14 12:32:44 +0200
committer	Andreas Rottmann <a.rottmann@gmx.at>	2009-09-14 12:32:44 +0200
commit	fa095a4504cbe668e4244547e2c141597bea4ecf (patch)
tree	06135820a286ffec47804e75fbf8a147e92acd2e /lib/striconveh.c