1 files changed, 189 insertions, 125 deletions
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index 57fe4916..923028e3 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -1,33 +1,33 @@
 /* Line breaking of UTF-8 strings.
-   Copyright (C) 2001-2003, 2006-2018 Free Software Foundation, Inc.
+   Copyright (C) 2001-2003, 2006-2022 Free Software Foundation, Inc.
    Written by Bruno Haible <bruno@clisp.org>, 2001.
 
-   This program is free software: you can redistribute it and/or
-   modify it under the terms of either:
+   This file is free software.
+   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+   You can redistribute it and/or modify it under either
+     - the terms of the GNU Lesser General Public License as published
+       by the Free Software Foundation; either version 3, or (at your
+       option) any later version, or
+     - the terms of the GNU General Public License as published by the
+       Free Software Foundation; either version 2, or (at your option)
+       any later version, or
+     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
 
-     * the GNU Lesser General Public License as published by the Free
-       Software Foundation; either version 3 of the License, or (at your
-       option) any later version.
-
-   or
-
-     * the GNU General Public License as published by the Free
-       Software Foundation; either version 2 of the License, or (at your
-       option) any later version.
-
-   or both in parallel, as here.
-   This program is distributed in the hope that it will be useful,
+   This file is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
+   Lesser General Public License and the GNU General Public License
+   for more details.
 
-   You should have received a copy of the GNU Lesser General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+   You should have received a copy of the GNU Lesser General Public
+   License and of the GNU General Public License along with this
+   program.  If not, see <https://www.gnu.org/licenses/>.  */
 
 #include <config.h>
 
 /* Specification.  */
 #include "unilbrk.h"
+#include "unilbrk/internal.h"
 
 #include <stdlib.h>
 #include <string.h>
@@ -36,142 +36,202 @@
 #include "uniwidth/cjk.h"
 #include "unistr.h"
 
+/* This file implements
+   Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
+
 void
-u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *p)
+u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
+                             int cr, char *p)
 {
-  int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
-  const uint8_t *s_end = s + n;
-  int last_prop = LBP_BK; /* line break property of last non-space character */
-  char *seen_space = NULL; /* Was a space seen after the last non-space character? */
-  char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
+  if (n > 0)
+    {
+      int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
+      const uint8_t *s_end = s + n;
+      int prev_prop = LBP_BK; /* line break property of last character */
+      int last_prop = LBP_BK; /* line break property of last non-space character */
+      char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 
-  /* Don't break inside multibyte characters.  */
-  memset (p, UC_BREAK_PROHIBITED, n);
+      /* Don't break inside multibyte characters.  */
+      memset (p, UC_BREAK_PROHIBITED, n);
 
-  while (s < s_end)
-    {
-      ucs4_t uc;
-      int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
-      int prop = unilbrkprop_lookup (uc);
+      /* Number of consecutive regional indicator (RI) characters seen
+         immediately before the current point.  */
+      size_t ri_count = 0;
 
-      if (prop == LBP_BK)
+      do
         {
-          /* Mandatory break.  */
-          *p = UC_BREAK_MANDATORY;
-          last_prop = LBP_BK;
-          seen_space = NULL;
-          seen_space2 = NULL;
-        }
-      else
-        {
-          char *q;
-
-          /* Resolve property values whose behaviour is not fixed.  */
-          switch (prop)
-            {
-            case LBP_AI:
-              /* Resolve ambiguous.  */
-              prop = LBP_AI_REPLACEMENT;
-              break;
-            case LBP_CB:
-              /* This is arbitrary.  */
-              prop = LBP_ID;
-              break;
-            case LBP_SA:
-              /* We don't handle complex scripts yet.
-                 Treat LBP_SA like LBP_XX.  */
-            case LBP_XX:
-              /* This is arbitrary.  */
-              prop = LBP_AL;
-              break;
-            }
+          ucs4_t uc;
+          int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
+          int prop = unilbrkprop_lookup (uc);
 
-          /* Deal with spaces and combining characters.  */
-          q = p;
-          if (prop == LBP_SP)
+          if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
             {
-              /* Don't break just before a space.  */
-              *p = UC_BREAK_PROHIBITED;
-              seen_space2 = seen_space;
-              seen_space = p;
-            }
-          else if (prop == LBP_ZW)
-            {
-              /* Don't break just before a zero-width space.  */
-              *p = UC_BREAK_PROHIBITED;
-              last_prop = LBP_ZW;
+              /* (LB4,LB5,LB6) Mandatory break.  */
+              *p = UC_BREAK_MANDATORY;
+              /* cr is either LBP_CR or -1.  In the first case, recognize
+                 a CR-LF sequence.  */
+              if (prev_prop == cr && prop == LBP_LF)
+                p[-1] = UC_BREAK_CR_BEFORE_LF;
+              prev_prop = prop;
+              last_prop = LBP_BK;
               seen_space = NULL;
-              seen_space2 = NULL;
             }
-          else if (prop == LBP_CM)
+          else
             {
-              /* Don't break just before a combining character, except immediately after a
-                 zero-width space.  */
-              if (last_prop == LBP_ZW)
+              /* Resolve property values whose behaviour is not fixed.  */
+              switch (prop)
                 {
-                  /* Break after zero-width space.  */
-                  *p = UC_BREAK_POSSIBLE;
-                  /* A combining character turns a preceding space into LBP_ID.  */
-                  last_prop = LBP_ID;
+                case LBP_AI:
+                  /* Resolve ambiguous.  */
+                  prop = LBP_AI_REPLACEMENT;
+                  break;
+                case LBP_CB:
+                  /* This is arbitrary.  */
+                  prop = LBP_ID1;
+                  break;
+                case LBP_SA:
+                  /* We don't handle complex scripts yet.
+                     Treat LBP_SA like LBP_XX.  */
+                case LBP_XX:
+                  /* This is arbitrary.  */
+                  prop = LBP_AL;
+                  break;
                 }
-              else
+
+              /* Deal with spaces and combining characters.  */
+              if (prop == LBP_SP)
                 {
+                  /* (LB7) Don't break just before a space.  */
                   *p = UC_BREAK_PROHIBITED;
-                  /* A combining character turns a preceding space into LBP_ID.  */
-                  if (seen_space != NULL)
-                    {
-                      q = seen_space;
-                      seen_space = seen_space2;
-                      prop = LBP_ID;
-                      goto lookup_via_table;
-                    }
+                  seen_space = p;
                 }
-            }
-          else
-            {
-             lookup_via_table:
-              /* prop must be usable as an index for table 7.3 of UTR #14.  */
-              if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
-                abort ();
-
-              if (last_prop == LBP_BK)
+              else if (prop == LBP_ZW)
                 {
-                  /* Don't break at the beginning of a line.  */
-                  *q = UC_BREAK_PROHIBITED;
+                  /* (LB7) Don't break just before a zero-width space.  */
+                  *p = UC_BREAK_PROHIBITED;
+                  last_prop = LBP_ZW;
+                  seen_space = NULL;
                 }
-              else if (last_prop == LBP_ZW)
+              else if (prop == LBP_CM || prop == LBP_ZWJ)
                 {
-                  /* Break after zero-width space.  */
-                  *q = UC_BREAK_POSSIBLE;
+                  /* (LB9) Don't break just before a combining character or
+                     zero-width joiner, except immediately after a mandatory
+                     break character, space, or zero-width space.  */
+                  if (last_prop == LBP_BK)
+                    {
+                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
+                      *p = UC_BREAK_PROHIBITED;
+                      /* (LB10) Treat CM or ZWJ as AL.  */
+                      last_prop = LBP_AL;
+                      seen_space = NULL;
+                    }
+                  else if (last_prop == LBP_ZW || seen_space != NULL)
+                    {
+                      /* (LB8) Break after zero-width space.  */
+                      /* (LB18) Break after spaces.
+                         We do *not* implement the "legacy support for space
+                         character as base for combining marks" because now the
+                         NBSP CM sequence is recommended instead of SP CM.  */
+                      *p = UC_BREAK_POSSIBLE;
+                      /* (LB10) Treat CM or ZWJ as AL.  */
+                      last_prop = LBP_AL;
+                      seen_space = NULL;
+                    }
+                  else
+                    {
+                      /* Treat X CM as if it were X.  */
+                      *p = UC_BREAK_PROHIBITED;
+                    }
                 }
               else
                 {
-                  switch (unilbrk_table [last_prop] [prop])
+                  /* prop must be usable as an index for table 7.3 of UTR #14.  */
+                  if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
+                    abort ();
+
+                  if (last_prop == LBP_BK)
+                    {
+                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
+                      *p = UC_BREAK_PROHIBITED;
+                    }
+                  else if (last_prop == LBP_ZW)
+                    {
+                      /* (LB8) Break after zero-width space.  */
+                      *p = UC_BREAK_POSSIBLE;
+                    }
+                  else if (prev_prop == LBP_ZWJ)
                     {
-                    case D:
-                      *q = UC_BREAK_POSSIBLE;
-                      break;
-                    case I:
-                      *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
-                      break;
-                    case P:
-                      *q = UC_BREAK_PROHIBITED;
-                      break;
-                    default:
-                      abort ();
+                      /* (LB8a) Don't break right after a zero-width joiner.  */
+                      *p = UC_BREAK_PROHIBITED;
                     }
+                  else if (last_prop == LBP_RI && prop == LBP_RI)
+                    {
+                      /* (LB30a) Break between two regional indicator symbols
+                         if and only if there are an even number of regional
+                         indicators preceding the position of the break.  */
+                      *p = (seen_space != NULL || (ri_count % 2) == 0
+                            ? UC_BREAK_POSSIBLE
+                            : UC_BREAK_PROHIBITED);
+                    }
+                  else if (prev_prop == LBP_HL_BA)
+                    {
+                      /* (LB21a) Don't break after Hebrew + Hyphen/Break-After.  */
+                      *p = UC_BREAK_PROHIBITED;
+                    }
+                  else
+                    {
+                      switch (unilbrk_table [last_prop] [prop])
+                        {
+                        case D:
+                          *p = UC_BREAK_POSSIBLE;
+                          break;
+                        case I:
+                          *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
+                          break;
+                        case P:
+                          *p = UC_BREAK_PROHIBITED;
+                          break;
+                        default:
+                          abort ();
+                        }
+                    }
+                  last_prop = prop;
+                  seen_space = NULL;
                 }
-              last_prop = prop;
-              seen_space = NULL;
-              seen_space2 = NULL;
+
+              prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
+                           ? LBP_HL_BA
+                           : prop);
             }
-        }
 
-      s += count;
-      p += count;
+          if (prop == LBP_RI)
+            ri_count++;
+          else
+            ri_count = 0;
+
+          s += count;
+          p += count;
+        }
+      while (s < s_end);
     }
 }
 
+#undef u8_possible_linebreaks
+
+void
+u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
+                        char *p)
+{
+  u8_possible_linebreaks_loop (s, n, encoding, -1, p);
+}
+
+void
+u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
+                           char *p)
+{
+  u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
+}
+
 
 #ifdef TEST
 
@@ -237,7 +297,7 @@ main (int argc, char * argv[])
       char *breaks = malloc (length);
       int i;
 
-      u8_possible_linebreaks ((uint8_t *) input, length, "UTF-8", breaks);
+      u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
 
       for (i = 0; i < length; i++)
         {
@@ -251,6 +311,10 @@ main (int argc, char * argv[])
               /* U+21B2 (or U+21B5) in UTF-8 encoding */
               putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
               break;
+            case UC_BREAK_CR_BEFORE_LF:
+              /* U+21E4 in UTF-8 encoding */
+              putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
+              break;
             case UC_BREAK_PROHIBITED:
               break;
             default: