diff options
Diffstat (limited to 'tests/unilbrk/test-uc-possible-linebreaks.c')
| -rw-r--r-- | tests/unilbrk/test-uc-possible-linebreaks.c | 57 |
1 file changed, 31 insertions, 26 deletions
diff --git a/tests/unilbrk/test-uc-possible-linebreaks.c b/tests/unilbrk/test-uc-possible-linebreaks.c index 31bab9eb..bf268286 100644 --- a/tests/unilbrk/test-uc-possible-linebreaks.c +++ b/tests/unilbrk/test-uc-possible-linebreaks.c @@ -1,5 +1,5 @@ /* Line break function test, using test data from UCD. - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2026 Free Software Foundation, Inc. This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -118,25 +118,33 @@ main (int argc, char *argv[]) u32_possible_linebreaks (input, i - 1, "UTF-8", breaks); int matches = 1; - { - int j; - for (j = 0; j < i - 1; j++) - { - /* The character U+FFFC has line break property CB, which according - to rule (LB1) is resolved "into other line breaking classes - depending on criteria outside the scope of this algorithm". - Thus it makes no sense to check the breaks[] entry before or - after such a character. */ - if (!(input[j] == 0xFFFC - || (j > 0 && input[j - 1] == 0xFFFC) - /* Also consider intervening characters with property LBP_CM - or LBP_ZWJ, per (LB9). */ - || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) - && input[j - 2] == 0xFFFC))) - /* A regional indicator with a combining character is nonsense, - because regional indicators are supposed to come in pairs. */ - if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF) - && input[1] == 0x0308)) + for (int j = 0; j < i - 1; j++) + { + /* The character U+FFFC has line break property CB, which according + to rule (LB1) is resolved "into other line breaking classes + depending on criteria outside the scope of this algorithm". + Thus it makes no sense to check the breaks[] entry before or + after such a character. */ + if (!(input[j] == 0xFFFC + || (j > 0 && input[j - 1] == 0xFFFC) + /* Also consider intervening characters with property LBP_CM + or LBP_ZWJ, per (LB9). 
*/ + || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) + && input[j - 2] == 0xFFFC))) + /* A regional indicator with a combining character is nonsense, + because regional indicators are supposed to come in pairs. */ + if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF) + && input[1] == 0x0308)) + /* It is nonsense to treat U+1F8FF differently than U+1F02C. + Both are unassigned Extended_Pictographic characters and + should therefore be treated like LBP_EB (or LBP_ID, if you + want), not like LBP_AL. See rule (LB30b). */ + if (!(input[j] == 0x1F8FF + || (j > 0 && input[j - 1] == 0x1F8FF) + /* Also consider intervening characters with property LBP_CM + or LBP_ZWJ, per (LB9). */ + || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) + && input[j - 2] == 0x1F8FF))) /* There is a disagreement regarding whether to allow a line break after a U+0020 SPACE character at the start of the text. We consider that the start of the text is equivalent to the @@ -159,20 +167,17 @@ main (int argc, char *argv[]) || breaks[j] == UC_BREAK_CR_BEFORE_LF) || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) == breaks_expected[j]; - } - } + } if (!matches) { - int j; - fprintf (stderr, "%s:%d: expected: ", filename, lineno); - for (j = 0; j < i - 1; j++) + for (int j = 0; j < i - 1; j++) fprintf (stderr, "%s U+%04X ", breaks_expected[j] == 1 ? "\303\267" : "\303\227", input[j]); fprintf (stderr, "\n"); fprintf (stderr, "%s:%d: actual: ", filename, lineno); - for (j = 0; j < i - 1; j++) + for (int j = 0; j < i - 1; j++) fprintf (stderr, "%s U+%04X ", (!(breaks[j] == UC_BREAK_PROHIBITED || breaks[j] == UC_BREAK_MANDATORY |
