diff options
Diffstat (limited to 'tests/unilbrk/test-uc-possible-linebreaks.c')
| -rw-r--r-- | tests/unilbrk/test-uc-possible-linebreaks.c | 111 |
1 file changed, 53 insertions, 58 deletions
diff --git a/tests/unilbrk/test-uc-possible-linebreaks.c b/tests/unilbrk/test-uc-possible-linebreaks.c index a54699b6..bf268286 100644 --- a/tests/unilbrk/test-uc-possible-linebreaks.c +++ b/tests/unilbrk/test-uc-possible-linebreaks.c @@ -1,5 +1,5 @@ /* Line break function test, using test data from UCD. - Copyright (C) 2024-2025 Free Software Foundation, Inc. + Copyright (C) 2024-2026 Free Software Foundation, Inc. This file is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -118,71 +118,66 @@ main (int argc, char *argv[]) u32_possible_linebreaks (input, i - 1, "UTF-8", breaks); int matches = 1; - { - int j; - for (j = 0; j < i - 1; j++) - { - /* The character U+FFFC has line break property CB, which according - to rule (LB1) is resolved "into other line breaking classes - depending on criteria outside the scope of this algorithm". - Thus it makes no sense to check the breaks[] entry before or - after such a character. */ - if (!(input[j] == 0xFFFC - || (j > 0 && input[j - 1] == 0xFFFC) - /* Also consider intervening characters with property LBP_CM - or LBP_ZWJ, per (LB9). */ - || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) - && input[j - 2] == 0xFFFC))) - /* A regional indicator with a combining character is nonsense, - because regional indicators are supposed to come in pairs. */ - if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF) - && input[1] == 0x0308)) - /* It is nonsense to treat U+1F8FF differently than U+1F02C. - Both are unassigned Extended_Pictographic characters and - should therefore be treated like LBP_EB (or LBP_ID, if you - want), not like LBP_AL. See rule (LB30b). */ - if (!(input[j] == 0x1F8FF - || (j > 0 && input[j - 1] == 0x1F8FF) - /* Also consider intervening characters with property LBP_CM - or LBP_ZWJ, per (LB9). 
*/ - || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) - && input[j - 2] == 0x1F8FF))) - /* There is a disagreement regarding whether to allow a line break - after a U+0020 SPACE character at the start of the text. - We consider that the start of the text is equivalent to the - state after a newline was seen; hence the loop starts with - property LBP_BK. By the rules (LB4,LB5,LB6) an extra line - break after a mandatory line break is undesired, even with - intervening spaces (because these rules come before (LB18)). - Whereas the LineBreakTest.txt file allows a line break after - the space. - Similarly when the first two characters at the start of the - text have property LBP_CM and LBP_ZWJ, respectively. (LB9). */ - if (!(((j == 1 || (j > 1 && ((input[j - 2] >= 0x000A && input[j - 2] <= 0x000D) || input[j - 2] == 0x0085))) - && input[j - 1] == 0x0020) - || ((j == 2 || (j > 2 && ((input[j - 3] >= 0x000A && input[j - 3] <= 0x000D) || input[j - 3] == 0x0085))) - && ((input[j - 2] == 0x0020 && input[j - 1] == 0x0020) - || (input[j - 2] == 0x0308 && input[j - 1] == 0x200D) - || (input[j - 2] == 0x200D && input[j - 1] == 0x0308))))) - matches &= (!(breaks[j] == UC_BREAK_PROHIBITED - || breaks[j] == UC_BREAK_MANDATORY - || breaks[j] == UC_BREAK_CR_BEFORE_LF) - || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) - == breaks_expected[j]; - } - } + for (int j = 0; j < i - 1; j++) + { + /* The character U+FFFC has line break property CB, which according + to rule (LB1) is resolved "into other line breaking classes + depending on criteria outside the scope of this algorithm". + Thus it makes no sense to check the breaks[] entry before or + after such a character. */ + if (!(input[j] == 0xFFFC + || (j > 0 && input[j - 1] == 0xFFFC) + /* Also consider intervening characters with property LBP_CM + or LBP_ZWJ, per (LB9). 
*/ + || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) + && input[j - 2] == 0xFFFC))) + /* A regional indicator with a combining character is nonsense, + because regional indicators are supposed to come in pairs. */ + if (!(j >= 2 && (input[0] >= 0x1F1E6 && input[0] <= 0x1F1FF) + && input[1] == 0x0308)) + /* It is nonsense to treat U+1F8FF differently than U+1F02C. + Both are unassigned Extended_Pictographic characters and + should therefore be treated like LBP_EB (or LBP_ID, if you + want), not like LBP_AL. See rule (LB30b). */ + if (!(input[j] == 0x1F8FF + || (j > 0 && input[j - 1] == 0x1F8FF) + /* Also consider intervening characters with property LBP_CM + or LBP_ZWJ, per (LB9). */ + || (j > 1 && (input[j - 1] == 0x0308 || input[j - 1] == 0x200D) + && input[j - 2] == 0x1F8FF))) + /* There is a disagreement regarding whether to allow a line break + after a U+0020 SPACE character at the start of the text. + We consider that the start of the text is equivalent to the + state after a newline was seen; hence the loop starts with + property LBP_BK. By the rules (LB4,LB5,LB6) an extra line + break after a mandatory line break is undesired, even with + intervening spaces (because these rules come before (LB18)). + Whereas the LineBreakTest.txt file allows a line break after + the space. + Similarly when the first two characters at the start of the + text have property LBP_CM and LBP_ZWJ, respectively. (LB9). 
*/ + if (!(((j == 1 || (j > 1 && ((input[j - 2] >= 0x000A && input[j - 2] <= 0x000D) || input[j - 2] == 0x0085))) + && input[j - 1] == 0x0020) + || ((j == 2 || (j > 2 && ((input[j - 3] >= 0x000A && input[j - 3] <= 0x000D) || input[j - 3] == 0x0085))) + && ((input[j - 2] == 0x0020 && input[j - 1] == 0x0020) + || (input[j - 2] == 0x0308 && input[j - 1] == 0x200D) + || (input[j - 2] == 0x200D && input[j - 1] == 0x0308))))) + matches &= (!(breaks[j] == UC_BREAK_PROHIBITED + || breaks[j] == UC_BREAK_MANDATORY + || breaks[j] == UC_BREAK_CR_BEFORE_LF) + || (j > 0 && breaks[j - 1] == UC_BREAK_MANDATORY)) + == breaks_expected[j]; + } if (!matches) { - int j; - fprintf (stderr, "%s:%d: expected: ", filename, lineno); - for (j = 0; j < i - 1; j++) + for (int j = 0; j < i - 1; j++) fprintf (stderr, "%s U+%04X ", breaks_expected[j] == 1 ? "\303\267" : "\303\227", input[j]); fprintf (stderr, "\n"); fprintf (stderr, "%s:%d: actual: ", filename, lineno); - for (j = 0; j < i - 1; j++) + for (int j = 0; j < i - 1; j++) fprintf (stderr, "%s U+%04X ", (!(breaks[j] == UC_BREAK_PROHIBITED || breaks[j] == UC_BREAK_MANDATORY |
