diff options
Diffstat (limited to 'app/tools/halibut/charset/iso2022s.c')
-rw-r--r-- | app/tools/halibut/charset/iso2022s.c | 544 |
1 files changed, 544 insertions, 0 deletions
diff --git a/app/tools/halibut/charset/iso2022s.c b/app/tools/halibut/charset/iso2022s.c new file mode 100644 index 0000000..a1eceb8 --- /dev/null +++ b/app/tools/halibut/charset/iso2022s.c @@ -0,0 +1,544 @@ +/* + * iso2022s.c - support for ISO-2022 subset encodings. + */ + +#ifndef ENUM_CHARSETS + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "charset.h" +#include "internal.h" +#include "sbcsdat.h" + +#define SO (0x0E) +#define SI (0x0F) +#define ESC (0x1B) + +/* Functional description of a single ISO 2022 escape sequence. */ +struct iso2022_escape { + char const *sequence; + unsigned long andbits, xorbits; + /* + * For output, these variables help us figure out which escape + * sequences we need to get where we want to be. + * + * `container' should be in the range 0-3, but can also be ORed + * with the bit flag RO to indicate that this is not a + * preferred container to use for this charset during output. + */ + int container, subcharset; +}; +#define RO 0x80 + +struct iso2022 { + /* + * List of escape sequences supported in this subset. Must be + * in ASCII order, so that we can narrow down the list as + * necessary. + */ + const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */ + int nescapes; + + /* + * We assign indices from 0 upwards to the sub-charsets of a + * given ISO 2022 subset. nbytes[i] tells us how many bytes per + * character are required by sub-charset i. (It's a string + * mainly because that makes it easier to declare in C syntax + * than an int array.) + */ + char const *nbytes; + + /* + * The characters in this string are indices-plus-one (so that + * NUL can still terminate) of escape sequences in `escapes'. + * These escapes are output in the given sequence to reset the + * encoding state, unless it turns out that a given escape + * would not change the state at all. + */ + char const *reset; + + /* + * Initial value of s1, in case the default container contents + * needs to be something other than charset 0 in all cases. + * (Note that this must have the top bit set!) + */ + unsigned long s1; + + /* + * For output, some ISO 2022 subsets _mandate_ an initial shift + * sequence. If so, here it is so we can output it. (For the + * sake of basic sanity we won't bother to _require_ it on + * input, although it should of course be listed under + * `escapes' above so that we ignore it when present.) + */ + char const *initial_sequence; + + /* + * Is this an 8-bit ISO 2022 subset? + */ + int eightbit; + + /* + * Function calls to do the actual translation. + */ + long int (*to_ucs)(int subcharset, unsigned long bytes); + int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes); +}; + +static void read_iso2022s(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), + void *emitctx) +{ + struct iso2022 const *iso = (struct iso2022 *)charset->data; + + /* + * For reading ISO-2022 subsets, we divide up our state + * variables as follows: + * + * - The top byte of s0 (bits 31:24) indicates, if nonzero, + * that we are part-way through a recognised ISO-2022 escape + * sequence. Five of those bits (31:27) give the index of + * the first member of the escapes list matching what we + * have so far; the remaining three (26:24) give the number + * of characters we have seen so far. + * + * - The top bit of s1 (bit 31) is non-zero at all times, to + * indicate that we have performed any necessary + * initialisation. When we start, we detect a zero s1 and + * respond to it by initialising the default container + * contents. + * + * - The next three bits of s1 (bits 30:28) indicate which + * _container_ is currently selected. This isn't quite as + * simple as it sounds, since we have to preserve memory of + * which of the SI/SO containers we came from when we're + * temporarily in SS2/SS3. Hence, what happens is: + * + bit 28 indicates SI/SO. + * + if we're in an SS2/SS3 container, that's indicated by + * the two bits above that being nonzero and holding + * either 2 or 3. + * + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is + * SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO. + * + For added fun: in an _8-bit_ ISO 2022 subset, we have + * the further special value 2, which means that we're + * theoretically in SI but the current character being + * accumulated is composed of 8-bit characters and will + * therefore be interpreted as if in SO. + * + * - The next nibble of s1 (27:24) indicates how many bytes + * have been accumulated in the current character. + * + * - The remaining three bytes of s1 are divided into four + * six-bit sections, and each section gives the current + * sub-charset selected in one of the possible containers. + * (Those containers are SI, SO, SS2 and SS3, respectively + * and in order from the bottom of s0 to the top.) + * + * - The bottom 24 bits of s0 give the accumulated character + * data so far. + * + * (Note that this means s1 contains all the parts of the state + * which might need to be operated on by escape sequences. + * Cunning, eh?) + */ + + if (!(state->s1 & 0x80000000)) { + state->s1 = iso->s1; + } + + /* + * So. Firstly, we process escape sequences, if we're in the + * middle of one or if we see a possible introducer (SI, SO, + * ESC). + */ + if ((state->s0 >> 24) || + (input_chr == SO || input_chr == SI || input_chr == ESC)) { + int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j; + + /* + * If this is the start of an escape sequence, we might be + * in mid-character. If so, clear the character state and + * emit an error token for the incomplete character. + */ + if (state->s1 & 0x0F000000) { + state->s1 &= ~0x0F000000; + state->s0 &= 0xFF000000; + /* + * If we were in the SS2 or SS3 container, we + * automatically exit it. + */ + if (state->s1 & 0x60000000) + state->s1 &= 0x9FFFFFFF; + emit(emitctx, ERROR); + } + + j = i; + while (j < iso->nescapes && + !memcmp(iso->escapes[j].sequence, + iso->escapes[oi].sequence, n)) { + if (iso->escapes[j].sequence[n] < input_chr) + i = ++j; + else + break; + } + if (i >= iso->nescapes || + memcmp(iso->escapes[i].sequence, + iso->escapes[oi].sequence, n) || + iso->escapes[i].sequence[n] != input_chr) { + /* + * This character does not appear in any valid escape + * sequence. Therefore, we must emit all the characters + * we had previously swallowed, plus this one, and + * return to non-escape-sequence state. + */ + for (j = 0; j < n; j++) + emit(emitctx, iso->escapes[oi].sequence[j]); + emit(emitctx, input_chr); + state->s0 = 0; + return; + } + + /* + * Otherwise, we have found an additional character in our + * escape sequence. See if we have reached the _end_ of our + * sequence (and therefore must process the sequence). + */ + n++; + if (!iso->escapes[i].sequence[n]) { + state->s0 = 0; + state->s1 &= iso->escapes[i].andbits; + state->s1 ^= iso->escapes[i].xorbits; + return; + } + + /* + * Failing _that_, we simply update our escape-sequence- + * tracking state. + */ + assert(i < 32 && n < 8); + state->s0 = (i << 27) | (n << 24); + return; + } + + /* + * If this isn't an escape sequence, it must be part of a + * character. One possibility is that it's a control character + * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm + * going to treat all top-half characters as controls), in + * which case we output it verbatim. + */ + if (input_chr < 0x21 || + (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) { + /* + * We might be in mid-multibyte-character. If so, clear the + * character state and emit an error token for the + * incomplete character. + */ + if (state->s1 & 0x0F000000) { + state->s1 &= ~0x0F000000; + state->s0 &= 0xFF000000; + emit(emitctx, ERROR); + /* + * If we were in the SS2 or SS3 container, we + * automatically exit it. + */ + if (state->s1 & 0x60000000) + state->s1 &= 0x9FFFFFFF; + } + + emit(emitctx, input_chr); + return; + } + + /* + * Otherwise, accumulate character data. + */ + { + unsigned long chr; + int chrlen, cont, subcharset, bytes; + + /* + * Verify that we've seen the right kind of character for + * what we're currently doing. This only matters in 8-bit + * subsets. + */ + if (iso->eightbit) { + cont = (state->s1 >> 28) & 7; + /* + * If cont==0, we're entitled to see either GL or GR + * characters. If cont==2, we expect only GR; otherwise + * we expect only GL. + * + * If we see a GR character while cont==0, we set + * cont=2 immediately. + */ + if ((cont == 2 && !(input_chr & 0x80)) || + (cont != 0 && cont != 2 && (input_chr & 0x80))) { + /* + * Clear the previous character; it was prematurely + * terminated by this error. + */ + state->s1 &= ~0x0F000000; + state->s0 &= 0xFF000000; + emit(emitctx, ERROR); + /* + * If we were in the SS2 or SS3 container, we + * automatically exit it. + */ + if (state->s1 & 0x60000000) + state->s1 &= 0x9FFFFFFF; + } + + if (cont == 0 && (input_chr & 0x80)) { + state->s1 |= 0x20000000; + } + } + + /* The current character and its length. */ + chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F); + chrlen = ((state->s1 >> 24) & 0xF) + 1; + /* The current sub-charset. */ + cont = (state->s1 >> 28) & 7; + if (cont > 1) cont >>= 1; + subcharset = (state->s1 >> (6*cont)) & 0x3F; + /* The number of bytes-per-character in that sub-charset. */ + bytes = iso->nbytes[subcharset]; + + /* + * If this character is now complete, we convert and emit + * it. Otherwise, we simply update the state and return. + */ + if (chrlen >= bytes) { + emit(emitctx, iso->to_ucs(subcharset, chr)); + chr = chrlen = 0; + /* + * If we were in the SS2 or SS3 container, we + * automatically exit it. + */ + if (state->s1 & 0x60000000) + state->s1 &= 0x9FFFFFFF; + } + state->s0 = (state->s0 & 0xFF000000) | chr; + state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24); + } +} + +static int write_iso2022s(charset_spec const *charset, long int input_chr, + charset_state *state, + void (*emit)(void *ctx, long int output), + void *emitctx) +{ + struct iso2022 const *iso = (struct iso2022 *)charset->data; + int subcharset, len, i, j, cont, topbit = 0; + unsigned long bytes; + + /* + * For output, our s1 state variable contains most of the same + * stuff as it did for input - initial-state indicator bit, + * current container, and current subcharset selected in each + * container. + */ + + /* + * Analyse the character and find out what subcharset it needs + * to go in. + */ + if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes)) + return FALSE; + + if (!(state->s1 & 0x80000000)) { + state->s1 = iso->s1; + if (iso->initial_sequence) + for (i = 0; iso->initial_sequence[i]; i++) + emit(emitctx, iso->initial_sequence[i]); + } + + if (input_chr == -1) { + unsigned long oldstate; + int k; + + /* + * Special case: reset encoding state. + */ + for (i = 0; iso->reset[i]; i++) { + j = iso->reset[i] - 1; + oldstate = state->s1; + state->s1 &= iso->escapes[j].andbits; + state->s1 ^= iso->escapes[j].xorbits; + if (state->s1 != oldstate) { + /* We must actually emit this sequence. */ + for (k = 0; iso->escapes[j].sequence[k]; k++) + emit(emitctx, iso->escapes[j].sequence[k]); + } + } + + return TRUE; + } + + /* + * Now begins the fun. We now know what subcharset we want. So + * we must find out which container we should select it into, + * select it into it if necessary, select that _container_ if + * necessary, and then output the given bytes. + */ + for (i = 0; i < iso->nescapes; i++) + if (iso->escapes[i].subcharset == subcharset && + !(iso->escapes[i].container & RO)) + break; + assert(i < iso->nescapes); + + /* + * We've found the escape sequence which would select this + * subcharset into a container. However, that subcharset might + * already _be_ selected in that container! Check before we go + * to the effort of emitting the sequence. + */ + cont = iso->escapes[i].container &~ RO; + if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) { + for (j = 0; iso->escapes[i].sequence[j]; j++) + emit(emitctx, iso->escapes[i].sequence[j]); + state->s1 &= iso->escapes[i].andbits; + state->s1 ^= iso->escapes[i].xorbits; + } + + /* + * Now we know what container our subcharset is in, so we want + * to select that container. + */ + if (cont > 1) { + /* SS2 or SS3; just output the sequence and be done. */ + emit(emitctx, ESC); + emit(emitctx, 'L' + cont); /* comes out to 'N' or 'O' */ + } else { + /* + * Emit SI or SO, but only if the current container isn't already + * the right one. + * + * Also, in an 8-bit subset, we need not do this; we'll + * just use 8-bit characters to output SO-container + * characters. + */ + if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) { + topbit = 0x80; + } else if (((state->s1 >> 28) & 7) != (unsigned)cont) { + emit(emitctx, cont ? SO : SI); + state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28); + } + } + + /* + * We're done. Subcharset is selected in container, container + * is selected. All we need now is to write out the bytes. + */ + len = iso->nbytes[subcharset]; + while (len--) + emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit); + + return TRUE; +} + +/* + * ISO-2022-JP, defined in RFC 1468. + */ +static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes) +{ + switch (subcharset) { + case 1: /* JIS X 0201 bottom half */ + if (bytes == 0x5C) + return 0xA5; + else if (bytes == 0x7E) + return 0x203E; + /* else fall through to ASCII */ + case 0: return bytes; /* one-byte ASCII */ + /* (no break needed since all control paths have returned) */ + case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21, + ((bytes ) & 0xFF) - 0x21); + default: return ERROR; + } +} +static int iso2022jp_from_ucs(long int ucs, int *subcharset, + unsigned long *bytes) +{ + int r, c; + if (ucs < 0x80) { + *subcharset = 0; + *bytes = ucs; + return 1; + } else if (ucs == 0xA5 || ucs == 0x203E) { + *subcharset = 1; + *bytes = (ucs == 0xA5 ? 0x5C : 0x7E); + return 1; + } else if (unicode_to_jisx0208(ucs, &r, &c)) { + *subcharset = 2; + *bytes = ((r+0x21) << 8) | (c+0x21); + return 1; + } else { + return 0; + } +} +static const struct iso2022_escape iso2022jp_escapes[] = { + {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1}, /* we ignore this one */ + {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2}, + {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0}, + {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1}, +}; +static const struct iso2022 iso2022jp = { + iso2022jp_escapes, lenof(iso2022jp_escapes), + "\1\1\2", "\3", 0x80000000, NULL, FALSE, + iso2022jp_to_ucs, iso2022jp_from_ucs +}; +const charset_spec charset_CS_ISO2022_JP = { + CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp +}; + +/* + * ISO-2022-KR, defined in RFC 1557. + */ +static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes) +{ + switch (subcharset) { + case 0: return bytes; /* one-byte ASCII */ + case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21, + ((bytes ) & 0xFF) - 0x21); + default: return ERROR; + } +} +static int iso2022kr_from_ucs(long int ucs, int *subcharset, + unsigned long *bytes) +{ + int r, c; + if (ucs < 0x80) { + *subcharset = 0; + *bytes = ucs; + return 1; + } else if (unicode_to_ksx1001(ucs, &r, &c)) { + *subcharset = 1; + *bytes = ((r+0x21) << 8) | (c+0x21); + return 1; + } else { + return 0; + } +} +static const struct iso2022_escape iso2022kr_escapes[] = { + {"\016", 0x8FFFFFFF, 0x10000000, -1, -1}, + {"\017", 0x8FFFFFFF, 0x00000000, 0, 0}, + {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1}, /* bits[11:6] <- 1 */ +}; +static const struct iso2022 iso2022kr = { + iso2022kr_escapes, lenof(iso2022kr_escapes), + "\1\2", "\2", 0x80000040, "\033$)C", FALSE, + iso2022kr_to_ucs, iso2022kr_from_ucs +}; +const charset_spec charset_CS_ISO2022_KR = { + CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr +}; + +#else /* ENUM_CHARSETS */ + +ENUM_CHARSET(CS_ISO2022_JP) +ENUM_CHARSET(CS_ISO2022_KR) + +#endif /* ENUM_CHARSETS */ |