summaryrefslogtreecommitdiff
path: root/src/util/string.vala
blob: 5ca4680fd6e1973dd38728dc897fa2f5da2f1361 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/* Copyright 2016 Software Freedom Conservancy Inc.
 *
 * This software is licensed under the GNU Lesser General Public License
 * (version 2.1 or later).  See the COPYING file in this distribution.
 */

public const int DEFAULT_USER_TEXT_INPUT_LENGTH = 1024;

public inline bool is_string_empty(string? s) {
    return (s == null || s[0] == '\0');
}

// utf8 case sensitive compare
public int utf8_cs_compare(string a, string b) {
    return a.collate(b);
}

// utf8 case insensitive compare
public int utf8_ci_compare(string a, string b) {
    return a.down().collate(b.down());
}

// utf8 array to string
public string uchar_array_to_string(uchar[] data, int length = -1) {
    if (length < 0)
        length = data.length;
    
    StringBuilder builder = new StringBuilder();
    for (int ctr = 0; ctr < length; ctr++) {
        if (data[ctr] != '\0')
            builder.append_c((char) data[ctr]);
        else
            break;
    }
    
    return builder.str;
}

// string to uchar array
public uchar[] string_to_uchar_array(string str) {
    uchar[] data = new uchar[0];
    for (int ctr = 0; ctr < str.length; ctr++)
        data += (uchar) str[ctr];
    
    return data;
}

// Markup.escape_text() will crash if the UTF-8 text is not valid; it relies on a call to 
// g_utf8_next_char(), which demands that the string be validated before use, which escape_text()
// does not do.  This handles this problem by kicking back an empty string if the text is not
// valid.  Text should be validated upon entry to the system as well to guard against this
// problem.
//
// Null strings are accepted; they will result in an empty string returned.
public inline string guarded_markup_escape_text(string? plain) {
    return (!is_string_empty(plain) && plain.validate()) ? Markup.escape_text(plain) : "";
}

public long find_last_offset(string str, char c) {
    long offset = str.length;
    while (--offset >= 0) {
        if (str[offset] == c)
            return offset;
    }
    
    return -1;
}

// Helper function for searching an array of case-insensitive strings.  The array should be
// all lowercase.
public bool is_in_ci_array(string str, string[] strings) {
    string strdown = str.down();
    foreach (string str_element in strings) {
        if (strdown == str_element)
            return true;
    }
    
    return false;
}

[Flags]
public enum PrepareInputTextOptions {
    EMPTY_IS_NULL,
    VALIDATE,
    INVALID_IS_NULL,
    STRIP,
    STRIP_CRLF,
    NORMALIZE,
    DEFAULT = EMPTY_IS_NULL | VALIDATE | INVALID_IS_NULL | STRIP_CRLF | STRIP | NORMALIZE;
}

private string? guess_convert(string text) {
    string? output = null;
    size_t bytes_read = 0;
    unowned string charset = null;
    debug ("CONVERT: Text did not validate as UTF-8, trying conversion");

    // Try with locale
    if (!GLib.get_charset(out charset)) {
        output = text.locale_to_utf8(text.length, out bytes_read, null, null);
        if (bytes_read == text.length) {
            debug ("CONVERT: Locale is not UTF-8, convert from %s", charset);
            return output;
        }
    }

    try {
        output = GLib.convert (text, text.length, "UTF-8", "WINDOWS-1252", out bytes_read);
        charset = "WINDOWS-1252";
    } catch (ConvertError error) {
        if (error is ConvertError.NO_CONVERSION) {
            try {
                output = GLib.convert (text, text.length, "UTF-8", "ISO-8859-1", out bytes_read);
                charset = "ISO-8859-1";
            } catch (Error error) { /* do nothing */ }
        }
    }

    if (bytes_read == text.length) {
        debug ("CONVERT: Guessed conversion from %s", charset);

        return output;
    }

    return null;
}

public string? prepare_input_text(string? text, PrepareInputTextOptions options, int dest_length) {
    if (text == null)
        return null;
    
    string? prepped = text;
    if (PrepareInputTextOptions.VALIDATE in options) {
        if (!text.validate()) {
            prepped = guess_convert (text);

            if (prepped == null) {
                return (options & PrepareInputTextOptions.INVALID_IS_NULL) != 0 ? null : "";
            }
        }
    }

    // Using composed form rather than GLib's default (decomposed) as NFC is the preferred form in
    // Linux and WWW.  More importantly, Pango seems to have serious problems displaying decomposed
    // forms of Korean language glyphs (and perhaps others).  See:
    // https://bugzilla.gnome.org/show_bug.cgi?id=716914
    if ((options & PrepareInputTextOptions.NORMALIZE) != 0)
        prepped = prepped.normalize(-1, NormalizeMode.NFC);
    
    if ((options & PrepareInputTextOptions.STRIP) != 0)
        prepped = prepped.strip();
        
    // Ticket #3245 - Prevent carriage return mayhem
    // in image titles, tag names, etc.
    if ((options & PrepareInputTextOptions.STRIP_CRLF) != 0)
        prepped = prepped.delimit("\n\r", ' ');
    
    if ((options & PrepareInputTextOptions.EMPTY_IS_NULL) != 0 && is_string_empty(prepped))
        return null;
    
    // Ticket #3196 - Allow calling functions to limit the length of the 
    // string we return to them. Passing any negative value is interpreted 
    // as 'do not truncate'.
    if (dest_length >= 0) { 
        StringBuilder sb = new StringBuilder(prepped);
        sb.truncate(dest_length);
        return sb.str;
    }
    
    // otherwise, return normally.
    return prepped;
}

namespace String {

public inline bool contains_char(string haystack, unichar needle) {
    return haystack.index_of_char(needle) >= 0;
}

// Note that this method currently turns a word of all zeros into empty space ("000" -> "")
public string strip_leading_zeroes(string str) {
    StringBuilder stripped = new StringBuilder();
    bool prev_is_space = true;
    for (unowned string iter = str; iter.get_char() != 0; iter = iter.next_char()) {
        unichar ch = iter.get_char();
        
        if (!prev_is_space || ch != '0') {
            stripped.append_unichar(ch);
            prev_is_space = ch.isspace();
        }
    }
    
    return stripped.str;
}

public string remove_diacritics(string istring) {
    var builder = new StringBuilder ();
    unichar ch;
    int i = 0;
    while(istring.normalize().get_next_char(ref i, out ch)) {
        switch(ch.type()) {
            case UnicodeType.CONTROL:
            case UnicodeType.FORMAT:
            case UnicodeType.UNASSIGNED:
            case UnicodeType.NON_SPACING_MARK:
            case UnicodeType.COMBINING_MARK:
            case UnicodeType.ENCLOSING_MARK:
            // Ignore those
                continue;
            default:
                break;
        }
        builder.append_unichar(ch);
    }
    return builder.str;
}

public string to_hex_string(string str) {
    StringBuilder builder = new StringBuilder();
    
    uint8 *data = (uint8 *) str;
    while (*data != 0)
        builder.append_printf("%02Xh%s", *data++, (*data != 0) ? " " : "");
    
    return builder.str;
}

// A note on the collated_* and precollated_* methods:
//
// A bug report (https://bugzilla.gnome.org/show_bug.cgi?id=717135) indicated that two different Hirigana characters
// as Tag names would trigger an assertion.  Investigation showed that the characters' collation
// keys computed as equal when the locale was set to anything but the default locale (C) or
// Japanese.  A related bug was that another hash table was using str_equal, which does not use
// collation, meaning that in one table the strings were seen as the same and in another as
// different.
//
// The solution we arrived at is to use collation whenever possible, but if two strings have the
// same collation, then fall back on strcmp(), which looks for byte-for-byte comparisons.  Note
// that this technique requires that both strings have been properly composed (use
// prepare_input_text() for that task) so that equal UTF-8 strings are byte-for-byte equal as
// well.

// See note above.
public uint collated_hash(void *ptr) {
    string str = (string) ptr;
    
    return str_hash(str.collate_key());
}

// See note above.
public uint precollated_hash(void *ptr) {
    return str_hash((string) ptr);
}

// See note above.
public int collated_compare(void *a, void *b) {
    string astr = (string) a;
    string bstr = (string) b;
    
    int result = astr.collate(bstr);
    
    return (result != 0) ? result : strcmp(astr, bstr);
}

// See note above.
public int precollated_compare(string astr, string akey, string bstr, string bkey) {
    int result = strcmp(akey, bkey);
    
    return (result != 0) ? result : strcmp(astr, bstr);
}

// See note above.
public bool collated_equals(void *a, void *b) {
    return collated_compare(a, b) == 0;
}

// See note above.
public bool precollated_equals(string astr, string akey, string bstr, string bkey) {
    return precollated_compare(astr, akey, bstr, bkey) == 0;
}

}