#include "wsint.h"
Include dependency graph for wsutf8.c:

Go to the source code of this file.
Defines | |
| #define | WS_UTF8_ENC_1_M 0xffffff80 |
| #define | WS_UTF8_ENC_2_M 0xfffff800 |
| #define | WS_UTF8_ENC_3_M 0xffff0000 |
| #define | WS_UTF8_ENC_4_M 0xffe00000 |
| #define | WS_UTF8_ENC_5_M 0xfc000000 |
| #define | WS_UTF8_ENC_6_M 0x80000000 |
| #define | WS_UTF8_ENC_C_BITS 0x80 |
| #define | WS_UTF8_CONT_DATA_MASK 0x3f |
| #define | WS_UTF8_ENC_TYPE(ch) |
| #define | WS_UTF8_DEC_1_M 0x80 |
| #define | WS_UTF8_DEC_2_M 0xe0 |
| #define | WS_UTF8_DEC_3_M 0xf0 |
| #define | WS_UTF8_DEC_4_M 0xf8 |
| #define | WS_UTF8_DEC_5_M 0xfc |
| #define | WS_UTF8_DEC_6_M 0xfe |
| #define | WS_UTF8_DEC_1_V 0x00 |
| #define | WS_UTF8_DEC_2_V 0xc0 |
| #define | WS_UTF8_DEC_3_V 0xe0 |
| #define | WS_UTF8_DEC_4_V 0xf0 |
| #define | WS_UTF8_DEC_5_V 0xf8 |
| #define | WS_UTF8_DEC_6_V 0xfc |
| #define | WS_UTF8_DEC_C_M 0xc0 |
| #define | WS_UTF8_DEC_C_V 0x80 |
| #define | WS_UTF8_DEC_TYPE(b) |
| #define | WS_UTF8_DEC_C_P(b) (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V) |
Functions | |
| WsUtf8String * | ws_utf8_alloc () |
| void | ws_utf8_free (WsUtf8String *string) |
| int | ws_utf8_append_char (WsUtf8String *string, unsigned long ch) |
| int | ws_utf8_verify (const unsigned char *data, size_t len, size_t *strlen_return) |
| int | ws_utf8_set_data (WsUtf8String *string, const unsigned char *data, size_t len) |
| int | ws_utf8_get_char (const WsUtf8String *string, unsigned long *ch_return, size_t *posp) |
| unsigned char * | ws_utf8_to_latin1 (const WsUtf8String *string, unsigned char unknown_char, size_t *len_return) |
| void | ws_utf8_free_data (unsigned char *data) |
Variables | |
| unsigned char | utf8_hibits [7] |
| unsigned char | utf8_hidata_masks [7] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Definition at line 178 of file wsutf8.c. Referenced by ws_utf8_verify(). |
|
|
|
|
|
Value: (((b) & WS_UTF8_DEC_1_M) == WS_UTF8_DEC_1_V \
? 1 \
: (((b) & WS_UTF8_DEC_2_M) == WS_UTF8_DEC_2_V \
? 2 \
: (((b) & WS_UTF8_DEC_3_M) == WS_UTF8_DEC_3_V \
? 3 \
: (((b) & WS_UTF8_DEC_4_M) == WS_UTF8_DEC_4_V \
? 4 \
: (((b) & WS_UTF8_DEC_5_M) == WS_UTF8_DEC_5_V \
? 5 \
: (((b) & WS_UTF8_DEC_6_M) == WS_UTF8_DEC_6_V \
? 6 \
: 0))))))
Definition at line 161 of file wsutf8.c. Referenced by ws_utf8_get_char(), and ws_utf8_verify(). |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Value: (((ch) & WS_UTF8_ENC_1_M) == 0 \
? 1 \
: (((ch) & WS_UTF8_ENC_2_M) == 0 \
? 2 \
: (((ch) & WS_UTF8_ENC_3_M) == 0 \
? 3 \
: (((ch) & WS_UTF8_ENC_4_M) == 0 \
? 4 \
: (((ch) & WS_UTF8_ENC_5_M) == 0 \
? 5 \
: (((ch) & WS_UTF8_ENC_6_M) == 0 \
? 6 \
: 0))))))
Definition at line 108 of file wsutf8.c. Referenced by ws_utf8_append_char(). |
|
|
Definition at line 182 of file wsutf8.c. References ws_calloc(), and WsUtf8String. Referenced by ws_bc_encode(), and ws_yy_lex().
00183 {
00184 return ws_calloc(1, sizeof(WsUtf8String));
00185 }
|
Here is the call graph for this function:

|
||||||||||||
|
Definition at line 198 of file wsutf8.c. References d, WsUtf8StringRec::data, WsUtf8StringRec::len, WsUtf8StringRec::num_chars, string, utf8_hibits, ws_fatal(), ws_realloc(), WS_UTF8_ENC_TYPE, and WsUtf8String. Referenced by ws_yy_lex().
00199 {
00200 unsigned char *d;
00201 unsigned int num_bytes = WS_UTF8_ENC_TYPE(ch);
00202 unsigned int len, i;
00203
00204 if (num_bytes == 0)
00205 ws_fatal("ws_utf8_append_char(): 0x%lx is not a valid UTF-8 character",
00206 ch);
00207
00208 d = ws_realloc(string->data, string->len + num_bytes);
00209 if (d == NULL)
00210 return 0;
00211
00212 len = string->len;
00213
00214 /* Encode the continuation bytes (n > 1). */
00215 for (i = num_bytes - 1; i > 0; i--) {
00216 d[len + i] = WS_UTF8_ENC_C_BITS;
00217 d[len + i] |= ch & WS_UTF8_CONT_DATA_MASK;
00218 ch >>= 6;
00219 }
00220
00221 /* And continue the first byte. */
00222 d[len] = utf8_hibits[num_bytes];
00223 d[len] |= ch;
00224
00225 string->data = d;
00226 string->len += num_bytes;
00227 string->num_chars++;
00228
00229 return 1;
00230 }
|
Here is the call graph for this function:

|
|
Definition at line 188 of file wsutf8.c. References WsUtf8StringRec::data, string, ws_free(), and WsUtf8String. Referenced by ws_bc_encode(), and ws_yy_lex().
00189 {
00190 if (string == NULL)
00191 return;
00192
00193 ws_free(string->data);
00194 ws_free(string);
00195 }
|
Here is the call graph for this function:

|
|
Definition at line 368 of file wsutf8.c. References data, and ws_free(). Referenced by pragma_meta(), and ws_bc_encode().
00369 {
00370 if (data)
00371 ws_free(data);
00372 }
|
Here is the call graph for this function:

|
||||||||||||||||
|
Definition at line 293 of file wsutf8.c. References data, WsUtf8StringRec::data, WsUtf8StringRec::len, string, utf8_hidata_masks, WS_UTF8_DEC_TYPE, and WsUtf8String. Referenced by main(), and ws_utf8_to_latin1().
00295 {
00296 size_t pos = *posp;
00297 unsigned int num_bytes, i;
00298 unsigned char *data;
00299 unsigned long ch;
00300
00301 if (pos < 0 || pos >= string->len)
00302 /* Index out range. */
00303 return 0;
00304
00305 data = string->data + pos;
00306
00307 num_bytes = WS_UTF8_DEC_TYPE(*data);
00308 if (num_bytes == 0)
00309 /* Invalid position. */
00310 return 0;
00311
00312 if (pos + num_bytes > string->len)
00313 /* Truncated data. */
00314 return 0;
00315
00316 /* Get the first byte. */
00317 ch = data[0] & utf8_hidata_masks[num_bytes];
00318
00319 /* Add the continuation bytes. */
00320 for (i = 1; i < num_bytes; i++) {
00321 ch <<= 6;
00322 ch |= data[i] & WS_UTF8_CONT_DATA_MASK;
00323 }
00324
00325 *ch_return = ch;
00326 *posp = pos + num_bytes;
00327
00328 return 1;
00329 }
|
|
||||||||||||||||
|
Definition at line 266 of file wsutf8.c. References data, WsUtf8StringRec::data, WsUtf8StringRec::len, WsUtf8StringRec::num_chars, string, ws_free(), ws_memdup(), ws_utf8_verify(), and WsUtf8String. Referenced by ws_bc_encode().
00268 {
00269 size_t num_chars;
00270
00271 if (!ws_utf8_verify(data, len, &num_chars))
00272 /* Malformed data. */
00273 return 0;
00274
00275 /* Init `string' to empty. */
00276 ws_free(string->data);
00277 string->data = NULL;
00278 string->len = 0;
00279 string->num_chars = 0;
00280
00281 /* Set the new data. */
00282 string->data = ws_memdup(data, len);
00283 if (string->data == NULL)
00284 return 0;
00285
00286 string->len = len;
00287 string->num_chars = num_chars;
00288
00289 return 1;
00290 }
|
Here is the call graph for this function:

|
||||||||||||||||
|
Definition at line 332 of file wsutf8.c. References WsUtf8StringRec::num_chars, string, ws_fatal(), ws_malloc(), ws_utf8_get_char(), and WsUtf8String. Referenced by pragma_meta(), and ws_bc_encode().
00335 {
00336 unsigned char *cstr;
00337 size_t i;
00338 size_t pos = 0;
00339
00340 if (string == NULL)
00341 return NULL;
00342
00343 cstr = ws_malloc(string->num_chars + 1);
00344 if (cstr == NULL)
00345 return NULL;
00346
00347 for (i = 0; i < string->num_chars; i++) {
00348 unsigned long ch;
00349
00350 if (!ws_utf8_get_char(string, &ch, &pos))
00351 ws_fatal("ws_utf8_to_latin1_cstr(): internal inconsistency");
00352
00353 if (ch > 0xff)
00354 cstr[i] = unknown_char;
00355 else
00356 cstr[i] = (unsigned char) ch;
00357 }
00358
00359 cstr[i] = '\0';
00360
00361 if (len_return)
00362 *len_return = string->num_chars;
00363
00364 return cstr;
00365 }
|
Here is the call graph for this function:

|
||||||||||||||||
|
Definition at line 233 of file wsutf8.c. References data, WS_UTF8_DEC_C_P, and WS_UTF8_DEC_TYPE. Referenced by ws_bc_decode(), and ws_utf8_set_data().
00235 {
00236 unsigned int num_bytes, i;
00237 size_t strlen = 0;
00238
00239 while (len > 0) {
00240 num_bytes = WS_UTF8_DEC_TYPE(*data);
00241 if (num_bytes == 0)
00242 /* Not a valid beginning. */
00243 return 0;
00244
00245 if (len < num_bytes)
00246 /* The data is truncated. */
00247 return 0;
00248
00249 for (i = 1; i < num_bytes; i++)
00250 if (!WS_UTF8_DEC_C_P(data[i]))
00251 /* Not a valid continuation byte. */
00252 return 0;
00253
00254 len -= num_bytes;
00255 data += num_bytes;
00256 strlen++;
00257 }
00258
00259 if (strlen_return)
00260 *strlen_return = strlen;
00261
00262 return 1;
00263 }
|
|
|
Initial value:
{
0x00,
0x00,
0xc0,
0xe0,
0xf0,
0xf8,
0xfc,
}
Definition at line 87 of file wsutf8.c. Referenced by ws_utf8_append_char(). |
|
|
Initial value:
{
0x00,
0x7f,
0x1f,
0x0f,
0x07,
0x03,
0x01,
}
Definition at line 142 of file wsutf8.c. Referenced by ws_utf8_get_char(). |