Main Page | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals

wsutf8.c File Reference

#include "wsint.h"

Include dependency graph for wsutf8.c:

Include dependency graph

Go to the source code of this file.

Defines

#define WS_UTF8_ENC_1_M   0xffffff80
#define WS_UTF8_ENC_2_M   0xfffff800
#define WS_UTF8_ENC_3_M   0xffff0000
#define WS_UTF8_ENC_4_M   0xffe00000
#define WS_UTF8_ENC_5_M   0xfc000000
#define WS_UTF8_ENC_6_M   0x80000000
#define WS_UTF8_ENC_C_BITS   0x80
#define WS_UTF8_CONT_DATA_MASK   0x3f
#define WS_UTF8_ENC_TYPE(ch)
#define WS_UTF8_DEC_1_M   0x80
#define WS_UTF8_DEC_2_M   0xe0
#define WS_UTF8_DEC_3_M   0xf0
#define WS_UTF8_DEC_4_M   0xf8
#define WS_UTF8_DEC_5_M   0xfc
#define WS_UTF8_DEC_6_M   0xfe
#define WS_UTF8_DEC_1_V   0x00
#define WS_UTF8_DEC_2_V   0xc0
#define WS_UTF8_DEC_3_V   0xe0
#define WS_UTF8_DEC_4_V   0xf0
#define WS_UTF8_DEC_5_V   0xf8
#define WS_UTF8_DEC_6_V   0xfc
#define WS_UTF8_DEC_C_M   0xc0
#define WS_UTF8_DEC_C_V   0x80
#define WS_UTF8_DEC_TYPE(b)
#define WS_UTF8_DEC_C_P(b)   (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)

Functions

WsUtf8String * ws_utf8_alloc (void)
void ws_utf8_free (WsUtf8String *string)
int ws_utf8_append_char (WsUtf8String *string, unsigned long ch)
int ws_utf8_verify (const unsigned char *data, size_t len, size_t *strlen_return)
int ws_utf8_set_data (WsUtf8String *string, const unsigned char *data, size_t len)
int ws_utf8_get_char (const WsUtf8String *string, unsigned long *ch_return, size_t *posp)
unsigned char * ws_utf8_to_latin1 (const WsUtf8String *string, unsigned char unknown_char, size_t *len_return)
void ws_utf8_free_data (unsigned char *data)

Variables

unsigned char utf8_hibits [7]
unsigned char utf8_hidata_masks [7]


Define Documentation

#define WS_UTF8_CONT_DATA_MASK   0x3f
 

Definition at line 102 of file wsutf8.c.

#define WS_UTF8_DEC_1_M   0x80
 

Definition at line 125 of file wsutf8.c.

#define WS_UTF8_DEC_1_V   0x00
 

Definition at line 132 of file wsutf8.c.

#define WS_UTF8_DEC_2_M   0xe0
 

Definition at line 126 of file wsutf8.c.

#define WS_UTF8_DEC_2_V   0xc0
 

Definition at line 133 of file wsutf8.c.

#define WS_UTF8_DEC_3_M   0xf0
 

Definition at line 127 of file wsutf8.c.

#define WS_UTF8_DEC_3_V   0xe0
 

Definition at line 134 of file wsutf8.c.

#define WS_UTF8_DEC_4_M   0xf8
 

Definition at line 128 of file wsutf8.c.

#define WS_UTF8_DEC_4_V   0xf0
 

Definition at line 135 of file wsutf8.c.

#define WS_UTF8_DEC_5_M   0xfc
 

Definition at line 129 of file wsutf8.c.

#define WS_UTF8_DEC_5_V   0xf8
 

Definition at line 136 of file wsutf8.c.

#define WS_UTF8_DEC_6_M   0xfe
 

Definition at line 130 of file wsutf8.c.

#define WS_UTF8_DEC_6_V   0xfc
 

Definition at line 137 of file wsutf8.c.

#define WS_UTF8_DEC_C_M   0xc0
 

Definition at line 154 of file wsutf8.c.

#define WS_UTF8_DEC_C_P(b)   (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)
 

Definition at line 178 of file wsutf8.c.

Referenced by ws_utf8_verify().

#define WS_UTF8_DEC_C_V   0x80
 

Definition at line 155 of file wsutf8.c.

#define WS_UTF8_DEC_TYPE(b)
 

Value:

(((b) & WS_UTF8_DEC_1_M) == WS_UTF8_DEC_1_V         \
     ? 1                            \
     : (((b) & WS_UTF8_DEC_2_M) == WS_UTF8_DEC_2_V      \
       ? 2                          \
       : (((b) & WS_UTF8_DEC_3_M) == WS_UTF8_DEC_3_V        \
         ? 3                            \
         : (((b) & WS_UTF8_DEC_4_M) == WS_UTF8_DEC_4_V      \
           ? 4                          \
           : (((b) & WS_UTF8_DEC_5_M) == WS_UTF8_DEC_5_V    \
             ? 5                        \
             : (((b) & WS_UTF8_DEC_6_M) == WS_UTF8_DEC_6_V  \
               ? 6                      \
               : 0))))))

Definition at line 161 of file wsutf8.c.

Referenced by ws_utf8_get_char(), and ws_utf8_verify().

#define WS_UTF8_ENC_1_M   0xffffff80
 

Definition at line 77 of file wsutf8.c.

#define WS_UTF8_ENC_2_M   0xfffff800
 

Definition at line 78 of file wsutf8.c.

#define WS_UTF8_ENC_3_M   0xffff0000
 

Definition at line 79 of file wsutf8.c.

#define WS_UTF8_ENC_4_M   0xffe00000
 

Definition at line 80 of file wsutf8.c.

#define WS_UTF8_ENC_5_M   0xfc000000
 

Definition at line 81 of file wsutf8.c.

#define WS_UTF8_ENC_6_M   0x80000000
 

Definition at line 82 of file wsutf8.c.

#define WS_UTF8_ENC_C_BITS   0x80
 

Definition at line 99 of file wsutf8.c.

#define WS_UTF8_ENC_TYPE(ch)
 

Value:

(((ch) & WS_UTF8_ENC_1_M) == 0      \
     ? 1                    \
     : (((ch) & WS_UTF8_ENC_2_M) == 0       \
       ? 2                  \
       : (((ch) & WS_UTF8_ENC_3_M) == 0     \
         ? 3                    \
         : (((ch) & WS_UTF8_ENC_4_M) == 0   \
           ? 4                  \
           : (((ch) & WS_UTF8_ENC_5_M) == 0 \
             ? 5                \
             : (((ch) & WS_UTF8_ENC_6_M) == 0   \
               ? 6              \
               : 0))))))

Definition at line 108 of file wsutf8.c.

Referenced by ws_utf8_append_char().


Function Documentation

WsUtf8String* ws_utf8_alloc (void)
 

Definition at line 182 of file wsutf8.c.

References ws_calloc(), and WsUtf8String.

Referenced by ws_bc_encode(), and ws_yy_lex().

00183 {
00184     return ws_calloc(1, sizeof(WsUtf8String));
00185 }

Here is the call graph for this function:

int ws_utf8_append_char (WsUtf8String *string, unsigned long ch)
 

Definition at line 198 of file wsutf8.c.

References d, WsUtf8StringRec::data, WsUtf8StringRec::len, WsUtf8StringRec::num_chars, string, utf8_hibits, ws_fatal(), ws_realloc(), WS_UTF8_ENC_TYPE, and WsUtf8String.

Referenced by ws_yy_lex().

00199 {
00200     unsigned char *d;
00201     unsigned int num_bytes = WS_UTF8_ENC_TYPE(ch);
00202     unsigned int len, i;
00203 
00204     if (num_bytes == 0)
00205         ws_fatal("ws_utf8_append_char(): 0x%lx is not a valid UTF-8 character",
00206                  ch);
00207 
00208     d = ws_realloc(string->data, string->len + num_bytes);
00209     if (d == NULL)
00210         return 0;
00211 
00212     len = string->len;
00213 
00214     /* Encode the continuation bytes (n > 1). */
00215     for (i = num_bytes - 1; i > 0; i--) {
00216         d[len + i] = WS_UTF8_ENC_C_BITS;
00217         d[len + i] |= ch & WS_UTF8_CONT_DATA_MASK;
00218         ch >>= 6;
00219     }
00220 
00221     /* And continue the first byte. */
00222     d[len] = utf8_hibits[num_bytes];
00223     d[len] |= ch;
00224 
00225     string->data = d;
00226     string->len += num_bytes;
00227     string->num_chars++;
00228 
00229     return 1;
00230 }

Here is the call graph for this function:

void ws_utf8_free (WsUtf8String *string)
 

Definition at line 188 of file wsutf8.c.

References WsUtf8StringRec::data, string, ws_free(), and WsUtf8String.

Referenced by ws_bc_encode(), and ws_yy_lex().

00189 {
00190     if (string == NULL)
00191         return;
00192 
00193     ws_free(string->data);
00194     ws_free(string);
00195 }

Here is the call graph for this function:

void ws_utf8_free_data (unsigned char *data)
 

Definition at line 368 of file wsutf8.c.

References data, and ws_free().

Referenced by pragma_meta(), and ws_bc_encode().

00369 {
00370     if (data)
00371         ws_free(data);
00372 }

Here is the call graph for this function:

int ws_utf8_get_char (const WsUtf8String *string, unsigned long *ch_return, size_t *posp)
 

Definition at line 293 of file wsutf8.c.

References data, WsUtf8StringRec::data, WsUtf8StringRec::len, string, utf8_hidata_masks, WS_UTF8_DEC_TYPE, and WsUtf8String.

Referenced by main(), and ws_utf8_to_latin1().

00295 {
00296     size_t pos = *posp;
00297     unsigned int num_bytes, i;
00298     unsigned char *data;
00299     unsigned long ch;
00300 
00301     if (pos < 0 || pos >= string->len)
00302         /* Index out range. */
00303         return 0;
00304 
00305     data = string->data + pos;
00306 
00307     num_bytes = WS_UTF8_DEC_TYPE(*data);
00308     if (num_bytes == 0)
00309         /* Invalid position. */
00310         return 0;
00311 
00312     if (pos + num_bytes > string->len)
00313         /* Truncated data. */
00314         return 0;
00315 
00316     /* Get the first byte. */
00317     ch = data[0] & utf8_hidata_masks[num_bytes];
00318 
00319     /* Add the continuation bytes. */
00320     for (i = 1; i < num_bytes; i++) {
00321         ch <<= 6;
00322         ch |= data[i] & WS_UTF8_CONT_DATA_MASK;
00323     }
00324 
00325     *ch_return = ch;
00326     *posp = pos + num_bytes;
00327 
00328     return 1;
00329 }

int ws_utf8_set_data (WsUtf8String *string, const unsigned char *data, size_t len)
 

Definition at line 266 of file wsutf8.c.

References data, WsUtf8StringRec::data, WsUtf8StringRec::len, WsUtf8StringRec::num_chars, string, ws_free(), ws_memdup(), ws_utf8_verify(), and WsUtf8String.

Referenced by ws_bc_encode().

00268 {
00269     size_t num_chars;
00270 
00271     if (!ws_utf8_verify(data, len, &num_chars))
00272         /* Malformed data. */
00273         return 0;
00274 
00275     /* Init `string' to empty. */
00276     ws_free(string->data);
00277     string->data = NULL;
00278     string->len = 0;
00279     string->num_chars = 0;
00280 
00281     /* Set the new data. */
00282     string->data = ws_memdup(data, len);
00283     if (string->data == NULL)
00284         return 0;
00285 
00286     string->len = len;
00287     string->num_chars = num_chars;
00288 
00289     return 1;
00290 }

Here is the call graph for this function:

unsigned char* ws_utf8_to_latin1 (const WsUtf8String *string, unsigned char unknown_char, size_t *len_return)
 

Definition at line 332 of file wsutf8.c.

References WsUtf8StringRec::num_chars, string, ws_fatal(), ws_malloc(), ws_utf8_get_char(), and WsUtf8String.

Referenced by pragma_meta(), and ws_bc_encode().

00335 {
00336     unsigned char *cstr;
00337     size_t i;
00338     size_t pos = 0;
00339 
00340     if (string == NULL)
00341         return NULL;
00342 
00343     cstr = ws_malloc(string->num_chars + 1);
00344     if (cstr == NULL)
00345         return NULL;
00346 
00347     for (i = 0; i < string->num_chars; i++) {
00348         unsigned long ch;
00349 
00350         if (!ws_utf8_get_char(string, &ch, &pos))
00351             ws_fatal("ws_utf8_to_latin1_cstr(): internal inconsistency");
00352 
00353         if (ch > 0xff)
00354             cstr[i] = unknown_char;
00355         else
00356             cstr[i] = (unsigned char) ch;
00357     }
00358 
00359     cstr[i] = '\0';
00360 
00361     if (len_return)
00362         *len_return = string->num_chars;
00363 
00364     return cstr;
00365 }

Here is the call graph for this function:

int ws_utf8_verify (const unsigned char *data, size_t len, size_t *strlen_return)
 

Definition at line 233 of file wsutf8.c.

References data, WS_UTF8_DEC_C_P, and WS_UTF8_DEC_TYPE.

Referenced by ws_bc_decode(), and ws_utf8_set_data().

00235 {
00236     unsigned int num_bytes, i;
00237     size_t strlen = 0;
00238 
00239     while (len > 0) {
00240         num_bytes = WS_UTF8_DEC_TYPE(*data);
00241         if (num_bytes == 0)
00242             /* Not a valid beginning. */
00243             return 0;
00244 
00245         if (len < num_bytes)
00246             /* The data is truncated. */
00247             return 0;
00248 
00249         for (i = 1; i < num_bytes; i++)
00250             if (!WS_UTF8_DEC_C_P(data[i]))
00251                 /* Not a valid continuation byte. */
00252                 return 0;
00253 
00254         len -= num_bytes;
00255         data += num_bytes;
00256         strlen++;
00257     }
00258 
00259     if (strlen_return)
00260         *strlen_return = strlen;
00261 
00262     return 1;
00263 }


Variable Documentation

unsigned char utf8_hibits[7] [static]
 

Initial value:

    {
        0x00,               
        0x00,               
        0xc0,               
        0xe0,               
        0xf0,               
        0xf8,               
        0xfc,               
    }

Definition at line 87 of file wsutf8.c.

Referenced by ws_utf8_append_char().

unsigned char utf8_hidata_masks[7] [static]
 

Initial value:

    {
        0x00,               
        0x7f,               
        0x1f,               
        0x0f,               
        0x07,               
        0x03,               
        0x01,               
    }

Definition at line 142 of file wsutf8.c.

Referenced by ws_utf8_get_char().

See file LICENSE for details about the license agreement for using, modifying, copying or deriving work from this software.