Kannel: Open Source WAP and SMS gateway  svn-r5335
wsutf8.c File Reference
#include "wsint.h"

Go to the source code of this file.

Macros

#define WS_UTF8_ENC_1_M   0xffffff80
 
#define WS_UTF8_ENC_2_M   0xfffff800
 
#define WS_UTF8_ENC_3_M   0xffff0000
 
#define WS_UTF8_ENC_4_M   0xffe00000
 
#define WS_UTF8_ENC_5_M   0xfc000000
 
#define WS_UTF8_ENC_6_M   0x80000000
 
#define WS_UTF8_ENC_C_BITS   0x80
 
#define WS_UTF8_CONT_DATA_MASK   0x3f
 
#define WS_UTF8_ENC_TYPE(ch)
 
#define WS_UTF8_DEC_1_M   0x80
 
#define WS_UTF8_DEC_2_M   0xe0
 
#define WS_UTF8_DEC_3_M   0xf0
 
#define WS_UTF8_DEC_4_M   0xf8
 
#define WS_UTF8_DEC_5_M   0xfc
 
#define WS_UTF8_DEC_6_M   0xfe
 
#define WS_UTF8_DEC_1_V   0x00
 
#define WS_UTF8_DEC_2_V   0xc0
 
#define WS_UTF8_DEC_3_V   0xe0
 
#define WS_UTF8_DEC_4_V   0xf0
 
#define WS_UTF8_DEC_5_V   0xf8
 
#define WS_UTF8_DEC_6_V   0xfc
 
#define WS_UTF8_DEC_C_M   0xc0
 
#define WS_UTF8_DEC_C_V   0x80
 
#define WS_UTF8_DEC_TYPE(b)
 
#define WS_UTF8_DEC_C_P(b)   (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)
 

Functions

WsUtf8String * ws_utf8_alloc (void)
 
void ws_utf8_free (WsUtf8String *string)
 
int ws_utf8_append_char (WsUtf8String *string, unsigned long ch)
 
int ws_utf8_verify (const unsigned char *data, size_t len, size_t *strlen_return)
 
int ws_utf8_set_data (WsUtf8String *string, const unsigned char *data, size_t len)
 
int ws_utf8_get_char (const WsUtf8String *string, unsigned long *ch_return, size_t *posp)
 
unsigned char * ws_utf8_to_latin1 (const WsUtf8String *string, unsigned char unknown_char, size_t *len_return)
 
void ws_utf8_free_data (unsigned char *data)
 

Variables

static unsigned char utf8_hibits [7]
 
static unsigned char utf8_hidata_masks [7]
 

Macro Definition Documentation

◆ WS_UTF8_CONT_DATA_MASK

#define WS_UTF8_CONT_DATA_MASK   0x3f

Definition at line 102 of file wsutf8.c.

Referenced by ws_utf8_append_char(), and ws_utf8_get_char().

◆ WS_UTF8_DEC_1_M

#define WS_UTF8_DEC_1_M   0x80

Definition at line 125 of file wsutf8.c.

◆ WS_UTF8_DEC_1_V

#define WS_UTF8_DEC_1_V   0x00

Definition at line 132 of file wsutf8.c.

◆ WS_UTF8_DEC_2_M

#define WS_UTF8_DEC_2_M   0xe0

Definition at line 126 of file wsutf8.c.

◆ WS_UTF8_DEC_2_V

#define WS_UTF8_DEC_2_V   0xc0

Definition at line 133 of file wsutf8.c.

◆ WS_UTF8_DEC_3_M

#define WS_UTF8_DEC_3_M   0xf0

Definition at line 127 of file wsutf8.c.

◆ WS_UTF8_DEC_3_V

#define WS_UTF8_DEC_3_V   0xe0

Definition at line 134 of file wsutf8.c.

◆ WS_UTF8_DEC_4_M

#define WS_UTF8_DEC_4_M   0xf8

Definition at line 128 of file wsutf8.c.

◆ WS_UTF8_DEC_4_V

#define WS_UTF8_DEC_4_V   0xf0

Definition at line 135 of file wsutf8.c.

◆ WS_UTF8_DEC_5_M

#define WS_UTF8_DEC_5_M   0xfc

Definition at line 129 of file wsutf8.c.

◆ WS_UTF8_DEC_5_V

#define WS_UTF8_DEC_5_V   0xf8

Definition at line 136 of file wsutf8.c.

◆ WS_UTF8_DEC_6_M

#define WS_UTF8_DEC_6_M   0xfe

Definition at line 130 of file wsutf8.c.

◆ WS_UTF8_DEC_6_V

#define WS_UTF8_DEC_6_V   0xfc

Definition at line 137 of file wsutf8.c.

◆ WS_UTF8_DEC_C_M

#define WS_UTF8_DEC_C_M   0xc0

Definition at line 154 of file wsutf8.c.

◆ WS_UTF8_DEC_C_P

#define WS_UTF8_DEC_C_P (   b)    (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)

Definition at line 178 of file wsutf8.c.

Referenced by ws_utf8_verify().

◆ WS_UTF8_DEC_C_V

#define WS_UTF8_DEC_C_V   0x80

Definition at line 155 of file wsutf8.c.

◆ WS_UTF8_DEC_TYPE

#define WS_UTF8_DEC_TYPE (   b)
Value:
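(((b) & WS_UTF8_DEC_1_M) == WS_UTF8_DEC_1_V \
? 1 \
: (((b) & WS_UTF8_DEC_2_M) == WS_UTF8_DEC_2_V \
? 2 \
: (((b) & WS_UTF8_DEC_3_M) == WS_UTF8_DEC_3_V \
? 3 \
: (((b) & WS_UTF8_DEC_4_M) == WS_UTF8_DEC_4_V \
? 4 \
: (((b) & WS_UTF8_DEC_5_M) == WS_UTF8_DEC_5_V \
? 5 \
: (((b) & WS_UTF8_DEC_6_M) == WS_UTF8_DEC_6_V \
? 6 \
: 0))))))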
#define WS_UTF8_DEC_1_V
Definition: wsutf8.c:132
#define WS_UTF8_DEC_6_V
Definition: wsutf8.c:137
#define WS_UTF8_DEC_3_V
Definition: wsutf8.c:134
#define WS_UTF8_DEC_5_V
Definition: wsutf8.c:136
#define WS_UTF8_DEC_6_M
Definition: wsutf8.c:130
#define WS_UTF8_DEC_3_M
Definition: wsutf8.c:127
#define WS_UTF8_DEC_4_V
Definition: wsutf8.c:135
#define WS_UTF8_DEC_2_V
Definition: wsutf8.c:133
#define WS_UTF8_DEC_5_M
Definition: wsutf8.c:129
#define WS_UTF8_DEC_2_M
Definition: wsutf8.c:126
#define WS_UTF8_DEC_1_M
Definition: wsutf8.c:125
#define WS_UTF8_DEC_4_M
Definition: wsutf8.c:128

Definition at line 161 of file wsutf8.c.

Referenced by ws_utf8_get_char(), and ws_utf8_verify().

◆ WS_UTF8_ENC_1_M

#define WS_UTF8_ENC_1_M   0xffffff80

Definition at line 77 of file wsutf8.c.

◆ WS_UTF8_ENC_2_M

#define WS_UTF8_ENC_2_M   0xfffff800

Definition at line 78 of file wsutf8.c.

◆ WS_UTF8_ENC_3_M

#define WS_UTF8_ENC_3_M   0xffff0000

Definition at line 79 of file wsutf8.c.

◆ WS_UTF8_ENC_4_M

#define WS_UTF8_ENC_4_M   0xffe00000

Definition at line 80 of file wsutf8.c.

◆ WS_UTF8_ENC_5_M

#define WS_UTF8_ENC_5_M   0xfc000000

Definition at line 81 of file wsutf8.c.

◆ WS_UTF8_ENC_6_M

#define WS_UTF8_ENC_6_M   0x80000000

Definition at line 82 of file wsutf8.c.

◆ WS_UTF8_ENC_C_BITS

#define WS_UTF8_ENC_C_BITS   0x80

Definition at line 99 of file wsutf8.c.

Referenced by ws_utf8_append_char().

◆ WS_UTF8_ENC_TYPE

#define WS_UTF8_ENC_TYPE (   ch)
Value:
(((ch) & WS_UTF8_ENC_1_M) == 0 \
? 1 \
: (((ch) & WS_UTF8_ENC_2_M) == 0 \
? 2 \
: (((ch) & WS_UTF8_ENC_3_M) == 0 \
? 3 \
: (((ch) & WS_UTF8_ENC_4_M) == 0 \
? 4 \
: (((ch) & WS_UTF8_ENC_5_M) == 0 \
? 5 \
: (((ch) & WS_UTF8_ENC_6_M) == 0 \
? 6 \
: 0))))))
#define WS_UTF8_ENC_2_M
Definition: wsutf8.c:78
#define WS_UTF8_ENC_3_M
Definition: wsutf8.c:79
#define WS_UTF8_ENC_5_M
Definition: wsutf8.c:81
#define WS_UTF8_ENC_1_M
Definition: wsutf8.c:77
#define WS_UTF8_ENC_4_M
Definition: wsutf8.c:80
#define WS_UTF8_ENC_6_M
Definition: wsutf8.c:82

Definition at line 108 of file wsutf8.c.

Referenced by ws_utf8_append_char().

Function Documentation

◆ ws_utf8_alloc()

WsUtf8String* ws_utf8_alloc ( void  )

Definition at line 182 of file wsutf8.c.

References ws_calloc().

Referenced by ws_bc_encode(), and ws_yy_lex().

183 {
184  return ws_calloc(1, sizeof(WsUtf8String));
185 }
void * ws_calloc(size_t num, size_t size)
Definition: wsalloc.c:83

◆ ws_utf8_append_char()

int ws_utf8_append_char ( WsUtf8String *  string,
unsigned long  ch 
)

Definition at line 198 of file wsutf8.c.

References WsUtf8StringRec::data, WsUtf8StringRec::len, utf8_hibits, ws_fatal(), ws_realloc(), WS_UTF8_CONT_DATA_MASK, WS_UTF8_ENC_C_BITS, and WS_UTF8_ENC_TYPE.

Referenced by ws_yy_lex().

199 {
200  unsigned char *d;
201  unsigned int num_bytes = WS_UTF8_ENC_TYPE(ch);
202  unsigned int len, i;
203 
204  if (num_bytes == 0)
205  ws_fatal("ws_utf8_append_char(): 0x%lx is not a valid UTF-8 character",
206  ch);
207 
208  d = ws_realloc(string->data, string->len + num_bytes);
209  if (d == NULL)
210  return 0;
211 
212  len = string->len;
213 
214  /* Encode the continuation bytes (n > 1). */
215  for (i = num_bytes - 1; i > 0; i--) {
216  d[len + i] = WS_UTF8_ENC_C_BITS;
217  d[len + i] |= ch & WS_UTF8_CONT_DATA_MASK;
218  ch >>= 6;
219  }
220 
221  /* And continue the first byte. */
222  d[len] = utf8_hibits[num_bytes];
223  d[len] |= ch;
224 
225  string->data = d;
226  string->len += num_bytes;
227  string->num_chars++;
228 
229  return 1;
230 }
void ws_fatal(char *fmt,...)
Definition: wserror.c:91
size_t len
Definition: wsutf8.h:81
unsigned char * data
Definition: wsutf8.h:84
static unsigned char utf8_hibits[7]
Definition: wsutf8.c:87
void * ws_realloc(void *ptr, size_t size)
Definition: wsalloc.c:89
#define WS_UTF8_ENC_C_BITS
Definition: wsutf8.c:99
#define WS_UTF8_CONT_DATA_MASK
Definition: wsutf8.c:102
#define WS_UTF8_ENC_TYPE(ch)
Definition: wsutf8.c:108

◆ ws_utf8_free()

void ws_utf8_free ( WsUtf8String *  string)

Definition at line 188 of file wsutf8.c.

References WsUtf8StringRec::data, and ws_free().

Referenced by ws_bc_encode(), and ws_yy_lex().

189 {
190  if (string == NULL)
191  return;
192 
193  ws_free(string->data);
194  ws_free(string);
195 }
void ws_free(void *ptr)
Definition: wsalloc.c:139
unsigned char * data
Definition: wsutf8.h:84

◆ ws_utf8_free_data()

void ws_utf8_free_data ( unsigned char *  data)

Definition at line 368 of file wsutf8.c.

References ws_free().

Referenced by pragma_meta(), and ws_bc_encode().

369 {
370  if (data)
371  ws_free(data);
372 }
void ws_free(void *ptr)
Definition: wsalloc.c:139

◆ ws_utf8_get_char()

int ws_utf8_get_char ( const WsUtf8String *  string,
unsigned long *  ch_return,
size_t *  posp 
)

Definition at line 293 of file wsutf8.c.

References WsUtf8StringRec::len, utf8_hidata_masks, WS_UTF8_CONT_DATA_MASK, and WS_UTF8_DEC_TYPE.

Referenced by main(), and ws_utf8_to_latin1().

295 {
296  size_t pos = *posp;
297  unsigned int num_bytes, i;
298  unsigned char *data;
299  unsigned long ch;
300 
301  if (pos < 0 || pos >= string->len)
302  /* Index out range. */
303  return 0;
304 
305  data = string->data + pos;
306 
307  num_bytes = WS_UTF8_DEC_TYPE(*data);
308  if (num_bytes == 0)
309  /* Invalid position. */
310  return 0;
311 
312  if (pos + num_bytes > string->len)
313  /* Truncated data. */
314  return 0;
315 
316  /* Get the first byte. */
317  ch = data[0] & utf8_hidata_masks[num_bytes];
318 
319  /* Add the continuation bytes. */
320  for (i = 1; i < num_bytes; i++) {
321  ch <<= 6;
322  ch |= data[i] & WS_UTF8_CONT_DATA_MASK;
323  }
324 
325  *ch_return = ch;
326  *posp = pos + num_bytes;
327 
328  return 1;
329 }
size_t len
Definition: wsutf8.h:81
#define WS_UTF8_DEC_TYPE(b)
Definition: wsutf8.c:161
static unsigned char utf8_hidata_masks[7]
Definition: wsutf8.c:142
#define WS_UTF8_CONT_DATA_MASK
Definition: wsutf8.c:102

◆ ws_utf8_set_data()

int ws_utf8_set_data ( WsUtf8String *  string,
const unsigned char *  data,
size_t  len 
)

Definition at line 266 of file wsutf8.c.

References WsUtf8StringRec::data, ws_free(), ws_memdup(), and ws_utf8_verify().

Referenced by ws_bc_encode().

268 {
269  size_t num_chars;
270 
271  if (!ws_utf8_verify(data, len, &num_chars))
272  /* Malformed data. */
273  return 0;
274 
275  /* Init `string' to empty. */
276  ws_free(string->data);
277  string->data = NULL;
278  string->len = 0;
279  string->num_chars = 0;
280 
281  /* Set the new data. */
282  string->data = ws_memdup(data, len);
283  if (string->data == NULL)
284  return 0;
285 
286  string->len = len;
287  string->num_chars = num_chars;
288 
289  return 1;
290 }
void ws_free(void *ptr)
Definition: wsalloc.c:139
unsigned char * data
Definition: wsutf8.h:84
void * ws_memdup(const void *ptr, size_t size)
Definition: wsalloc.c:105
int ws_utf8_verify(const unsigned char *data, size_t len, size_t *strlen_return)
Definition: wsutf8.c:233

◆ ws_utf8_to_latin1()

unsigned char* ws_utf8_to_latin1 ( const WsUtf8String *  string,
unsigned char  unknown_char,
size_t *  len_return 
)

Definition at line 332 of file wsutf8.c.

References WsUtf8StringRec::num_chars, ws_fatal(), ws_malloc(), and ws_utf8_get_char().

Referenced by pragma_meta(), and ws_bc_encode().

335 {
336  unsigned char *cstr;
337  size_t i;
338  size_t pos = 0;
339 
340  if (string == NULL)
341  return NULL;
342 
343  cstr = ws_malloc(string->num_chars + 1);
344  if (cstr == NULL)
345  return NULL;
346 
347  for (i = 0; i < string->num_chars; i++) {
348  unsigned long ch;
349 
350  if (!ws_utf8_get_char(string, &ch, &pos))
351  ws_fatal("ws_utf8_to_latin1_cstr(): internal inconsistency");
352 
353  if (ch > 0xff)
354  cstr[i] = unknown_char;
355  else
356  cstr[i] = (unsigned char) ch;
357  }
358 
359  cstr[i] = '\0';
360 
361  if (len_return)
362  *len_return = string->num_chars;
363 
364  return cstr;
365 }
void ws_fatal(char *fmt,...)
Definition: wserror.c:91
int ws_utf8_get_char(const WsUtf8String *string, unsigned long *ch_return, size_t *posp)
Definition: wsutf8.c:293
size_t num_chars
Definition: wsutf8.h:87
void * ws_malloc(size_t size)
Definition: wsalloc.c:77

◆ ws_utf8_verify()

int ws_utf8_verify ( const unsigned char *  data,
size_t  len,
size_t *  strlen_return 
)

Definition at line 233 of file wsutf8.c.

References WS_UTF8_DEC_C_P, and WS_UTF8_DEC_TYPE.

Referenced by ws_bc_decode(), and ws_utf8_set_data().

235 {
236  unsigned int num_bytes, i;
237  size_t strlen = 0;
238 
239  while (len > 0) {
240  num_bytes = WS_UTF8_DEC_TYPE(*data);
241  if (num_bytes == 0)
242  /* Not a valid beginning. */
243  return 0;
244 
245  if (len < num_bytes)
246  /* The data is truncated. */
247  return 0;
248 
249  for (i = 1; i < num_bytes; i++)
250  if (!WS_UTF8_DEC_C_P(data[i]))
251  /* Not a valid continuation byte. */
252  return 0;
253 
254  len -= num_bytes;
255  data += num_bytes;
256  strlen++;
257  }
258 
259  if (strlen_return)
260  *strlen_return = strlen;
261 
262  return 1;
263 }
#define WS_UTF8_DEC_TYPE(b)
Definition: wsutf8.c:161
#define WS_UTF8_DEC_C_P(b)
Definition: wsutf8.c:178

Variable Documentation

◆ utf8_hibits

unsigned char utf8_hibits[7]
static
Initial value:
=
{
0x00,
0x00,
0xc0,
0xe0,
0xf0,
0xf8,
0xfc,
}

Definition at line 87 of file wsutf8.c.

Referenced by ws_utf8_append_char().

◆ utf8_hidata_masks

unsigned char utf8_hidata_masks[7]
static
Initial value:
=
{
0x00,
0x7f,
0x1f,
0x0f,
0x07,
0x03,
0x01,
}

Definition at line 142 of file wsutf8.c.

Referenced by ws_utf8_get_char().

See file LICENSE for details about the license agreement for using, modifying, copying or deriving work from this software.