Kannel: Open Source WAP and SMS gateway  $Revision: 5037 $
wsutf8.c
Go to the documentation of this file.
1 /* ====================================================================
2  * The Kannel Software License, Version 1.0
3  *
4  * Copyright (c) 2001-2016 Kannel Group
5  * Copyright (c) 1998-2001 WapIT Ltd.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in
17  * the documentation and/or other materials provided with the
18  * distribution.
19  *
20  * 3. The end-user documentation included with the redistribution,
21  * if any, must include the following acknowledgment:
22  * "This product includes software developed by the
23  * Kannel Group (http://www.kannel.org/)."
24  * Alternately, this acknowledgment may appear in the software itself,
25  * if and wherever such third-party acknowledgments normally appear.
26  *
27  * 4. The names "Kannel" and "Kannel Group" must not be used to
28  * endorse or promote products derived from this software without
29  * prior written permission. For written permission, please
30  * contact org@kannel.org.
31  *
32  * 5. Products derived from this software may not be called "Kannel",
33  * nor may "Kannel" appear in their name, without prior written
34  * permission of the Kannel Group.
35  *
36  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39  * DISCLAIMED. IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS
40  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
41  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
42  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
43  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
44  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
45  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
46  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  *
49  * This software consists of voluntary contributions made by many
50  * individuals on behalf of the Kannel Group. For more information on
51  * the Kannel Group, please see <http://www.kannel.org/>.
52  *
53  * Portions of this software are based upon software originally written at
54  * WapIT Ltd., Helsinki, Finland for the Kannel project.
55  */
56 
57 /*
58  *
59  * wsutf8.c
60  *
61  * Author: Markku Rossi <mtr@iki.fi>
62  *
63  * Copyright (c) 1999-2000 WAPIT OY LTD.
64  * All rights reserved.
65  *
66  * Functions to manipulate UTF-8 encoded strings.
67  *
68  * Specification: RFC-2279
69  *
70  */
71 
72 #include "wsint.h"
73 
74 /********************* Types and definitions ****************************/
75 
/* Masks to determine the UTF-8 encoding of an ISO 10646 character.
   WS_UTF8_ENC_<n>_M selects every bit of an `unsigned long' that does
   NOT fit into an <n>-byte encoding (7, 11, 16, 21, 26 and 31 payload
   bits respectively).

   The masks are written as complements of `unsigned long' constants.
   A plain 32-bit constant such as 0xffffff80 is zero-extended when it
   is combined with a wider `unsigned long', so bits above bit 31
   would be ignored and out-of-range values would be classified as
   encodable.  On platforms with a 32-bit `unsigned long' the values
   are unchanged. */
#define WS_UTF8_ENC_1_M (~0x0000007fUL)
#define WS_UTF8_ENC_2_M (~0x000007ffUL)
#define WS_UTF8_ENC_3_M (~0x0000ffffUL)
#define WS_UTF8_ENC_4_M (~0x001fffffUL)
#define WS_UTF8_ENC_5_M (~0x03ffffffUL)
#define WS_UTF8_ENC_6_M (~0x7fffffffUL)
83 
/* The high-order bits of the first byte of an encoding.  This array
   can be indexed with the number of bytes in the encoding to get the
   initialization mask for the high-order bits.  Index 0 is unused.
   The table is only ever read, so it is declared `const'. */
static const unsigned char utf8_hibits[7] = {
    0x00,   /* unused */
    0x00,   /* 1 byte */
    0xc0,   /* 2 bytes */
    0xe0,   /* 3 bytes */
    0xf0,   /* 4 bytes */
    0xf8,   /* 5 bytes */
    0xfc,   /* 6 bytes */
};
97 
/* Constants for building continuation bytes, which have the form
   10xxxxxx: the two fixed high-order bits, and the mask selecting the
   six payload bits taken from the character. */
#define WS_UTF8_ENC_C_BITS      0x80    /* 10000000 */
#define WS_UTF8_CONT_DATA_MASK  0x3f    /* 00111111 */
103 
/* Classify the ISO 10646 character `ch' (an `unsigned long') by the
   length of its UTF-8 encoding.  The masks WS_UTF8_ENC_1_M through
   WS_UTF8_ENC_6_M are tried in order and the first one that leaves no
   bits set gives the result, 1..6.  The macro yields 0 when `ch' can
   not be encoded as UTF-8.  Note that `ch' is evaluated more than
   once. */
#define WS_UTF8_ENC_TYPE(ch)                    \
    (!((ch) & WS_UTF8_ENC_1_M) ? 1              \
     : !((ch) & WS_UTF8_ENC_2_M) ? 2            \
     : !((ch) & WS_UTF8_ENC_3_M) ? 3            \
     : !((ch) & WS_UTF8_ENC_4_M) ? 4            \
     : !((ch) & WS_UTF8_ENC_5_M) ? 5            \
     : !((ch) & WS_UTF8_ENC_6_M) ? 6            \
     : 0)
122 
/* Lead-byte patterns of UTF-8 encoded characters.  The first byte `b'
   of an <n>-byte encoding satisfies
   (b & WS_UTF8_DEC_<n>_M) == WS_UTF8_DEC_<n>_V. */
#define WS_UTF8_DEC_1_M 0x80    /* 0xxxxxxx */
#define WS_UTF8_DEC_1_V 0x00

#define WS_UTF8_DEC_2_M 0xe0    /* 110xxxxx */
#define WS_UTF8_DEC_2_V 0xc0

#define WS_UTF8_DEC_3_M 0xf0    /* 1110xxxx */
#define WS_UTF8_DEC_3_V 0xe0

#define WS_UTF8_DEC_4_M 0xf8    /* 11110xxx */
#define WS_UTF8_DEC_4_V 0xf0

#define WS_UTF8_DEC_5_M 0xfc    /* 111110xx */
#define WS_UTF8_DEC_5_V 0xf8

#define WS_UTF8_DEC_6_M 0xfe    /* 1111110x */
#define WS_UTF8_DEC_6_V 0xfc
138 
/* Masks to get the data bits from the first byte of an UTF-8 encoded
   character.  This array can be indexed with the number of bytes in
   the encoding.  Index 0 is unused.  The table is only ever read, so
   it is declared `const'. */
static const unsigned char utf8_hidata_masks[7] = {
    0x00,   /* unused */
    0x7f,   /* 1 byte */
    0x1f,   /* 2 bytes */
    0x0f,   /* 3 bytes */
    0x07,   /* 4 bytes */
    0x03,   /* 5 bytes */
    0x01,   /* 6 bytes */
};
152 
/* Pattern of a continuation byte (10xxxxxx): a byte `b' is a
   continuation byte when (b & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V. */
#define WS_UTF8_DEC_C_M         0xc0    /* 11000000 */
#define WS_UTF8_DEC_C_V         0x80    /* 10000000 */
156 
/* Tell from the first byte `b' (an `unsigned char') how many bytes
   the UTF-8 encoding starting there occupies.  The lead-byte patterns
   for 1..6 bytes are tried in order; the macro yields the matching
   length, or 0 if `b' is not a valid UTF-8 first byte.  Note that `b'
   is evaluated more than once. */
#define WS_UTF8_DEC_TYPE(b)                                     \
    (((b) & WS_UTF8_DEC_1_M) == WS_UTF8_DEC_1_V ? 1             \
     : ((b) & WS_UTF8_DEC_2_M) == WS_UTF8_DEC_2_V ? 2           \
     : ((b) & WS_UTF8_DEC_3_M) == WS_UTF8_DEC_3_V ? 3           \
     : ((b) & WS_UTF8_DEC_4_M) == WS_UTF8_DEC_4_V ? 4           \
     : ((b) & WS_UTF8_DEC_5_M) == WS_UTF8_DEC_5_V ? 5           \
     : ((b) & WS_UTF8_DEC_6_M) == WS_UTF8_DEC_6_V ? 6           \
     : 0)
175 
/* Non-zero when the `unsigned char' byte `b' has the continuation
   byte pattern, i.e. its bits under WS_UTF8_DEC_C_M equal
   WS_UTF8_DEC_C_V. */
#define WS_UTF8_DEC_C_P(b) \
    (WS_UTF8_DEC_C_V == ((b) & WS_UTF8_DEC_C_M))
179 
180 /********************* Global functions *********************************/
181 
183 {
184  return ws_calloc(1, sizeof(WsUtf8String));
185 }
186 
187 
189 {
190  if (string == NULL)
191  return;
192 
193  ws_free(string->data);
194  ws_free(string);
195 }
196 
197 
198 int ws_utf8_append_char(WsUtf8String *string, unsigned long ch)
199 {
200  unsigned char *d;
201  unsigned int num_bytes = WS_UTF8_ENC_TYPE(ch);
202  unsigned int len, i;
203 
204  if (num_bytes == 0)
205  ws_fatal("ws_utf8_append_char(): 0x%lx is not a valid UTF-8 character",
206  ch);
207 
208  d = ws_realloc(string->data, string->len + num_bytes);
209  if (d == NULL)
210  return 0;
211 
212  len = string->len;
213 
214  /* Encode the continuation bytes (n > 1). */
215  for (i = num_bytes - 1; i > 0; i--) {
216  d[len + i] = WS_UTF8_ENC_C_BITS;
217  d[len + i] |= ch & WS_UTF8_CONT_DATA_MASK;
218  ch >>= 6;
219  }
220 
221  /* And continue the first byte. */
222  d[len] = utf8_hibits[num_bytes];
223  d[len] |= ch;
224 
225  string->data = d;
226  string->len += num_bytes;
227  string->num_chars++;
228 
229  return 1;
230 }
231 
232 
/* Check that the `len' bytes at `data' form a sequence of complete
   UTF-8 encoded characters: each character must begin with a valid
   first byte, must fit into the remaining data, and must be followed
   by the right number of continuation bytes.

   Returns 1 if the data passes, 0 otherwise.  On success the number
   of characters is stored in `strlen_return' unless that is NULL. */
int ws_utf8_verify(const unsigned char *data, size_t len,
                   size_t *strlen_return)
{
    const unsigned char *p = data;
    size_t remaining = len;
    size_t count = 0;

    while (remaining != 0) {
        unsigned int width = WS_UTF8_DEC_TYPE(*p);
        unsigned int k;

        /* Reject an invalid first byte and a character that would
           run past the end of the data. */
        if (width == 0 || remaining < width)
            return 0;

        /* Everything after the first byte must be a continuation
           byte. */
        for (k = 1; k < width; k++) {
            if (!WS_UTF8_DEC_C_P(p[k]))
                return 0;
        }

        p += width;
        remaining -= width;
        count++;
    }

    if (strlen_return != NULL)
        *strlen_return = count;

    return 1;
}
264 
265 
266 int ws_utf8_set_data(WsUtf8String *string, const unsigned char *data,
267  size_t len)
268 {
269  size_t num_chars;
270 
271  if (!ws_utf8_verify(data, len, &num_chars))
272  /* Malformed data. */
273  return 0;
274 
275  /* Init `string' to empty. */
276  ws_free(string->data);
277  string->data = NULL;
278  string->len = 0;
279  string->num_chars = 0;
280 
281  /* Set the new data. */
282  string->data = ws_memdup(data, len);
283  if (string->data == NULL)
284  return 0;
285 
286  string->len = len;
287  string->num_chars = num_chars;
288 
289  return 1;
290 }
291 
292 
293 int ws_utf8_get_char(const WsUtf8String *string, unsigned long *ch_return,
294  size_t *posp)
295 {
296  size_t pos = *posp;
297  unsigned int num_bytes, i;
298  unsigned char *data;
299  unsigned long ch;
300 
301  if (pos < 0 || pos >= string->len)
302  /* Index out range. */
303  return 0;
304 
305  data = string->data + pos;
306 
307  num_bytes = WS_UTF8_DEC_TYPE(*data);
308  if (num_bytes == 0)
309  /* Invalid position. */
310  return 0;
311 
312  if (pos + num_bytes > string->len)
313  /* Truncated data. */
314  return 0;
315 
316  /* Get the first byte. */
317  ch = data[0] & utf8_hidata_masks[num_bytes];
318 
319  /* Add the continuation bytes. */
320  for (i = 1; i < num_bytes; i++) {
321  ch <<= 6;
322  ch |= data[i] & WS_UTF8_CONT_DATA_MASK;
323  }
324 
325  *ch_return = ch;
326  *posp = pos + num_bytes;
327 
328  return 1;
329 }
330 
331 
332 unsigned char *ws_utf8_to_latin1(const WsUtf8String *string,
333  unsigned char unknown_char,
334  size_t *len_return)
335 {
336  unsigned char *cstr;
337  size_t i;
338  size_t pos = 0;
339 
340  if (string == NULL)
341  return NULL;
342 
343  cstr = ws_malloc(string->num_chars + 1);
344  if (cstr == NULL)
345  return NULL;
346 
347  for (i = 0; i < string->num_chars; i++) {
348  unsigned long ch;
349 
350  if (!ws_utf8_get_char(string, &ch, &pos))
351  ws_fatal("ws_utf8_to_latin1_cstr(): internal inconsistency");
352 
353  if (ch > 0xff)
354  cstr[i] = unknown_char;
355  else
356  cstr[i] = (unsigned char) ch;
357  }
358 
359  cstr[i] = '\0';
360 
361  if (len_return)
362  *len_return = string->num_chars;
363 
364  return cstr;
365 }
366 
367 
/* Release the buffer `data' with ws_free().  A NULL argument is
   ignored. */
void ws_utf8_free_data(unsigned char *data)
{
    if (data == NULL)
        return;

    ws_free(data);
}
void ws_fatal(char *fmt,...)
Definition: wserror.c:91
void * ws_calloc(size_t num, size_t size)
Definition: wsalloc.c:83
size_t len
Definition: wsutf8.h:81
void ws_free(void *ptr)
Definition: wsalloc.c:139
unsigned char * data
Definition: wsutf8.h:84
static unsigned char utf8_hibits[7]
Definition: wsutf8.c:87
void * ws_realloc(void *ptr, size_t size)
Definition: wsalloc.c:89
#define WS_UTF8_DEC_TYPE(b)
Definition: wsutf8.c:161
void ws_utf8_free(WsUtf8String *string)
Definition: wsutf8.c:188
static unsigned char utf8_hidata_masks[7]
Definition: wsutf8.c:142
#define WS_UTF8_ENC_C_BITS
Definition: wsutf8.c:99
#define WS_UTF8_CONT_DATA_MASK
Definition: wsutf8.c:102
int ws_utf8_append_char(WsUtf8String *string, unsigned long ch)
Definition: wsutf8.c:198
WsUtf8String * ws_utf8_alloc()
Definition: wsutf8.c:182
int ws_utf8_set_data(WsUtf8String *string, const unsigned char *data, size_t len)
Definition: wsutf8.c:266
void ws_utf8_free_data(unsigned char *data)
Definition: wsutf8.c:368
void * ws_memdup(const void *ptr, size_t size)
Definition: wsalloc.c:105
#define WS_UTF8_DEC_C_P(b)
Definition: wsutf8.c:178
int ws_utf8_get_char(const WsUtf8String *string, unsigned long *ch_return, size_t *posp)
Definition: wsutf8.c:293
int ws_utf8_verify(const unsigned char *data, size_t len, size_t *strlen_return)
Definition: wsutf8.c:233
size_t num_chars
Definition: wsutf8.h:87
#define WS_UTF8_ENC_TYPE(ch)
Definition: wsutf8.c:108
void * ws_malloc(size_t size)
Definition: wsalloc.c:77
unsigned char * ws_utf8_to_latin1(const WsUtf8String *string, unsigned char unknown_char, size_t *len_return)
Definition: wsutf8.c:332
See file LICENSE for details about the license agreement for using, modifying, copying or deriving work from this software.