#include "gwlib/gwlib.h"
#include <errno.h>
#include <iconv.h>
#include "gwlib/latin1_to_gsm.h"
Include dependency graph for charset.c:

Go to the source code of this file.
Data Structures | |
| struct | alias_t |
Defines | |
| #define | NRP '?' |
Typedefs | |
| typedef alias_t | alias_t |
Functions | |
| void | charset_init () |
| void | charset_shutdown () |
| void | charset_gsm_to_utf8 (Octstr *ostr) |
| void | charset_utf8_to_gsm (Octstr *ostr) |
| void | charset_gsm_to_latin1 (Octstr *ostr) |
| void | charset_latin1_to_gsm (Octstr *ostr) |
| void | charset_gsm_to_nrc_iso_21_german (Octstr *ostr) |
| void | charset_nrc_iso_21_german_to_gsm (Octstr *ostr) |
| int | charset_gsm_truncate (Octstr *gsm, long max) |
| int | charset_to_utf8 (Octstr *from, Octstr **to, Octstr *charset_from) |
| int | charset_from_utf8 (Octstr *utf8, Octstr **to, Octstr *charset_to) |
| int | charset_convert (Octstr *string, char *charset_from, char *charset_to) |
Variables | |
| struct { | |
| int gsmesc | |
| int latin1 | |
| } | gsm_esctolatin1 [] |
| struct { | |
| int gsmesc | |
| int unichar | |
| } | gsm_esctouni [] |
| const unsigned char | gsm_to_latin1 [128] |
| const int | gsm_to_unicode [128] |
| alias_t | chars_aliases [] |
|
|
|
|
|
|
|
||||||||||||||||
|
Definition at line 590 of file charset.c. References debug(), error(), octstr_append_data(), octstr_delete(), octstr_get_cstr, octstr_len(), and string. Referenced by charset_processing(), convert_addr_from_pdu(), data_sm_to_msg(), httpsmsc_send(), msg_to_pdu(), normalize_charset(), parse_attr_value(), parse_text(), pdu_to_msg(), soap_msgdata_attribute(), soap_msgdata_deps(), and soap_o2o_msgdata_attribute(). 00591 {
00592 #if HAVE_ICONV_H
00593 char *from_buf, *to_buf, *pointer;
00594 size_t inbytes, outbytes;
00595 int ret;
00596 iconv_t cd;
00597
00598 if (!charset_from || !charset_to || !string) /* sanity check */
00599 return -1;
00600
00601 cd = iconv_open(charset_to, charset_from);
00602 /* Did I succeed in getting a conversion descriptor ? */
00603 if (cd == (iconv_t)(-1)) {
00604 /* I guess not */
00605 error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.",
00606 charset_from, charset_to);
00607 return -1;
00608 }
00609 from_buf = octstr_get_cstr(string);
00610 /* allocate max sized buffer, assuming target encoding may be 4 byte unicode */
00611 inbytes = octstr_len(string);
00612 outbytes = sizeof(char) * octstr_len(string) * 4;
00613 pointer = to_buf = gw_malloc(outbytes + 1);
00614 memset(to_buf, 0, outbytes + 1);
00615 ret = iconv(cd, (char**)&from_buf, &inbytes, &pointer, &outbytes);
00616 iconv_close(cd);
00617 if (ret != -1) {
00618 /* conversion succeeded */
00619 octstr_delete(string, 0, octstr_len(string));
00620 octstr_append_data(string, to_buf, pointer - to_buf);
00621 if (ret)
00622 debug("charset", 0, "charset_convert did %d non-reversible conversions", ret);
00623 ret = 0;
00624 } else {
00625 error(0,"Failed to convert string from <%s> to <%s>, errno was <%d>",
00626 charset_from, charset_to, errno);
00627 }
00628
00629 if (errno == EILSEQ) {
00630 debug("charset_convert", 0, "Found an invalid multibyte sequence at position <%d>",
00631 from_buf - octstr_get_cstr(string));
00632 }
00633 gw_free(to_buf);
00634 return ret;
00635 #endif
00636 /* no convertion done due to not having iconv */
00637 return -1;
00638 }
|
Here is the call graph for this function:

|
||||||||||||||||
|
Definition at line 559 of file charset.c. References handler, octstr_create_from_data, octstr_get_cstr, and octstr_len(). Referenced by octstr_recode(). 00560 {
00561 int ret;
00562 xmlCharEncodingHandlerPtr handler = NULL;
00563 xmlBufferPtr frombuffer = NULL;
00564 xmlBufferPtr tobuffer = NULL;
00565
00566 handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
00567 if (handler == NULL)
00568 return -2;
00569
00570 /* Build the libxml buffers for the transcoding. */
00571 tobuffer = xmlBufferCreate();
00572 frombuffer = xmlBufferCreate();
00573 xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(utf8), octstr_len(utf8));
00574
00575 ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
00576 if (ret < -2)
00577 /* Libxml seems to be here a little uncertain what would be the
00578 * return code -3, so let's make it -1. Ugly thing, indeed. --tuo */
00579 ret = -1;
00580
00581 *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
00582
00583 /* Memory cleanup. */
00584 xmlBufferFree(tobuffer);
00585 xmlBufferFree(frombuffer);
00586
00587 return ret;
00588 }
|
Here is the call graph for this function:

|
|
Definition at line 395 of file charset.c. References gsm_esctolatin1, gsm_to_latin1, octstr_delete(), octstr_get_char(), octstr_len(), and octstr_set_char(). Referenced by main(). 00396 {
00397 long pos, len;
00398
00399 len = octstr_len(ostr);
00400 for (pos = 0; pos < len; pos++) {
00401 int c, new, i;
00402
00403 c = octstr_get_char(ostr, pos);
00404 if (c == 27 && pos + 1 < len) {
00405 /* GSM escape code. Delete it, then process the next
00406 * character specially. */
00407 octstr_delete(ostr, pos, 1);
00408 len--;
00409 c = octstr_get_char(ostr, pos);
00410 for (i = 0; gsm_esctolatin1[i].gsmesc >= 0; i++) {
00411 if (gsm_esctolatin1[i].gsmesc == c)
00412 break;
00413 }
00414 if (gsm_esctolatin1[i].gsmesc == c)
00415 new = gsm_esctolatin1[i].latin1;
00416 else if (c < 128)
00417 new = gsm_to_latin1[c];
00418 else
00419 continue;
00420 } else if (c < 128) {
00421 new = gsm_to_latin1[c];
00422 } else {
00423 continue;
00424 }
00425 if (new != c)
00426 octstr_set_char(ostr, pos, new);
00427 }
00428 }
|
Here is the call graph for this function:

|
|
Definition at line 461 of file charset.c. References octstr_get_char(), octstr_len(), and octstr_set_char(). Referenced by msg_to_emimsg(). 00462 {
00463 long pos, len;
00464 int c, new;
00465
00466 len = octstr_len(ostr);
00467
00468 for (pos = 0; pos < len; pos++) {
00469 c = octstr_get_char(ostr, pos);
00470 switch (c) {
00471 /* GSM value; NRC value */
00472 case 0x5b: new = 0x5b; break; /* Ä */
00473 case 0x5c: new = 0x5c; break; /* Ö */
00474 case 0x5e: new = 0x5d; break; /* Ü */
00475 case 0x7b: new = 0x7b; break; /* ä */
00476 case 0x7c: new = 0x7c; break; /* ö */
00477 case 0x7e: new = 0x7d; break; /* ü */
00478 case 0x1e: new = 0x7e; break; /* ß */
00479 case 0x5f: new = 0x5e; break; /* § */
00480 default: new = c;
00481 }
00482 if (new != c)
00483 octstr_set_char(ostr, pos, new);
00484 }
00485 }
|
Here is the call graph for this function:

|
|
Convert an octet string in GSM format to UTF-8. Every GSM character can be represented in Unicode, hence nothing will be lost. Escaped characters are translated into the appropriate UTF-8 characters. Definition at line 221 of file charset.c. References gsm_esctouni, gsm_to_unicode, octstr_append(), octstr_append_char(), octstr_create, octstr_destroy(), octstr_get_char(), octstr_len(), octstr_truncate(), and warning(). Referenced by at2_decode7bituncompressed(), cimd2_accept_message(), convert_addr_from_pdu(), data_sm_to_msg(), extract_msgdata_part_by_coding(), handle_operation(), main(), oisd_accept_message(), and pdu_to_msg(). 00222 {
00223 long pos, len;
00224 Octstr *newostr;
00225
00226 if (ostr == NULL)
00227 return;
00228
00229 newostr = octstr_create("");
00230 len = octstr_len(ostr);
00231
00232 for (pos = 0; pos < len; pos++) {
00233 int c, i;
00234
00235 c = octstr_get_char(ostr, pos);
00236 if (c > 127) {
00237 warning(0, "Could not convert GSM (0x%02x) to Unicode.", c);
00238 continue;
00239 }
00240
00241 if(c == 27 && pos + 1 < len) {
00242 c = octstr_get_char(ostr, ++pos);
00243 for (i = 0; gsm_esctouni[i].gsmesc >= 0; i++) {
00244 if (gsm_esctouni[i].gsmesc == c)
00245 break;
00246 }
00247 if (gsm_esctouni[i].gsmesc == c) {
00248 /* found a value for escaped char */
00249 c = gsm_esctouni[i].unichar;
00250 } else {
00251 /* nothing found, look esc in our table */
00252 c = gsm_to_unicode[27];
00253 pos--;
00254 }
00255 } else if (c < 128) {
00256 c = gsm_to_unicode[c];
00257 }
00258 /* unicode to utf-8 */
00259 if(c < 128) {
00260 /* 0-127 are ASCII chars that need no conversion */
00261 octstr_append_char(newostr, c);
00262 } else {
00263 /* test if it can be converterd into a two byte char */
00264 if(c < 0x0800) {
00265 octstr_append_char(newostr, ((c >> 6) | 0xC0) & 0xFF); /* add 110xxxxx */
00266 octstr_append_char(newostr, (c & 0x3F) | 0x80); /* add 10xxxxxx */
00267 } else {
00268 /* else we encode with 3 bytes. This only happens in case of euro symbol */
00269 octstr_append_char(newostr, ((c >> 12) | 0xE0) & 0xFF); /* add 1110xxxx */
00270 octstr_append_char(newostr, (((c >> 6) & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
00271 octstr_append_char(newostr, ((c & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
00272 }
00273 /* There are no 4 bytes encoded characters in GSM charset */
00274 }
00275 }
00276
00277 octstr_truncate(ostr, 0);
00278 octstr_append(ostr, newostr);
00279 octstr_destroy(newostr);
00280 }
|
Here is the call graph for this function:

|
||||||||||||
|
Definition at line 513 of file charset.c. References gsm, octstr_get_char(), octstr_len(), and octstr_truncate(). Referenced by extract_msgdata_part_by_coding(), msg_to_emimsg(), and packet_encode_message(). 00514 {
00515 if (octstr_len(gsm) > max) {
00516 /* If the last GSM character was an escaped character,
00517 * then chop off the escape as well as the character. */
00518 if (octstr_get_char(gsm, max - 1) == 27)
00519 octstr_truncate(gsm, max - 1);
00520 else
00521 octstr_truncate(gsm, max);
00522 return 1;
00523 }
00524 return 0;
00525 }
|
Here is the call graph for this function:

|
|
Definition at line 201 of file charset.c. References chars_aliases, and alias_t::real. Referenced by gwlib_init(). 00202 {
00203 int i;
00204
00205 for (i = 0; chars_aliases[i].real != NULL; i++) {
00206 xmlAddEncodingAlias(chars_aliases[i].real,chars_aliases[i].alias);
00207 /*debug("encoding",0,"Add encoding for %s",chars_aliases[i].alias);*/
00208 }
00209 }
|
|
|
Definition at line 431 of file charset.c. References gw_assert, octstr_get_char(), octstr_insert_data(), octstr_len(), and octstr_set_char(). 00432 {
00433 long pos, len;
00434 int c, new;
00435 unsigned char esc = 27;
00436
00437 len = octstr_len(ostr);
00438 for (pos = 0; pos < len; pos++) {
00439 c = octstr_get_char(ostr, pos);
00440 gw_assert(c >= 0);
00441 gw_assert(c <= 256);
00442 new = latin1_to_gsm[c];
00443 if (new < 0) {
00444 /* Escaped GSM code */
00445 octstr_insert_data(ostr, pos, (char*) &esc, 1);
00446 pos++;
00447 len++;
00448 new = -new;
00449 }
00450 if (new != c)
00451 octstr_set_char(ostr, pos, new);
00452 }
00453 }
|
Here is the call graph for this function:

|
|
Definition at line 487 of file charset.c. References octstr_get_char(), octstr_len(), and octstr_set_char(). Referenced by handle_operation(). 00488 {
00489 long pos, len;
00490 int c, new;
00491
00492 len = octstr_len(ostr);
00493
00494 for (pos = 0; pos < len; pos++) {
00495 c = octstr_get_char(ostr, pos);
00496 switch (c) {
00497 /* NRC value; GSM value */
00498 case 0x5b: new = 0x5b; break; /* Ä */
00499 case 0x5c: new = 0x5c; break; /* Ö */
00500 case 0x5d: new = 0x5e; break; /* Ü */
00501 case 0x7b: new = 0x7b; break; /* ä */
00502 case 0x7c: new = 0x7c; break; /* ö */
00503 case 0x7d: new = 0x7e; break; /* ü */
00504 case 0x7e: new = 0x1e; break; /* ß */
00505 case 0x5e: new = 0x5f; break; /* § */
00506 default: new = c;
00507 }
00508 if (new != c)
00509 octstr_set_char(ostr, pos, new);
00510 }
00511 }
|
Here is the call graph for this function:

|
|
Definition at line 211 of file charset.c. Referenced by gwlib_shutdown(). 00212 {
00213 xmlCleanupEncodingAliases();
00214 }
|
|
||||||||||||||||
|
Definition at line 527 of file charset.c. References handler, octstr_compare(), octstr_create_from_data, octstr_duplicate, octstr_get_cstr, octstr_imm(), and octstr_len(). Referenced by octstr_recode(), and set_charset(). 00528 {
00529 int ret;
00530 xmlCharEncodingHandlerPtr handler = NULL;
00531 xmlBufferPtr frombuffer = NULL;
00532 xmlBufferPtr tobuffer = NULL;
00533
00534 if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
00535 *to = octstr_duplicate(from);
00536 return 0;
00537 }
00538
00539 handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
00540 if (handler == NULL)
00541 return -2;
00542
00543 /* Build the libxml buffers for the transcoding. */
00544 tobuffer = xmlBufferCreate();
00545 frombuffer = xmlBufferCreate();
00546 xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(from), octstr_len(from));
00547
00548 ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);
00549
00550 *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
00551
00552 /* Memory cleanup. */
00553 xmlBufferFree(tobuffer);
00554 xmlBufferFree(frombuffer);
00555
00556 return ret;
00557 }
|
Here is the call graph for this function:

|
|
Convert an octet string in UTF-8 format to GSM 03.38. Because not every UTF-8 character can be converted to GSM 03.38, non-convertible characters are replaced with the NRP character (see the define above). Special characters are encoded as escape sequences. Incomplete UTF-8 characters at the end of the string are skipped. Definition at line 289 of file charset.c. References octstr_append(), octstr_append_char(), octstr_create, octstr_destroy(), octstr_get_char(), octstr_len(), octstr_truncate(), and warning(). Referenced by at2_pdu_encode(), extract_msgdata_part_by_coding(), main(), msg_to_emimsg(), msg_to_pdu(), ois_append_sm_text(), packet_encode_message(), and sms_msgdata_len(). 00290 {
00291 long pos, len;
00292 int val1, val2;
00293 Octstr *newostr;
00294
00295 if (ostr == NULL)
00296 return;
00297
00298 newostr = octstr_create("");
00299 len = octstr_len(ostr);
00300
00301 for (pos = 0; pos < len; pos++) {
00302 val1 = octstr_get_char(ostr, pos);
00303
00304 /* check range */
00305 if (val1 < 0 || val1 > 255) {
00306 warning(0, "Char (0x%02x) in UTF-8 string not in the range (0, 255). Skipped.", val1);
00307 continue;
00308 }
00309
00310 /* Convert UTF-8 to unicode code */
00311
00312 /* test if two byte utf8 char */
00313 if ((val1 & 0xE0) == 0xC0) {
00314 /* test if incomplete utf char */
00315 if(pos + 1 < len) {
00316 val2 = octstr_get_char(ostr, ++pos);
00317 val1 = (((val1 & ~0xC0) << 6) | (val2 & 0x3F));
00318 } else {
00319 /* incomplete, ignore it */
00320 warning(0, "Incomplete UTF-8 char discovered, skipped. 1");
00321 pos += 1;
00322 continue;
00323 }
00324 } else if ((val1 & 0xF0) == 0xE0) { /* test for three byte utf8 char */
00325 if(pos + 2 < len) {
00326 val2 = octstr_get_char(ostr, ++pos);
00327 val1 = (((val1 & ~0xE0) << 6) | (val2 & 0x3F));
00328 val2 = octstr_get_char(ostr, ++pos);
00329 val1 = (val1 << 6) | (val2 & 0x3F);
00330 } else {
00331 /* incomplete, ignore it */
00332 warning(0, "Incomplete UTF-8 char discovered, skipped. 2");
00333 pos += 2;
00334 continue;
00335 }
00336 }
00337
00338 /* test Latin code page 1 char */
00339 if(val1 <= 255) {
00340 val1 = latin1_to_gsm[val1];
00341 /* needs to be escaped ? */
00342 if(val1 < 0) {
00343 octstr_append_char(newostr, 27);
00344 val1 *= -1;
00345 }
00346 } else {
00347 /* Its not a Latin1 char, test for allowed GSM chars */
00348 switch(val1) {
00349 case 0x394:
00350 val1 = 0x10; /* GREEK CAPITAL LETTER DELTA */
00351 break;
00352 case 0x3A6:
00353 val1 = 0x12; /* GREEK CAPITAL LETTER PHI */
00354 break;
00355 case 0x393:
00356 val1 = 0x13; /* GREEK CAPITAL LETTER GAMMA */
00357 break;
00358 case 0x39B:
00359 val1 = 0x14; /* GREEK CAPITAL LETTER LAMBDA */
00360 break;
00361 case 0x3A9:
00362 val1 = 0x15; /* GREEK CAPITAL LETTER OMEGA */
00363 break;
00364 case 0x3A0:
00365 val1 = 0x16; /* GREEK CAPITAL LETTER PI */
00366 break;
00367 case 0x3A8:
00368 val1 = 0x17; /* GREEK CAPITAL LETTER PSI */
00369 break;
00370 case 0x3A3:
00371 val1 = 0x18; /* GREEK CAPITAL LETTER SIGMA */
00372 break;
00373 case 0x398:
00374 val1 = 0x19; /* GREEK CAPITAL LETTER THETA */
00375 break;
00376 case 0x39E:
00377 val1 = 0x1A; /* GREEK CAPITAL LETTER XI */
00378 break;
00379 case 0x20AC:
00380 val1 = 'e'; /* EURO SIGN */
00381 octstr_append_char(newostr, 27);
00382 break;
00383 default: val1 = NRP; /* character cannot be represented in GSM 03.38 */
00384 }
00385 }
00386 octstr_append_char(newostr, val1);
00387 }
00388
00389 octstr_truncate(ostr, 0);
00390 octstr_append(ostr, newostr);
00391 octstr_destroy(newostr);
00392 }
|
Here is the call graph for this function:

|
|
Initial value: {
{ "CP1250", "WIN-1250" },
{ "CP1250", "WINDOWS-1250" },
{ "CP1251", "WIN-1251" },
{ "CP1251", "WINDOWS-1251" },
{ "CP1252", "WIN-1252" },
{ "CP1252", "WINDOWS-1252" },
{ "CP1253", "WIN-1253" },
{ "CP1253", "WINDOWS-1253" },
{ "CP1254", "WIN-1254" },
{ "CP1254", "WINDOWS-1254" },
{ "CP1257", "WIN-1257" },
{ "CP1257", "WINDOWS-1257" },
{ NULL }
}
Definition at line 185 of file charset.c. Referenced by charset_init(). |
|
|
Referenced by charset_gsm_to_latin1(). |
|
|
Struct that maps escaped GSM characters to Unicode code points. Referenced by charset_gsm_to_utf8(). |
|
|
Initial value: {
'@', 0xa3, '$', 0xa5, 0xe8, 0xe9, 0xf9, 0xec,
0xf2, 0xc7, 10, 0xd8, 0xf8, 13, 0xc5, 0xe5,
'?', '_', '?', '?', '?', '?', '?', '?',
'?', '?', '?', ' ', 0xc6, 0xe6, 0xdf, 0xc9,
' ', '!', '"', '#', 0xa4, '%', '&', '\'',
'(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', '<', '=', '>', '?',
0xa1, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', 0xc4, 0xd6, 0xd1, 0xdc, 0xa7,
0xbf, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', 0xe4, 0xf6, 0xf1, 0xfc, 0xe0
}
Definition at line 128 of file charset.c. Referenced by charset_gsm_to_latin1(). |
|
|
Initial value: {
'@', 0xA3, '$', 0xA5, 0xE8, 0xE9, 0xF9, 0xEC,
0xF2, 0xC7, 10, 0xd8, 0xF8, 13, 0xC5, 0xE5,
0x394, '_', 0x3A6, 0x393, 0x39B, 0x3A9, 0x3A0, 0x3A8,
0x3A3, 0x398, 0x39E, NRP, 0xC6, 0xE6, 0xDF, 0xC9,
' ', '!', '"', '#', 0xA4, '%', '&', '\'',
'(', ')', '*', '+', ',', '-', '.', '/',
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', ':', ';', '<', '=', '>', '?',
0xA1, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
'X', 'Y', 'Z', 0xC4, 0xD6, 0xD1, 0xDC, 0xA7,
0xBF, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', 0xE4, 0xF6, 0xF1, 0xFC, 0xE0
}
Definition at line 154 of file charset.c. Referenced by charset_gsm_to_utf8(). |
|
|
|
|
|
Definition at line 85 of file charset.c. Referenced by convert_html_entity(), and ws_bc_encode(). |
|
|
|