00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065 #include "gwlib/gwlib.h"
00066
00067 #if HAVE_ICONV_H
00068 #include <errno.h>
00069 #include <iconv.h>
00070 #endif
00071
00072
00073 #define NRP '?'
00074
00075 #include "gwlib/latin1_to_gsm.h"
00076
00077
00078
00079
00080
00081
00082
/*
 * GSM escape sequences and their Latin-1 counterparts.
 *
 * `gsmesc` is the septet that follows the escape code (27) in a GSM
 * string; `latin1` is the single character it is converted to.
 * The list is terminated by an entry whose `gsmesc` is -1.
 */
static const struct {
    int gsmesc;
    int latin1;
} gsm_esctolatin1[] = {
    { 0x0A, 0x0C },     /* form feed */
    { 0x14, '^' },
    { 0x28, '{' },
    { 0x29, '}' },
    { 0x2F, '\\' },
    { 0x3C, '[' },
    { 0x3D, '~' },
    { 0x3E, ']' },
    { 0x40, '|' },
    { 0x65, 0x80 },     /* ESC 'e'; presumably the euro sign at its
                         * Windows-1252 position -- TODO confirm */
    { -1, -1 }          /* end of table */
};
00099
00100
/*
 * GSM escape sequences and their Unicode code points.
 *
 * `gsmesc` is the septet that follows the escape code (27) in a GSM
 * string; `unichar` is the code point it is converted to.
 * The list is terminated by an entry whose `gsmesc` is -1.
 */
static const struct {
    int gsmesc;
    int unichar;
} gsm_esctouni[] = {
    { 0x0A, 0x0C },     /* form feed */
    { 0x14, '^' },
    { 0x28, '{' },
    { 0x29, '}' },
    { 0x2F, '\\' },
    { 0x3C, '[' },
    { 0x3D, '~' },
    { 0x3E, ']' },
    { 0x40, '|' },
    { 0x65, 0x20AC },   /* ESC 'e' -> U+20AC, euro sign */
    { -1, -1 }          /* end of table */
};
00120
00121
00122
00123
00124
00125
00126
00127
/*
 * GSM septet (0..127) to Latin-1 lookup table, indexed by the GSM code.
 *
 * Positions 0x10 and 0x12..0x1A have no Latin-1 equivalent in this table
 * and are mapped to '?'.  Position 0x1B (the escape code) maps to a space.
 */
static const unsigned char gsm_to_latin1[128] = {
    /* 0x00 */ '@',  0xA3, '$',  0xA5, 0xE8, 0xE9, 0xF9, 0xEC,
    /* 0x08 */ 0xF2, 0xC7, 0x0A, 0xD8, 0xF8, 0x0D, 0xC5, 0xE5,
    /* 0x10 */ '?',  '_',  '?',  '?',  '?',  '?',  '?',  '?',
    /* 0x18 */ '?',  '?',  '?',  ' ',  0xC6, 0xE6, 0xDF, 0xC9,
    /* 0x20 */ ' ',  '!',  '"',  '#',  0xA4, '%',  '&',  '\'',
    /* 0x28 */ '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    /* 0x30 */ '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
    /* 0x38 */ '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
    /* 0x40 */ 0xA1, 'A',  'B',  'C',  'D',  'E',  'F',  'G',
    /* 0x48 */ 'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
    /* 0x50 */ 'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
    /* 0x58 */ 'X',  'Y',  'Z',  0xC4, 0xD6, 0xD1, 0xDC, 0xA7,
    /* 0x60 */ 0xBF, 'a',  'b',  'c',  'd',  'e',  'f',  'g',
    /* 0x68 */ 'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    /* 0x70 */ 'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    /* 0x78 */ 'x',  'y',  'z',  0xE4, 0xF6, 0xF1, 0xFC, 0xE0
};
00146
00147
00154 static const int gsm_to_unicode[128] = {
00155 '@', 0xA3, '$', 0xA5, 0xE8, 0xE9, 0xF9, 0xEC,
00156 0xF2, 0xC7, 10, 0xd8, 0xF8, 13, 0xC5, 0xE5,
00157 0x394, '_', 0x3A6, 0x393, 0x39B, 0x3A9, 0x3A0, 0x3A8,
00158 0x3A3, 0x398, 0x39E, NRP, 0xC6, 0xE6, 0xDF, 0xC9,
00159 ' ', '!', '"', '#', 0xA4, '%', '&', '\'',
00160 '(', ')', '*', '+', ',', '-', '.', '/',
00161 '0', '1', '2', '3', '4', '5', '6', '7',
00162 '8', '9', ':', ';', '<', '=', '>', '?',
00163 0xA1, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
00164 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
00165 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
00166 'X', 'Y', 'Z', 0xC4, 0xD6, 0xD1, 0xDC, 0xA7,
00167 0xBF, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
00168 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
00169 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
00170 'x', 'y', 'z', 0xE4, 0xF6, 0xF1, 0xFC, 0xE0
00171 };
00172
00173
00174
00175
00176
00177
/*
 * One encoding alias: `alias` is an additional name under which the
 * encoding called `real` should be found.
 */
typedef struct alias_t {
    char *real;
    char *alias;
} alias_t;

/*
 * Aliases registered by charset_init().  Each Windows code page gets a
 * "WIN-nnnn" and a "WINDOWS-nnnn" spelling.  The list ends with an entry
 * whose `real` member is NULL.
 */
alias_t chars_aliases[] = {
    /* real      alias                real      alias          */
    { "CP1250", "WIN-1250" },       { "CP1250", "WINDOWS-1250" },
    { "CP1251", "WIN-1251" },       { "CP1251", "WINDOWS-1251" },
    { "CP1252", "WIN-1252" },       { "CP1252", "WINDOWS-1252" },
    { "CP1253", "WIN-1253" },       { "CP1253", "WINDOWS-1253" },
    { "CP1254", "WIN-1254" },       { "CP1254", "WINDOWS-1254" },
    { "CP1257", "WIN-1257" },       { "CP1257", "WINDOWS-1257" },
    { NULL, NULL }
};
00200
00201 void charset_init()
00202 {
00203 int i;
00204
00205 for (i = 0; chars_aliases[i].real != NULL; i++) {
00206 xmlAddEncodingAlias(chars_aliases[i].real,chars_aliases[i].alias);
00207
00208 }
00209 }
00210
/*
 * Counterpart of charset_init(): drop all encoding aliases that were
 * registered with libxml2.
 */
void charset_shutdown()
{
    xmlCleanupEncodingAliases();
}
00215
00221 void charset_gsm_to_utf8(Octstr *ostr)
00222 {
00223 long pos, len;
00224 Octstr *newostr;
00225
00226 if (ostr == NULL)
00227 return;
00228
00229 newostr = octstr_create("");
00230 len = octstr_len(ostr);
00231
00232 for (pos = 0; pos < len; pos++) {
00233 int c, i;
00234
00235 c = octstr_get_char(ostr, pos);
00236 if (c > 127) {
00237 warning(0, "Could not convert GSM (0x%02x) to Unicode.", c);
00238 continue;
00239 }
00240
00241 if(c == 27 && pos + 1 < len) {
00242 c = octstr_get_char(ostr, ++pos);
00243 for (i = 0; gsm_esctouni[i].gsmesc >= 0; i++) {
00244 if (gsm_esctouni[i].gsmesc == c)
00245 break;
00246 }
00247 if (gsm_esctouni[i].gsmesc == c) {
00248
00249 c = gsm_esctouni[i].unichar;
00250 } else {
00251
00252 c = gsm_to_unicode[27];
00253 pos--;
00254 }
00255 } else if (c < 128) {
00256 c = gsm_to_unicode[c];
00257 }
00258
00259 if(c < 128) {
00260
00261 octstr_append_char(newostr, c);
00262 } else {
00263
00264 if(c < 0x0800) {
00265 octstr_append_char(newostr, ((c >> 6) | 0xC0) & 0xFF);
00266 octstr_append_char(newostr, (c & 0x3F) | 0x80);
00267 } else {
00268
00269 octstr_append_char(newostr, ((c >> 12) | 0xE0) & 0xFF);
00270 octstr_append_char(newostr, (((c >> 6) & 0x3F) | 0x80) & 0xFF);
00271 octstr_append_char(newostr, ((c & 0x3F) | 0x80) & 0xFF);
00272 }
00273
00274 }
00275 }
00276
00277 octstr_truncate(ostr, 0);
00278 octstr_append(ostr, newostr);
00279 octstr_destroy(newostr);
00280 }
00281
00289 void charset_utf8_to_gsm(Octstr *ostr)
00290 {
00291 long pos, len;
00292 int val1, val2;
00293 Octstr *newostr;
00294
00295 if (ostr == NULL)
00296 return;
00297
00298 newostr = octstr_create("");
00299 len = octstr_len(ostr);
00300
00301 for (pos = 0; pos < len; pos++) {
00302 val1 = octstr_get_char(ostr, pos);
00303
00304
00305 if (val1 < 0 || val1 > 255) {
00306 warning(0, "Char (0x%02x) in UTF-8 string not in the range (0, 255). Skipped.", val1);
00307 continue;
00308 }
00309
00310
00311
00312
00313 if ((val1 & 0xE0) == 0xC0) {
00314
00315 if(pos + 1 < len) {
00316 val2 = octstr_get_char(ostr, ++pos);
00317 val1 = (((val1 & ~0xC0) << 6) | (val2 & 0x3F));
00318 } else {
00319
00320 warning(0, "Incomplete UTF-8 char discovered, skipped. 1");
00321 pos += 1;
00322 continue;
00323 }
00324 } else if ((val1 & 0xF0) == 0xE0) {
00325 if(pos + 2 < len) {
00326 val2 = octstr_get_char(ostr, ++pos);
00327 val1 = (((val1 & ~0xE0) << 6) | (val2 & 0x3F));
00328 val2 = octstr_get_char(ostr, ++pos);
00329 val1 = (val1 << 6) | (val2 & 0x3F);
00330 } else {
00331
00332 warning(0, "Incomplete UTF-8 char discovered, skipped. 2");
00333 pos += 2;
00334 continue;
00335 }
00336 }
00337
00338
00339 if(val1 <= 255) {
00340 val1 = latin1_to_gsm[val1];
00341
00342 if(val1 < 0) {
00343 octstr_append_char(newostr, 27);
00344 val1 *= -1;
00345 }
00346 } else {
00347
00348 switch(val1) {
00349 case 0x394:
00350 val1 = 0x10;
00351 break;
00352 case 0x3A6:
00353 val1 = 0x12;
00354 break;
00355 case 0x393:
00356 val1 = 0x13;
00357 break;
00358 case 0x39B:
00359 val1 = 0x14;
00360 break;
00361 case 0x3A9:
00362 val1 = 0x15;
00363 break;
00364 case 0x3A0:
00365 val1 = 0x16;
00366 break;
00367 case 0x3A8:
00368 val1 = 0x17;
00369 break;
00370 case 0x3A3:
00371 val1 = 0x18;
00372 break;
00373 case 0x398:
00374 val1 = 0x19;
00375 break;
00376 case 0x39E:
00377 val1 = 0x1A;
00378 break;
00379 case 0x20AC:
00380 val1 = 'e';
00381 octstr_append_char(newostr, 27);
00382 break;
00383 default: val1 = NRP;
00384 }
00385 }
00386 octstr_append_char(newostr, val1);
00387 }
00388
00389 octstr_truncate(ostr, 0);
00390 octstr_append(ostr, newostr);
00391 octstr_destroy(newostr);
00392 }
00393
00394
00395 void charset_gsm_to_latin1(Octstr *ostr)
00396 {
00397 long pos, len;
00398
00399 len = octstr_len(ostr);
00400 for (pos = 0; pos < len; pos++) {
00401 int c, new, i;
00402
00403 c = octstr_get_char(ostr, pos);
00404 if (c == 27 && pos + 1 < len) {
00405
00406
00407 octstr_delete(ostr, pos, 1);
00408 len--;
00409 c = octstr_get_char(ostr, pos);
00410 for (i = 0; gsm_esctolatin1[i].gsmesc >= 0; i++) {
00411 if (gsm_esctolatin1[i].gsmesc == c)
00412 break;
00413 }
00414 if (gsm_esctolatin1[i].gsmesc == c)
00415 new = gsm_esctolatin1[i].latin1;
00416 else if (c < 128)
00417 new = gsm_to_latin1[c];
00418 else
00419 continue;
00420 } else if (c < 128) {
00421 new = gsm_to_latin1[c];
00422 } else {
00423 continue;
00424 }
00425 if (new != c)
00426 octstr_set_char(ostr, pos, new);
00427 }
00428 }
00429
00430
00431 void charset_latin1_to_gsm(Octstr *ostr)
00432 {
00433 long pos, len;
00434 int c, new;
00435 unsigned char esc = 27;
00436
00437 len = octstr_len(ostr);
00438 for (pos = 0; pos < len; pos++) {
00439 c = octstr_get_char(ostr, pos);
00440 gw_assert(c >= 0);
00441 gw_assert(c <= 256);
00442 new = latin1_to_gsm[c];
00443 if (new < 0) {
00444
00445 octstr_insert_data(ostr, pos, (char*) &esc, 1);
00446 pos++;
00447 len++;
00448 new = -new;
00449 }
00450 if (new != c)
00451 octstr_set_char(ostr, pos, new);
00452 }
00453 }
00454
00455
00456
00457
00458
00459
00460
00461 void charset_gsm_to_nrc_iso_21_german(Octstr *ostr)
00462 {
00463 long pos, len;
00464 int c, new;
00465
00466 len = octstr_len(ostr);
00467
00468 for (pos = 0; pos < len; pos++) {
00469 c = octstr_get_char(ostr, pos);
00470 switch (c) {
00471
00472 case 0x5b: new = 0x5b; break;
00473 case 0x5c: new = 0x5c; break;
00474 case 0x5e: new = 0x5d; break;
00475 case 0x7b: new = 0x7b; break;
00476 case 0x7c: new = 0x7c; break;
00477 case 0x7e: new = 0x7d; break;
00478 case 0x1e: new = 0x7e; break;
00479 case 0x5f: new = 0x5e; break;
00480 default: new = c;
00481 }
00482 if (new != c)
00483 octstr_set_char(ostr, pos, new);
00484 }
00485 }
00486
00487 void charset_nrc_iso_21_german_to_gsm(Octstr *ostr)
00488 {
00489 long pos, len;
00490 int c, new;
00491
00492 len = octstr_len(ostr);
00493
00494 for (pos = 0; pos < len; pos++) {
00495 c = octstr_get_char(ostr, pos);
00496 switch (c) {
00497
00498 case 0x5b: new = 0x5b; break;
00499 case 0x5c: new = 0x5c; break;
00500 case 0x5d: new = 0x5e; break;
00501 case 0x7b: new = 0x7b; break;
00502 case 0x7c: new = 0x7c; break;
00503 case 0x7d: new = 0x7e; break;
00504 case 0x7e: new = 0x1e; break;
00505 case 0x5e: new = 0x5f; break;
00506 default: new = c;
00507 }
00508 if (new != c)
00509 octstr_set_char(ostr, pos, new);
00510 }
00511 }
00512
00513 int charset_gsm_truncate(Octstr *gsm, long max)
00514 {
00515 if (octstr_len(gsm) > max) {
00516
00517
00518 if (octstr_get_char(gsm, max - 1) == 27)
00519 octstr_truncate(gsm, max - 1);
00520 else
00521 octstr_truncate(gsm, max);
00522 return 1;
00523 }
00524 return 0;
00525 }
00526
00527 int charset_to_utf8(Octstr *from, Octstr **to, Octstr *charset_from)
00528 {
00529 int ret;
00530 xmlCharEncodingHandlerPtr handler = NULL;
00531 xmlBufferPtr frombuffer = NULL;
00532 xmlBufferPtr tobuffer = NULL;
00533
00534 if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
00535 *to = octstr_duplicate(from);
00536 return 0;
00537 }
00538
00539 handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
00540 if (handler == NULL)
00541 return -2;
00542
00543
00544 tobuffer = xmlBufferCreate();
00545 frombuffer = xmlBufferCreate();
00546 xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(from), octstr_len(from));
00547
00548 ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);
00549
00550 *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
00551
00552
00553 xmlBufferFree(tobuffer);
00554 xmlBufferFree(frombuffer);
00555
00556 return ret;
00557 }
00558
00559 int charset_from_utf8(Octstr *utf8, Octstr **to, Octstr *charset_to)
00560 {
00561 int ret;
00562 xmlCharEncodingHandlerPtr handler = NULL;
00563 xmlBufferPtr frombuffer = NULL;
00564 xmlBufferPtr tobuffer = NULL;
00565
00566 handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
00567 if (handler == NULL)
00568 return -2;
00569
00570
00571 tobuffer = xmlBufferCreate();
00572 frombuffer = xmlBufferCreate();
00573 xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(utf8), octstr_len(utf8));
00574
00575 ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
00576 if (ret < -2)
00577
00578
00579 ret = -1;
00580
00581 *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
00582
00583
00584 xmlBufferFree(tobuffer);
00585 xmlBufferFree(frombuffer);
00586
00587 return ret;
00588 }
00589
00590 int charset_convert(Octstr* string, char* charset_from, char* charset_to)
00591 {
00592 #if HAVE_ICONV_H
00593 char *from_buf, *to_buf, *pointer;
00594 size_t inbytes, outbytes;
00595 int ret;
00596 iconv_t cd;
00597
00598 if (!charset_from || !charset_to || !string)
00599 return -1;
00600
00601 cd = iconv_open(charset_to, charset_from);
00602
00603 if (cd == (iconv_t)(-1)) {
00604
00605 error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.",
00606 charset_from, charset_to);
00607 return -1;
00608 }
00609 from_buf = octstr_get_cstr(string);
00610
00611 inbytes = octstr_len(string);
00612 outbytes = sizeof(char) * octstr_len(string) * 4;
00613 pointer = to_buf = gw_malloc(outbytes + 1);
00614 memset(to_buf, 0, outbytes + 1);
00615 ret = iconv(cd, (char**)&from_buf, &inbytes, &pointer, &outbytes);
00616 iconv_close(cd);
00617 if (ret != -1) {
00618
00619 octstr_delete(string, 0, octstr_len(string));
00620 octstr_append_data(string, to_buf, pointer - to_buf);
00621 if (ret)
00622 debug("charset", 0, "charset_convert did %d non-reversible conversions", ret);
00623 ret = 0;
00624 } else {
00625 error(0,"Failed to convert string from <%s> to <%s>, errno was <%d>",
00626 charset_from, charset_to, errno);
00627 }
00628
00629 if (errno == EILSEQ) {
00630 debug("charset_convert", 0, "Found an invalid multibyte sequence at position <%d>",
00631 from_buf - octstr_get_cstr(string));
00632 }
00633 gw_free(to_buf);
00634 return ret;
00635 #endif
00636
00637 return -1;
00638 }
See file LICENSE for details about the license agreement for using,
modifying, copying or deriving work from this software.