Main Page | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals

charset.c

Go to the documentation of this file.
00001 /* ==================================================================== 
00002  * The Kannel Software License, Version 1.0 
00003  * 
00004  * Copyright (c) 2001-2008 Kannel Group  
00005  * Copyright (c) 1998-2001 WapIT Ltd.   
00006  * All rights reserved. 
00007  * 
00008  * Redistribution and use in source and binary forms, with or without 
00009  * modification, are permitted provided that the following conditions 
00010  * are met: 
00011  * 
00012  * 1. Redistributions of source code must retain the above copyright 
00013  *    notice, this list of conditions and the following disclaimer. 
00014  * 
00015  * 2. Redistributions in binary form must reproduce the above copyright 
00016  *    notice, this list of conditions and the following disclaimer in 
00017  *    the documentation and/or other materials provided with the 
00018  *    distribution. 
00019  * 
00020  * 3. The end-user documentation included with the redistribution, 
00021  *    if any, must include the following acknowledgment: 
00022  *       "This product includes software developed by the 
00023  *        Kannel Group (http://www.kannel.org/)." 
00024  *    Alternately, this acknowledgment may appear in the software itself, 
00025  *    if and wherever such third-party acknowledgments normally appear. 
00026  * 
00027  * 4. The names "Kannel" and "Kannel Group" must not be used to 
00028  *    endorse or promote products derived from this software without 
00029  *    prior written permission. For written permission, please  
00030  *    contact org@kannel.org. 
00031  * 
00032  * 5. Products derived from this software may not be called "Kannel", 
00033  *    nor may "Kannel" appear in their name, without prior written 
00034  *    permission of the Kannel Group. 
00035  * 
00036  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 
00037  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
00038  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
00039  * DISCLAIMED.  IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS 
00040  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,  
00041  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT  
00042  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR  
00043  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,  
00044  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE  
00045  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  
00046  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00047  * ==================================================================== 
00048  * 
00049  * This software consists of voluntary contributions made by many 
00050  * individuals on behalf of the Kannel Group.  For more information on  
00051  * the Kannel Group, please see <http://www.kannel.org/>. 
00052  * 
00053  * Portions of this software are based upon software originally written at  
00054  * WapIT Ltd., Helsinki, Finland for the Kannel project.  
00055  */ 
00056 
00057 /*
00058  * gwlib/charset.c - character set conversions
00059  *
00060  * This file implements the character set conversions declared in charset.h.
00061  *
00062  * Richard Braakman
00063  */
00064 
00065 #include "gwlib/gwlib.h"
00066 
00067 #if HAVE_ICONV_H
00068 #include <errno.h>
00069 #include <iconv.h>
00070 #endif
00071 
00072 /* Code used for non-representable characters */
00073 #define NRP '?'
00074 
00075 #include "gwlib/latin1_to_gsm.h"
00076 
00077 
/*
 * Extension table from GSM 03.38, Latin-1 flavour.  Each entry gives, for
 * the code that follows a GSM escape (27), the character it stands for.
 * Codes that are not listed are handled by the code that searches this
 * table.  The list is terminated by a { -1, -1 } sentinel.
 *
 * NOTE(review): escaped 'e' (0x65, the euro sign) is mapped to 128, which
 * is not a printable ISO Latin-1 character; presumably this is the
 * Windows-1252 euro position -- confirm before relying on it.
 */
static const struct {
    int gsmesc;
    int latin1;
} gsm_esctolatin1[] = {
    { 0x0A, '\f' },   /* ASCII page break (form feed) */
    { 0x14, '^'  },
    { 0x28, '{'  },
    { 0x29, '}'  },
    { 0x2F, '\\' },
    { 0x3C, '['  },
    { 0x3D, '~'  },
    { 0x3E, ']'  },
    { 0x40, '|'  },
    { 0x65, 0x80 },   /* escaped 'e' -> 128, see note above */
    { -1, -1 }        /* end of table */
};
00099 
00100 
/*
 * Extension table from GSM 03.38, Unicode flavour.  Each entry gives, for
 * the code that follows a GSM escape (27), the Unicode code point it
 * stands for.  Unlike the Latin-1 table this one carries the real euro
 * sign (U+20AC).  The list is terminated by a { -1, -1 } sentinel.
 */
static const struct {
    int gsmesc;
    int unichar;
} gsm_esctouni[] = {
    { 0x0A, '\f'   },   /* ASCII page break (form feed) */
    { 0x14, '^'    },
    { 0x28, '{'    },
    { 0x29, '}'    },
    { 0x2F, '\\'   },
    { 0x3C, '['    },
    { 0x3D, '~'    },
    { 0x3E, ']'    },
    { 0x40, '|'    },
    { 0x65, 0x20AC },   /* escaped 'e': euro symbol */
    { -1, -1 }          /* end of table */
};
00120 
00121 
/*
 * GSM default alphabet -> ISO Latin-1, indexed by GSM code (0..127).
 *
 * The Greek capitals at positions 16 and 18..26 have no Latin-1
 * counterpart and are stored as '?'.  Position 27 is the GSM escape; it
 * is stored as a space, although the code that indexes this table
 * normally handles the escape itself before looking it up.
 */
static const unsigned char gsm_to_latin1[128] = {
    /* 0x00 */  '@', 0xA3,  '$', 0xA5, 0xE8, 0xE9, 0xF9, 0xEC,
    /* 0x08 */ 0xF2, 0xC7, '\n', 0xD8, 0xF8, '\r', 0xC5, 0xE5,
    /* 0x10 */  '?',  '_',  '?',  '?',  '?',  '?',  '?',  '?',
    /* 0x18 */  '?',  '?',  '?',  ' ', 0xC6, 0xE6, 0xDF, 0xC9,
    /* 0x20 */  ' ',  '!',  '"',  '#', 0xA4,  '%',  '&', '\'',
    /* 0x28 */  '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    /* 0x30 */  '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
    /* 0x38 */  '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
    /* 0x40 */ 0xA1,  'A',  'B',  'C',  'D',  'E',  'F',  'G',
    /* 0x48 */  'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
    /* 0x50 */  'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
    /* 0x58 */  'X',  'Y',  'Z', 0xC4, 0xD6, 0xD1, 0xDC, 0xA7,
    /* 0x60 */ 0xBF,  'a',  'b',  'c',  'd',  'e',  'f',  'g',
    /* 0x68 */  'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    /* 0x70 */  'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    /* 0x78 */  'x',  'y',  'z', 0xE4, 0xF6, 0xF1, 0xFC, 0xE0
};
00146 
00147 
00154 static const int gsm_to_unicode[128] = {
00155       '@',  0xA3,   '$',  0xA5,  0xE8,  0xE9,  0xF9,  0xEC,   /* 0 - 7 */
00156      0xF2,  0xC7,    10,  0xd8,  0xF8,    13,  0xC5,  0xE5,   /* 8 - 15 */
00157     0x394,   '_', 0x3A6, 0x393, 0x39B, 0x3A9, 0x3A0, 0x3A8,   /* 16 - 23 */
00158     0x3A3, 0x398, 0x39E,   NRP,  0xC6,  0xE6,  0xDF,  0xC9,   /* 24 - 31 */
00159       ' ',   '!',   '"',   '#',  0xA4,   '%',   '&',  '\'',   /* 32 - 39 */
00160       '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',   /* 40 - 47 */
00161       '0',   '1',   '2',   '3',   '4',   '5',   '6',   '7',   /* 48 - 55 */
00162       '8',   '9',   ':',   ';',   '<',   '=',   '>',   '?',   /* 56 - 63 */
00163       0xA1,  'A',   'B',   'C',   'D',   'E',   'F',   'G',   /* 64 - 71 */
00164       'H',   'I',   'J',   'K',   'L',   'M',   'N',   'O',   /* 73 - 79 */
00165       'P',   'Q',   'R',   'S',   'T',   'U',   'V',   'W',   /* 80 - 87 */
00166       'X',   'Y',   'Z',  0xC4,  0xD6,  0xD1,  0xDC,  0xA7,   /* 88 - 95 */
00167      0xBF,   'a',   'b',   'c',   'd',   'e',   'f',   'g',   /* 96 - 103 */
00168       'h',   'i',   'j',   'k',   'l',   'm',   'n',   'o',   /* 104 - 111 */
00169       'p',   'q',   'r',   's',   't',   'u',   'v',   'w',   /* 112 - 119 */
00170       'x',   'y',   'z',  0xE4,  0xF6,  0xF1,  0xFC,  0xE0    /* 120 - 127 */
00171 };
00172 
/*
 * Aliases for Windows character sets.  They are registered with libxml
 * (see charset_init) so that libxml/libiconv can recognise these
 * spellings as names of the corresponding CPxxxx character set.
 */
typedef struct alias_t {
    char *real;     /* name the character set is known under */
    char *alias;    /* additional spelling to accept for it */
} alias_t;

/* Terminated by an entry whose members are NULL. */
alias_t chars_aliases[] = {
    { "CP1250", "WIN-1250" }, { "CP1250", "WINDOWS-1250" },
    { "CP1251", "WIN-1251" }, { "CP1251", "WINDOWS-1251" },
    { "CP1252", "WIN-1252" }, { "CP1252", "WINDOWS-1252" },
    { "CP1253", "WIN-1253" }, { "CP1253", "WINDOWS-1253" },
    { "CP1254", "WIN-1254" }, { "CP1254", "WINDOWS-1254" },
    { "CP1257", "WIN-1257" }, { "CP1257", "WINDOWS-1257" },
    { NULL, NULL }
};
00200 
00201 void charset_init()
00202 {
00203     int i;
00204 
00205     for (i = 0; chars_aliases[i].real != NULL; i++) {
00206       xmlAddEncodingAlias(chars_aliases[i].real,chars_aliases[i].alias);
00207       /*debug("encoding",0,"Add encoding for %s",chars_aliases[i].alias);*/
00208     }
00209 }
00210 
/*
 * Counterpart of charset_init(): drops the encoding aliases again by
 * calling xmlCleanupEncodingAliases().
 */
void charset_shutdown(void)
{
    xmlCleanupEncodingAliases();
}
00215 
00221 void charset_gsm_to_utf8(Octstr *ostr)
00222 {
00223     long pos, len;
00224     Octstr *newostr;
00225 
00226     if (ostr == NULL)
00227         return;
00228 
00229     newostr = octstr_create("");
00230     len = octstr_len(ostr);
00231     
00232     for (pos = 0; pos < len; pos++) {
00233         int c, i;
00234         
00235         c = octstr_get_char(ostr, pos);
00236         if (c > 127) {
00237             warning(0, "Could not convert GSM (0x%02x) to Unicode.", c);
00238             continue;
00239         }
00240         
00241         if(c == 27 && pos + 1 < len) {
00242             c = octstr_get_char(ostr, ++pos);
00243             for (i = 0; gsm_esctouni[i].gsmesc >= 0; i++) {
00244                 if (gsm_esctouni[i].gsmesc == c)
00245                     break;
00246             }   
00247             if (gsm_esctouni[i].gsmesc == c) {
00248                 /* found a value for escaped char */
00249                 c = gsm_esctouni[i].unichar;
00250             } else {
00251             /* nothing found, look esc in our table */
00252         c = gsm_to_unicode[27];
00253                 pos--;
00254         }
00255         } else if (c < 128) {
00256             c = gsm_to_unicode[c];
00257         }
00258         /* unicode to utf-8 */
00259         if(c < 128) {
00260             /* 0-127 are ASCII chars that need no conversion */
00261             octstr_append_char(newostr, c);
00262         } else { 
00263             /* test if it can be converterd into a two byte char */
00264             if(c < 0x0800) {
00265                 octstr_append_char(newostr, ((c >> 6) | 0xC0) & 0xFF); /* add 110xxxxx */
00266                 octstr_append_char(newostr, (c & 0x3F) | 0x80); /* add 10xxxxxx */
00267             } else {
00268                 /* else we encode with 3 bytes. This only happens in case of euro symbol */
00269                 octstr_append_char(newostr, ((c >> 12) | 0xE0) & 0xFF); /* add 1110xxxx */
00270                 octstr_append_char(newostr, (((c >> 6) & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
00271                 octstr_append_char(newostr, ((c  & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
00272             }
00273             /* There are no 4 bytes encoded characters in GSM charset */
00274         }
00275     }
00276 
00277     octstr_truncate(ostr, 0);
00278     octstr_append(ostr, newostr);
00279     octstr_destroy(newostr);
00280 }
00281 
00289 void charset_utf8_to_gsm(Octstr *ostr)
00290 {
00291     long pos, len;
00292     int val1, val2;
00293     Octstr *newostr;
00294 
00295     if (ostr == NULL)
00296         return;
00297     
00298     newostr = octstr_create("");
00299     len = octstr_len(ostr);
00300     
00301     for (pos = 0; pos < len; pos++) {
00302         val1 = octstr_get_char(ostr, pos);
00303         
00304         /* check range */
00305         if (val1 < 0 || val1 > 255) {
00306             warning(0, "Char (0x%02x) in UTF-8 string not in the range (0, 255). Skipped.", val1);
00307             continue;
00308         }
00309         
00310         /* Convert UTF-8 to unicode code */
00311         
00312         /* test if two byte utf8 char */
00313         if ((val1 & 0xE0) == 0xC0) {
00314             /* test if incomplete utf char */
00315             if(pos + 1 < len) {
00316                 val2 = octstr_get_char(ostr, ++pos);
00317                 val1 = (((val1 & ~0xC0) << 6) | (val2 & 0x3F));
00318             } else {
00319                 /* incomplete, ignore it */
00320                 warning(0, "Incomplete UTF-8 char discovered, skipped. 1");
00321                 pos += 1;
00322                 continue;
00323             }
00324         } else if ((val1 & 0xF0) == 0xE0) { /* test for three byte utf8 char */
00325             if(pos + 2 < len) {
00326                 val2 = octstr_get_char(ostr, ++pos);
00327                 val1 = (((val1 & ~0xE0) << 6) | (val2 & 0x3F));
00328                 val2 = octstr_get_char(ostr, ++pos);
00329                 val1 = (val1 << 6) | (val2 & 0x3F);
00330             } else {
00331                 /* incomplete, ignore it */
00332                 warning(0, "Incomplete UTF-8 char discovered, skipped. 2");
00333                 pos += 2;
00334                 continue;
00335             }
00336         }
00337 
00338         /* test Latin code page 1 char */
00339         if(val1 <= 255) {
00340             val1 = latin1_to_gsm[val1];
00341             /* needs to be escaped ? */
00342             if(val1 < 0) {
00343                 octstr_append_char(newostr, 27);
00344                 val1 *= -1;
00345             }
00346         } else {
00347             /* Its not a Latin1 char, test for allowed GSM chars */
00348             switch(val1) {
00349             case 0x394:
00350                 val1 = 0x10; /* GREEK CAPITAL LETTER DELTA */
00351                 break;
00352             case 0x3A6:
00353                 val1 = 0x12; /* GREEK CAPITAL LETTER PHI */
00354                 break;
00355             case 0x393:
00356                 val1 = 0x13; /* GREEK CAPITAL LETTER GAMMA */
00357                 break;
00358             case 0x39B:
00359                 val1 = 0x14; /* GREEK CAPITAL LETTER LAMBDA */
00360                 break;
00361             case 0x3A9:
00362                 val1 = 0x15; /* GREEK CAPITAL LETTER OMEGA */
00363                 break;
00364             case 0x3A0:
00365                 val1 = 0x16; /* GREEK CAPITAL LETTER PI */
00366                 break;
00367             case 0x3A8:
00368                 val1 = 0x17; /* GREEK CAPITAL LETTER PSI */
00369                 break;
00370             case 0x3A3:
00371                 val1 = 0x18; /* GREEK CAPITAL LETTER SIGMA */
00372                 break;
00373             case 0x398:
00374                 val1 = 0x19; /* GREEK CAPITAL LETTER THETA */
00375                 break;
00376             case 0x39E:
00377                 val1 = 0x1A; /* GREEK CAPITAL LETTER XI */
00378                 break;
00379             case 0x20AC:
00380                 val1 = 'e'; /* EURO SIGN */
00381                 octstr_append_char(newostr, 27);
00382                 break;
00383             default: val1 = NRP; /* character cannot be represented in GSM 03.38 */
00384             }
00385         }
00386         octstr_append_char(newostr, val1);
00387     }
00388 
00389     octstr_truncate(ostr, 0);
00390     octstr_append(ostr, newostr);
00391     octstr_destroy(newostr);
00392 }
00393 
00394 
00395 void charset_gsm_to_latin1(Octstr *ostr)
00396 {
00397     long pos, len;
00398 
00399     len = octstr_len(ostr);
00400     for (pos = 0; pos < len; pos++) {
00401     int c, new, i;
00402 
00403     c = octstr_get_char(ostr, pos);
00404     if (c == 27 && pos + 1 < len) {
00405         /* GSM escape code.  Delete it, then process the next
00406              * character specially. */
00407         octstr_delete(ostr, pos, 1);
00408         len--;
00409         c = octstr_get_char(ostr, pos);
00410         for (i = 0; gsm_esctolatin1[i].gsmesc >= 0; i++) {
00411         if (gsm_esctolatin1[i].gsmesc == c)
00412             break;
00413         }
00414         if (gsm_esctolatin1[i].gsmesc == c)
00415         new = gsm_esctolatin1[i].latin1;
00416         else if (c < 128)
00417         new = gsm_to_latin1[c];
00418         else
00419         continue;
00420     } else if (c < 128) {
00421             new = gsm_to_latin1[c];
00422     } else {
00423         continue;
00424     }
00425     if (new != c)
00426         octstr_set_char(ostr, pos, new);
00427     }
00428 }
00429 
00430 
00431 void charset_latin1_to_gsm(Octstr *ostr)
00432 {
00433     long pos, len;
00434     int c, new;
00435     unsigned char esc = 27;
00436 
00437     len = octstr_len(ostr);
00438     for (pos = 0; pos < len; pos++) {
00439     c = octstr_get_char(ostr, pos);
00440     gw_assert(c >= 0);
00441     gw_assert(c <= 256);
00442     new = latin1_to_gsm[c];
00443     if (new < 0) {
00444          /* Escaped GSM code */
00445         octstr_insert_data(ostr, pos, (char*) &esc, 1);
00446         pos++;
00447         len++;
00448         new = -new;
00449     }
00450     if (new != c)
00451         octstr_set_char(ostr, pos, new);
00452     }
00453 }
00454 
00455 
00456 /*
00457  * This function is a wrapper arround charset_latin1_to_gsm()
00458  * which implements the mapping of a NRCs (national reprentation codes)
00459  * ISO 21 German.
00460  */
00461 void charset_gsm_to_nrc_iso_21_german(Octstr *ostr)
00462 {
00463     long pos, len;
00464     int c, new;
00465 
00466     len = octstr_len(ostr);
00467     
00468     for (pos = 0; pos < len; pos++) {
00469         c = octstr_get_char(ostr, pos);
00470         switch (c) {
00471             /* GSM value; NRC value */
00472             case 0x5b: new = 0x5b; break; /* Ä */
00473             case 0x5c: new = 0x5c; break; /* Ö */
00474             case 0x5e: new = 0x5d; break; /* Ü */
00475             case 0x7b: new = 0x7b; break; /* ä */
00476             case 0x7c: new = 0x7c; break; /* ö */
00477             case 0x7e: new = 0x7d; break; /* ü */
00478             case 0x1e: new = 0x7e; break; /* ß */
00479             case 0x5f: new = 0x5e; break; /* § */
00480             default: new = c;
00481         }
00482         if (new != c)
00483             octstr_set_char(ostr, pos, new);
00484     }
00485 }
00486 
00487 void charset_nrc_iso_21_german_to_gsm(Octstr *ostr)
00488 {
00489     long pos, len;
00490     int c, new;
00491 
00492     len = octstr_len(ostr);
00493 
00494     for (pos = 0; pos < len; pos++) {
00495         c = octstr_get_char(ostr, pos);
00496         switch (c) {
00497             /* NRC value; GSM value */
00498             case 0x5b: new = 0x5b; break; /* Ä */
00499             case 0x5c: new = 0x5c; break; /* Ö */
00500             case 0x5d: new = 0x5e; break; /* Ü */
00501             case 0x7b: new = 0x7b; break; /* ä */
00502             case 0x7c: new = 0x7c; break; /* ö */
00503             case 0x7d: new = 0x7e; break; /* ü */
00504             case 0x7e: new = 0x1e; break; /* ß */
00505             case 0x5e: new = 0x5f; break; /* § */
00506             default: new = c;
00507         }
00508         if (new != c)
00509             octstr_set_char(ostr, pos, new);
00510     }
00511 }
00512 
00513 int charset_gsm_truncate(Octstr *gsm, long max)
00514 {
00515     if (octstr_len(gsm) > max) {
00516     /* If the last GSM character was an escaped character,
00517      * then chop off the escape as well as the character. */
00518     if (octstr_get_char(gsm, max - 1) == 27)
00519         octstr_truncate(gsm, max - 1);
00520     else
00521         octstr_truncate(gsm, max);
00522     return 1;
00523     }
00524     return 0;
00525 }
00526 
00527 int charset_to_utf8(Octstr *from, Octstr **to, Octstr *charset_from)
00528 {
00529     int ret;
00530     xmlCharEncodingHandlerPtr handler = NULL;
00531     xmlBufferPtr frombuffer = NULL;
00532     xmlBufferPtr tobuffer = NULL;
00533 
00534     if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
00535         *to = octstr_duplicate(from);
00536         return 0;
00537     }
00538 
00539     handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
00540     if (handler == NULL)
00541     return -2;
00542 
00543     /* Build the libxml buffers for the transcoding. */
00544     tobuffer = xmlBufferCreate();
00545     frombuffer = xmlBufferCreate();
00546     xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(from), octstr_len(from));
00547 
00548     ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);
00549 
00550     *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
00551 
00552     /* Memory cleanup. */
00553     xmlBufferFree(tobuffer);
00554     xmlBufferFree(frombuffer);
00555 
00556     return ret;
00557 }
00558 
00559 int charset_from_utf8(Octstr *utf8, Octstr **to, Octstr *charset_to)
00560 {
00561     int ret;
00562     xmlCharEncodingHandlerPtr handler = NULL;
00563     xmlBufferPtr frombuffer = NULL;
00564     xmlBufferPtr tobuffer = NULL;
00565 
00566     handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
00567     if (handler == NULL)
00568     return -2;
00569 
00570     /* Build the libxml buffers for the transcoding. */
00571     tobuffer = xmlBufferCreate();
00572     frombuffer = xmlBufferCreate();
00573     xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(utf8), octstr_len(utf8));
00574 
00575     ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
00576     if (ret < -2)
00577     /* Libxml seems to be here a little uncertain what would be the 
00578      * return code -3, so let's make it -1. Ugly thing, indeed. --tuo */
00579     ret = -1; 
00580 
00581     *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
00582 
00583     /* Memory cleanup. */
00584     xmlBufferFree(tobuffer);
00585     xmlBufferFree(frombuffer);
00586 
00587     return ret;
00588 }
00589 
00590 int charset_convert(Octstr* string, char* charset_from, char* charset_to)
00591 {
00592 #if HAVE_ICONV_H
00593     char *from_buf, *to_buf, *pointer;
00594     size_t inbytes, outbytes;
00595     int ret;
00596     iconv_t cd;
00597      
00598     if (!charset_from || !charset_to || !string) /* sanity check */
00599          return -1;
00600          
00601     cd = iconv_open(charset_to, charset_from);
00602     /* Did I succeed in getting a conversion descriptor ? */
00603     if (cd == (iconv_t)(-1)) {
00604         /* I guess not */
00605         error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.", 
00606               charset_from, charset_to);
00607         return -1; 
00608     }
00609     from_buf = octstr_get_cstr(string);
00610     /* allocate max sized buffer, assuming target encoding may be 4 byte unicode */
00611     inbytes = octstr_len(string);
00612     outbytes = sizeof(char) * octstr_len(string) * 4;
00613     pointer = to_buf = gw_malloc(outbytes + 1);
00614     memset(to_buf, 0, outbytes + 1);
00615     ret = iconv(cd, (char**)&from_buf, &inbytes, &pointer, &outbytes);
00616     iconv_close(cd);
00617     if (ret != -1) {
00618         /* conversion succeeded */
00619         octstr_delete(string, 0, octstr_len(string));
00620         octstr_append_data(string, to_buf, pointer - to_buf);
00621     if (ret)
00622         debug("charset", 0, "charset_convert did %d non-reversible conversions", ret);
00623         ret = 0;
00624     } else {
00625         error(0,"Failed to convert string from <%s> to <%s>, errno was <%d>",
00626               charset_from, charset_to, errno);
00627     }
00628 
00629     if (errno == EILSEQ) {
00630         debug("charset_convert", 0, "Found an invalid multibyte sequence at position <%d>",
00631               from_buf - octstr_get_cstr(string));     
00632     }
00633     gw_free(to_buf);
00634     return ret;
00635 #endif
00636     /* no convertion done due to not having iconv */
00637     return -1;
00638 }
See file LICENSE for details about the license agreement for using, modifying, copying or deriving work from this software.