#include <libxml/encoding.h>
#include <libxml/tree.h>

Functions
void	charset_init (void)

void	charset_shutdown (void)

void	charset_gsm_to_utf8 (Octstr *ostr)

void	charset_utf8_to_gsm (Octstr *ostr)

void	charset_gsm_to_nrc_iso_21_german (Octstr *ostr)

void	charset_nrc_iso_21_german_to_gsm (Octstr *ostr)

int	charset_gsm_truncate (Octstr *gsm, long max)

void	charset_gsm_to_latin1 (Octstr *gsm)

void	charset_latin1_to_gsm (Octstr *latin1)

int	charset_to_utf8 (Octstr *from, Octstr **to, Octstr *charset_from)

int	charset_from_utf8 (Octstr *utf8, Octstr **to, Octstr *charset_to)

int	charset_convert (Octstr *string, char *charset_from, char *charset_to)

Function Documentation

◆ charset_convert()

int charset_convert	(	Octstr *	string,
		char *	charset_from,
		char *	charset_to
	)

Definition at line 589 of file charset.c.

References debug(), error(), octstr_append_data(), octstr_get_cstr, octstr_len(), octstr_truncate(), and warning().

Referenced by charset_processing(), convert_addr_from_pdu(), convert_charset(), data_sm_to_msg(), handle_mo_dcs(), handle_mt_dcs(), httpsmsc_send(), init_batch(), msg_to_pdu(), normalize_charset(), obey_request_thread(), parse_attr_value(), parse_text(), pdu_to_msg(), sms_charset_processing(), soap_msgdata_attribute(), soap_msgdata_deps(), and soap_o2o_msgdata_attribute().

 {
 #if HAVE_ICONV
     char *from_buf, *to_buf, *pointer;
     size_t inbytesleft, outbytesleft, ret;
     iconv_t cd;
      
     if (!charset_from || !charset_to || !string) /* sanity check */
         return -1;
 
     if (octstr_len(string) < 1 || strcasecmp(charset_from, charset_to) == 0)
         return 0; /* we are done, nothing to convert */
         
     cd = iconv_open(charset_to, charset_from);
     /* Did I succeed in getting a conversion descriptor ? */
     if (cd == (iconv_t)(-1)) {
         /* I guess not */
         error(0,"Failed to convert string from <%s> to <%s> - probably broken type names.", 
               charset_from, charset_to);
         return -1; 
     }
     
     from_buf = octstr_get_cstr(string);
     inbytesleft = octstr_len(string);
     /* allocate max sized buffer, assuming target encoding may be 4 byte unicode */
     outbytesleft = inbytesleft * 4;
     pointer = to_buf = gw_malloc(outbytesleft);
 
     do {
         ret = iconv(cd, (ICONV_CONST char**) &from_buf, &inbytesleft, &pointer, &outbytesleft);
         if(ret == -1) {
             long tmp;
             /* the conversion failed somewhere */
             switch(errno) {
             case E2BIG: /* no space in output buffer */
                 debug("charset", 0, "outbuf to small, realloc.");
                 tmp = pointer - to_buf;
                 to_buf = gw_realloc(to_buf, tmp + inbytesleft * 4);
                 outbytesleft += inbytesleft * 4;
                 pointer = to_buf + tmp;
                 ret = 0;
                 break;
             case EILSEQ: /* invalid multibyte sequence */
             case EINVAL: /* incomplete multibyte sequence */
                 warning(0, "Invalid/Incomplete multibyte sequence at position %d, skeep it.",
                         (int)(from_buf - octstr_get_cstr(string)));
                 /* skeep char and try next */
                 if (outbytesleft == 0) {
                     /* buffer to small */
                     tmp = pointer - to_buf;
                     to_buf = gw_realloc(to_buf, tmp + inbytesleft * 4);
                     outbytesleft += inbytesleft * 4;
                     pointer = to_buf + tmp;
                 }
                 pointer[0] = from_buf[0];
                 pointer++;
                 from_buf++;
                 inbytesleft--;
                 outbytesleft--;
                 ret = 0;
                 break;
             }
         }
     } while(inbytesleft && ret == 0); /* stop if error occurs and not handled above */
     
     iconv_close(cd);
     
     if (ret != -1) {
         /* conversion succeeded */
         octstr_truncate(string, 0);
         octstr_append_data(string, to_buf, pointer - to_buf);
         if (ret)
             debug("charset", 0, "charset_convert did %ld non-reversible conversions", (long) ret);
         ret = 0;
     } else
         error(errno,"Failed to convert string from <%s> to <%s>.", charset_from, charset_to);
 
     if (errno == EILSEQ) {
         debug("charset_convert", 0, "Found an invalid multibyte sequence at position <%d>",
               (int)(from_buf - octstr_get_cstr(string)));
     }
     gw_free(to_buf);
     return ret;
 #endif
     /* no convertion done due to not having iconv */
     return -1;
 }

◆ charset_from_utf8()

int charset_from_utf8	(	Octstr *	utf8,
		Octstr **	to,
		Octstr *	charset_to
	)

Definition at line 558 of file charset.c.

References handler, octstr_create_from_data, octstr_get_cstr, and octstr_len().

Referenced by octstr_recode().

 {
     int ret;
     xmlCharEncodingHandlerPtr handler = NULL;
     xmlBufferPtr frombuffer = NULL;
     xmlBufferPtr tobuffer = NULL;
 
     handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_to));
     if (handler == NULL)
     return -2;
 
     /* Build the libxml buffers for the transcoding. */
     tobuffer = xmlBufferCreate();
     frombuffer = xmlBufferCreate();
     xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(utf8), octstr_len(utf8));
 
     ret = xmlCharEncOutFunc(handler, tobuffer, frombuffer);
     if (ret < -2)
     /* Libxml seems to be here a little uncertain what would be the 
      * return code -3, so let's make it -1. Ugly thing, indeed. --tuo */
     ret = -1; 
 
     *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
 
     /* Memory cleanup. */
     xmlBufferFree(tobuffer);
     xmlBufferFree(frombuffer);
 
     return ret;
 }

◆ charset_gsm_to_latin1()

void charset_gsm_to_latin1 ( Octstr * gsm )

Definition at line 394 of file charset.c.

References gsm_esctolatin1, gsm_to_latin1, gsmesc, octstr_delete(), octstr_get_char(), octstr_len(), and octstr_set_char().

Referenced by main().

 {
     long pos, len;
 
     len = octstr_len(ostr);
     for (pos = 0; pos < len; pos++) {
     int c, new, i;
 
     c = octstr_get_char(ostr, pos);
     if (c == 27 && pos + 1 < len) {
         /* GSM escape code.  Delete it, then process the next
              * character specially. */
         octstr_delete(ostr, pos, 1);
         len--;
         c = octstr_get_char(ostr, pos);
         for (i = 0; gsm_esctolatin1[i].gsmesc >= 0; i++) {
         if (gsm_esctolatin1[i].gsmesc == c)
             break;
         }
         if (gsm_esctolatin1[i].gsmesc == c)
         new = gsm_esctolatin1[i].latin1;
         else if (c < 128)
         new = gsm_to_latin1[c];
         else
         continue;
     } else if (c < 128) {
             new = gsm_to_latin1[c];
     } else {
         continue;
     }
     if (new != c)
         octstr_set_char(ostr, pos, new);
     }
 }

◆ charset_gsm_to_nrc_iso_21_german()

void charset_gsm_to_nrc_iso_21_german ( Octstr * ostr )

Definition at line 460 of file charset.c.

References octstr_get_char(), octstr_len(), and octstr_set_char().

Referenced by msg_to_emimsg().

 {
     long pos, len;
     int c, new;
 
     len = octstr_len(ostr);
     
     for (pos = 0; pos < len; pos++) {
         c = octstr_get_char(ostr, pos);
         switch (c) {
             /* GSM value; NRC value */
             case 0x5b: new = 0x5b; break; /* � */
             case 0x5c: new = 0x5c; break; /* � */
             case 0x5e: new = 0x5d; break; /* � */
             case 0x7b: new = 0x7b; break; /* � */
             case 0x7c: new = 0x7c; break; /* � */
             case 0x7e: new = 0x7d; break; /* � */
             case 0x1e: new = 0x7e; break; /* � */
             case 0x5f: new = 0x5e; break; /* � */
             default: new = c;
         }
         if (new != c)
             octstr_set_char(ostr, pos, new);
     }
 }

◆ charset_gsm_to_utf8()

void charset_gsm_to_utf8 ( Octstr * ostr )

Convert octet string in GSM format to UTF-8. Every GSM character can be represented in Unicode, hence nothing will be lost. Escaped characters are translated into the appropriate UTF-8 character.

Definition at line 220 of file charset.c.

References gsm_esctouni, gsm_to_unicode, gsmesc, octstr_append(), octstr_append_char(), octstr_create, octstr_destroy(), octstr_get_char(), octstr_len(), octstr_truncate(), and warning().

Referenced by at2_decode7bituncompressed(), cimd2_accept_message(), convert_addr_from_pdu(), data_sm_to_msg(), extract_msgdata_part_by_coding(), handle_mo_dcs(), handle_operation(), main(), oisd_accept_message(), and pdu_to_msg().

 {
     long pos, len;
     Octstr *newostr;
 
     if (ostr == NULL)
         return;
 
     newostr = octstr_create("");
     len = octstr_len(ostr);
     
     for (pos = 0; pos < len; pos++) {
         int c, i;
         
         c = octstr_get_char(ostr, pos);
         if (c > 127) {
             warning(0, "Could not convert GSM (0x%02x) to Unicode.", c);
             continue;
         }
         
         if(c == 27 && pos + 1 < len) {
             c = octstr_get_char(ostr, ++pos);
             for (i = 0; gsm_esctouni[i].gsmesc >= 0; i++) {
                 if (gsm_esctouni[i].gsmesc == c)
                     break;
             }   
             if (gsm_esctouni[i].gsmesc == c) {
                 /* found a value for escaped char */
                 c = gsm_esctouni[i].unichar;
             } else {
             /* nothing found, look esc in our table */
         c = gsm_to_unicode[27];
                 pos--;
         }
         } else if (c < 128) {
             c = gsm_to_unicode[c];
         }
         /* unicode to utf-8 */
         if(c < 128) {
             /* 0-127 are ASCII chars that need no conversion */
             octstr_append_char(newostr, c);
         } else { 
             /* test if it can be converterd into a two byte char */
             if(c < 0x0800) {
                 octstr_append_char(newostr, ((c >> 6) | 0xC0) & 0xFF); /* add 110xxxxx */
                 octstr_append_char(newostr, (c & 0x3F) | 0x80); /* add 10xxxxxx */
             } else {
                 /* else we encode with 3 bytes. This only happens in case of euro symbol */
                 octstr_append_char(newostr, ((c >> 12) | 0xE0) & 0xFF); /* add 1110xxxx */
                 octstr_append_char(newostr, (((c >> 6) & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
                 octstr_append_char(newostr, ((c  & 0x3F) | 0x80) & 0xFF); /* add 10xxxxxx */
             }
             /* There are no 4 bytes encoded characters in GSM charset */
         }
     }
 
     octstr_truncate(ostr, 0);
     octstr_append(ostr, newostr);
     octstr_destroy(newostr);
 }

◆ charset_gsm_truncate()

int charset_gsm_truncate	(	Octstr *	gsm,
		long	max
	)

Definition at line 512 of file charset.c.

References gsm, octstr_get_char(), octstr_len(), and octstr_truncate().

Referenced by extract_msgdata_part_by_coding(), msg_to_emimsg(), and packet_encode_message().

 {
     if (octstr_len(gsm) > max) {
     /* If the last GSM character was an escaped character,
      * then chop off the escape as well as the character. */
     if (octstr_get_char(gsm, max - 1) == 27)
         octstr_truncate(gsm, max - 1);
     else
         octstr_truncate(gsm, max);
     return 1;
     }
     return 0;
 }

◆ charset_init()

void charset_init ( void )

Definition at line 200 of file charset.c.

References alias_t::alias, chars_aliases, and alias_t::real.

Referenced by gwlib_init().

 {
     int i;
 
     for (i = 0; chars_aliases[i].real != NULL; i++) {
       xmlAddEncodingAlias(chars_aliases[i].real,chars_aliases[i].alias);
       /*debug("encoding",0,"Add encoding for %s",chars_aliases[i].alias);*/
     }
 }

◆ charset_latin1_to_gsm()

void charset_latin1_to_gsm ( Octstr * latin1 )

Definition at line 430 of file charset.c.

References gw_assert(), latin1_to_gsm, octstr_get_char(), octstr_insert_data(), octstr_len(), and octstr_set_char().

 {
     long pos, len;
     int c, new;
     unsigned char esc = 27;
 
     len = octstr_len(ostr);
     for (pos = 0; pos < len; pos++) {
     c = octstr_get_char(ostr, pos);
     gw_assert(c >= 0);
     gw_assert(c <= 256);
     new = latin1_to_gsm[c];
     if (new < 0) {
          /* Escaped GSM code */
         octstr_insert_data(ostr, pos, (char*) &esc, 1);
         pos++;
         len++;
         new = -new;
     }
     if (new != c)
         octstr_set_char(ostr, pos, new);
     }
 }

◆ charset_nrc_iso_21_german_to_gsm()

void charset_nrc_iso_21_german_to_gsm ( Octstr * ostr )

Definition at line 486 of file charset.c.

References octstr_get_char(), octstr_len(), and octstr_set_char().

Referenced by handle_operation().

 {
     long pos, len;
     int c, new;
 
     len = octstr_len(ostr);
 
     for (pos = 0; pos < len; pos++) {
         c = octstr_get_char(ostr, pos);
         switch (c) {
             /* NRC value; GSM value */
             case 0x5b: new = 0x5b; break; /* � */
             case 0x5c: new = 0x5c; break; /* � */
             case 0x5d: new = 0x5e; break; /* � */
             case 0x7b: new = 0x7b; break; /* � */
             case 0x7c: new = 0x7c; break; /* � */
             case 0x7d: new = 0x7e; break; /* � */
             case 0x7e: new = 0x1e; break; /* � */
             case 0x5e: new = 0x5f; break; /* � */
             default: new = c;
         }
         if (new != c)
             octstr_set_char(ostr, pos, new);
     }
 }

◆ charset_shutdown()

void charset_shutdown ( void )

Definition at line 210 of file charset.c.

Referenced by gwlib_shutdown().

 {
     xmlCleanupEncodingAliases();
 }

◆ charset_to_utf8()

int charset_to_utf8	(	Octstr *	from,
		Octstr **	to,
		Octstr *	charset_from
	)

Definition at line 526 of file charset.c.

References from, handler, octstr_compare(), octstr_create_from_data, octstr_duplicate, octstr_get_cstr, octstr_imm(), and octstr_len().

Referenced by octstr_recode(), and set_charset().

 {
     int ret;
     xmlCharEncodingHandlerPtr handler = NULL;
     xmlBufferPtr frombuffer = NULL;
     xmlBufferPtr tobuffer = NULL;
 
     if (octstr_compare(charset_from, octstr_imm("UTF-8")) == 0) {
         *to = octstr_duplicate(from);
         return 0;
     }
 
     handler = xmlFindCharEncodingHandler(octstr_get_cstr(charset_from));
     if (handler == NULL)
     return -2;
 
     /* Build the libxml buffers for the transcoding. */
     tobuffer = xmlBufferCreate();
     frombuffer = xmlBufferCreate();
     xmlBufferAdd(frombuffer, (unsigned char*)octstr_get_cstr(from), octstr_len(from));
 
     ret = xmlCharEncInFunc(handler, tobuffer, frombuffer);
 
     *to = octstr_create_from_data((char*)tobuffer->content, tobuffer->use);
 
     /* Memory cleanup. */
     xmlBufferFree(tobuffer);
     xmlBufferFree(frombuffer);
 
     return ret;
 }

◆ charset_utf8_to_gsm()

void charset_utf8_to_gsm ( Octstr * ostr )

Convert octet string in UTF-8 format to GSM 03.38. Because not every UTF-8 character can be converted to GSM 03.38, non-convertible characters are replaced with the NRP character (see define above). Special characters are turned into escape sequences. Incomplete UTF-8 characters at the end of the string are skipped.

Definition at line 288 of file charset.c.

References latin1_to_gsm, NRP, octstr_append(), octstr_append_char(), octstr_create, octstr_destroy(), octstr_get_char(), octstr_len(), octstr_truncate(), and warning().

Referenced by at2_pdu_encode(), extract_msgdata_part_by_coding(), handle_mt_dcs(), main(), msg_to_emimsg(), msg_to_pdu(), ois_append_sm_text(), packet_encode_message(), and sms_msgdata_len().

 {
     long pos, len;
     int val1, val2;
     Octstr *newostr;
 
     if (ostr == NULL)
         return;
     
     newostr = octstr_create("");
     len = octstr_len(ostr);
     
     for (pos = 0; pos < len; pos++) {
         val1 = octstr_get_char(ostr, pos);
         
         /* check range */
         if (val1 < 0 || val1 > 255) {
             warning(0, "Char (0x%02x) in UTF-8 string not in the range (0, 255). Skipped.", val1);
             continue;
         }
         
         /* Convert UTF-8 to unicode code */
         
         /* test if two byte utf8 char */
         if ((val1 & 0xE0) == 0xC0) {
             /* test if incomplete utf char */
             if(pos + 1 < len) {
                 val2 = octstr_get_char(ostr, ++pos);
                 val1 = (((val1 & ~0xC0) << 6) | (val2 & 0x3F));
             } else {
                 /* incomplete, ignore it */
                 warning(0, "Incomplete UTF-8 char discovered, skipped. 1");
                 pos += 1;
                 continue;
             }
         } else if ((val1 & 0xF0) == 0xE0) { /* test for three byte utf8 char */
             if(pos + 2 < len) {
                 val2 = octstr_get_char(ostr, ++pos);
                 val1 = (((val1 & ~0xE0) << 6) | (val2 & 0x3F));
                 val2 = octstr_get_char(ostr, ++pos);
                 val1 = (val1 << 6) | (val2 & 0x3F);
             } else {
                 /* incomplete, ignore it */
                 warning(0, "Incomplete UTF-8 char discovered, skipped. 2");
                 pos += 2;
                 continue;
             }
         }
 
         /* test Latin code page 1 char */
         if(val1 <= 255) {
             val1 = latin1_to_gsm[val1];
             /* needs to be escaped ? */
             if(val1 < 0) {
                 octstr_append_char(newostr, 27);
                 val1 *= -1;
             }
         } else {
             /* Its not a Latin1 char, test for allowed GSM chars */
             switch(val1) {
             case 0x394:
                 val1 = 0x10; /* GREEK CAPITAL LETTER DELTA */
                 break;
             case 0x3A6:
                 val1 = 0x12; /* GREEK CAPITAL LETTER PHI */
                 break;
             case 0x393:
                 val1 = 0x13; /* GREEK CAPITAL LETTER GAMMA */
                 break;
             case 0x39B:
                 val1 = 0x14; /* GREEK CAPITAL LETTER LAMBDA */
                 break;
             case 0x3A9:
                 val1 = 0x15; /* GREEK CAPITAL LETTER OMEGA */
                 break;
             case 0x3A0:
                 val1 = 0x16; /* GREEK CAPITAL LETTER PI */
                 break;
             case 0x3A8:
                 val1 = 0x17; /* GREEK CAPITAL LETTER PSI */
                 break;
             case 0x3A3:
                 val1 = 0x18; /* GREEK CAPITAL LETTER SIGMA */
                 break;
             case 0x398:
                 val1 = 0x19; /* GREEK CAPITAL LETTER THETA */
                 break;
             case 0x39E:
                 val1 = 0x1A; /* GREEK CAPITAL LETTER XI */
                 break;
             case 0x20AC:
                 val1 = 'e'; /* EURO SIGN */
                 octstr_append_char(newostr, 27);
                 break;
             default: val1 = NRP; /* character cannot be represented in GSM 03.38 */
             }
         }
         octstr_append_char(newostr, val1);
     }
 
     octstr_truncate(ostr, 0);
     octstr_append(ostr, newostr);
     octstr_destroy(newostr);
 }

Functions