#include "wsint.h"

Macros
#define	WS_UTF8_ENC_1_M 0xffffff80

#define	WS_UTF8_ENC_2_M 0xfffff800

#define	WS_UTF8_ENC_3_M 0xffff0000

#define	WS_UTF8_ENC_4_M 0xffe00000

#define	WS_UTF8_ENC_5_M 0xfc000000

#define	WS_UTF8_ENC_6_M 0x80000000

#define	WS_UTF8_ENC_C_BITS 0x80

#define	WS_UTF8_CONT_DATA_MASK 0x3f

#define	WS_UTF8_ENC_TYPE(ch)

#define	WS_UTF8_DEC_1_M 0x80

#define	WS_UTF8_DEC_2_M 0xe0

#define	WS_UTF8_DEC_3_M 0xf0

#define	WS_UTF8_DEC_4_M 0xf8

#define	WS_UTF8_DEC_5_M 0xfc

#define	WS_UTF8_DEC_6_M 0xfe

#define	WS_UTF8_DEC_1_V 0x00

#define	WS_UTF8_DEC_2_V 0xc0

#define	WS_UTF8_DEC_3_V 0xe0

#define	WS_UTF8_DEC_4_V 0xf0

#define	WS_UTF8_DEC_5_V 0xf8

#define	WS_UTF8_DEC_6_V 0xfc

#define	WS_UTF8_DEC_C_M 0xc0

#define	WS_UTF8_DEC_C_V 0x80

#define	WS_UTF8_DEC_TYPE(b)

#define	WS_UTF8_DEC_C_P(b) (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)

Functions
WsUtf8String *	ws_utf8_alloc (void)

void	ws_utf8_free (WsUtf8String *string)

int	ws_utf8_append_char (WsUtf8String *string, unsigned long ch)

int	ws_utf8_verify (const unsigned char *data, size_t len, size_t *strlen_return)

int	ws_utf8_set_data (WsUtf8String *string, const unsigned char *data, size_t len)

int	ws_utf8_get_char (const WsUtf8String *string, unsigned long *ch_return, size_t *posp)

unsigned char *	ws_utf8_to_latin1 (const WsUtf8String *string, unsigned char unknown_char, size_t *len_return)

void	ws_utf8_free_data (unsigned char *data)

Variables
static unsigned char	utf8_hibits [7]

static unsigned char	utf8_hidata_masks [7]

Macro Definition Documentation

◆ WS_UTF8_CONT_DATA_MASK

#define WS_UTF8_CONT_DATA_MASK 0x3f

Definition at line 102 of file wsutf8.c.

Referenced by ws_utf8_append_char(), and ws_utf8_get_char().

◆ WS_UTF8_DEC_1_M

#define WS_UTF8_DEC_1_M 0x80

Definition at line 125 of file wsutf8.c.

◆ WS_UTF8_DEC_1_V

#define WS_UTF8_DEC_1_V 0x00

Definition at line 132 of file wsutf8.c.

◆ WS_UTF8_DEC_2_M

#define WS_UTF8_DEC_2_M 0xe0

Definition at line 126 of file wsutf8.c.

◆ WS_UTF8_DEC_2_V

#define WS_UTF8_DEC_2_V 0xc0

Definition at line 133 of file wsutf8.c.

◆ WS_UTF8_DEC_3_M

#define WS_UTF8_DEC_3_M 0xf0

Definition at line 127 of file wsutf8.c.

◆ WS_UTF8_DEC_3_V

#define WS_UTF8_DEC_3_V 0xe0

Definition at line 134 of file wsutf8.c.

◆ WS_UTF8_DEC_4_M

#define WS_UTF8_DEC_4_M 0xf8

Definition at line 128 of file wsutf8.c.

◆ WS_UTF8_DEC_4_V

#define WS_UTF8_DEC_4_V 0xf0

Definition at line 135 of file wsutf8.c.

◆ WS_UTF8_DEC_5_M

#define WS_UTF8_DEC_5_M 0xfc

Definition at line 129 of file wsutf8.c.

◆ WS_UTF8_DEC_5_V

#define WS_UTF8_DEC_5_V 0xf8

Definition at line 136 of file wsutf8.c.

◆ WS_UTF8_DEC_6_M

#define WS_UTF8_DEC_6_M 0xfe

Definition at line 130 of file wsutf8.c.

◆ WS_UTF8_DEC_6_V

#define WS_UTF8_DEC_6_V 0xfc

Definition at line 137 of file wsutf8.c.

◆ WS_UTF8_DEC_C_M

#define WS_UTF8_DEC_C_M 0xc0

Definition at line 154 of file wsutf8.c.

◆ WS_UTF8_DEC_C_P

#define WS_UTF8_DEC_C_P ( b ) (((b) & WS_UTF8_DEC_C_M) == WS_UTF8_DEC_C_V)

Definition at line 178 of file wsutf8.c.

Referenced by ws_utf8_verify().

◆ WS_UTF8_DEC_C_V

#define WS_UTF8_DEC_C_V 0x80

Definition at line 155 of file wsutf8.c.

◆ WS_UTF8_DEC_TYPE

#define WS_UTF8_DEC_TYPE ( b )

Value:

(((b) & WS_UTF8_DEC_1_M) == WS_UTF8_DEC_1_V         \
     ? 1                            \
     : (((b) & WS_UTF8_DEC_2_M) == WS_UTF8_DEC_2_V      \
       ? 2                          \
       : (((b) & WS_UTF8_DEC_3_M) == WS_UTF8_DEC_3_V        \
         ? 3                            \
         : (((b) & WS_UTF8_DEC_4_M) == WS_UTF8_DEC_4_V      \
           ? 4                          \
           : (((b) & WS_UTF8_DEC_5_M) == WS_UTF8_DEC_5_V    \
             ? 5                        \
             : (((b) & WS_UTF8_DEC_6_M) == WS_UTF8_DEC_6_V  \
               ? 6                      \
               : 0))))))

Definition at line 161 of file wsutf8.c.

Referenced by ws_utf8_get_char(), and ws_utf8_verify().

◆ WS_UTF8_ENC_1_M

#define WS_UTF8_ENC_1_M 0xffffff80

Definition at line 77 of file wsutf8.c.

◆ WS_UTF8_ENC_2_M

#define WS_UTF8_ENC_2_M 0xfffff800

Definition at line 78 of file wsutf8.c.

◆ WS_UTF8_ENC_3_M

#define WS_UTF8_ENC_3_M 0xffff0000

Definition at line 79 of file wsutf8.c.

◆ WS_UTF8_ENC_4_M

#define WS_UTF8_ENC_4_M 0xffe00000

Definition at line 80 of file wsutf8.c.

◆ WS_UTF8_ENC_5_M

#define WS_UTF8_ENC_5_M 0xfc000000

Definition at line 81 of file wsutf8.c.

◆ WS_UTF8_ENC_6_M

#define WS_UTF8_ENC_6_M 0x80000000

Definition at line 82 of file wsutf8.c.

◆ WS_UTF8_ENC_C_BITS

#define WS_UTF8_ENC_C_BITS 0x80

Definition at line 99 of file wsutf8.c.

Referenced by ws_utf8_append_char().

◆ WS_UTF8_ENC_TYPE

#define WS_UTF8_ENC_TYPE ( ch )

Value:

(((ch) & WS_UTF8_ENC_1_M) == 0      \
     ? 1                    \
     : (((ch) & WS_UTF8_ENC_2_M) == 0       \
       ? 2                  \
       : (((ch) & WS_UTF8_ENC_3_M) == 0     \
         ? 3                    \
         : (((ch) & WS_UTF8_ENC_4_M) == 0   \
           ? 4                  \
           : (((ch) & WS_UTF8_ENC_5_M) == 0 \
             ? 5                \
             : (((ch) & WS_UTF8_ENC_6_M) == 0   \
               ? 6              \
               : 0))))))

Definition at line 108 of file wsutf8.c.

Referenced by ws_utf8_append_char().

Function Documentation

◆ ws_utf8_alloc()

WsUtf8String* ws_utf8_alloc ( void )

Definition at line 182 of file wsutf8.c.

References ws_calloc().

Referenced by ws_bc_encode(), and ws_yy_lex().

 {
     return ws_calloc(1, sizeof(WsUtf8String));
 }

◆ ws_utf8_append_char()

int ws_utf8_append_char	(	WsUtf8String *	string,
		unsigned long	ch
	)

Definition at line 198 of file wsutf8.c.

References WsUtf8StringRec::data, WsUtf8StringRec::len, utf8_hibits, ws_fatal(), ws_realloc(), WS_UTF8_CONT_DATA_MASK, WS_UTF8_ENC_C_BITS, and WS_UTF8_ENC_TYPE.

Referenced by ws_yy_lex().

 {
     unsigned char *d;
     unsigned int num_bytes = WS_UTF8_ENC_TYPE(ch);
     unsigned int len, i;
 
     if (num_bytes == 0)
         ws_fatal("ws_utf8_append_char(): 0x%lx is not a valid UTF-8 character",
                  ch);
 
     d = ws_realloc(string->data, string->len + num_bytes);
     if (d == NULL)
         return 0;
 
     len = string->len;
 
     /* Encode the continuation bytes (n > 1). */
     for (i = num_bytes - 1; i > 0; i--) {
         d[len + i] = WS_UTF8_ENC_C_BITS;
         d[len + i] |= ch & WS_UTF8_CONT_DATA_MASK;
         ch >>= 6;
     }
 
     /* And continue the first byte. */
     d[len] = utf8_hibits[num_bytes];
     d[len] |= ch;
 
     string->data = d;
     string->len += num_bytes;
     string->num_chars++;
 
     return 1;
 }

◆ ws_utf8_free()

void ws_utf8_free ( WsUtf8String * string )

Definition at line 188 of file wsutf8.c.

References WsUtf8StringRec::data, and ws_free().

Referenced by ws_bc_encode(), and ws_yy_lex().

 {
     if (string == NULL)
         return;
 
     ws_free(string->data);
     ws_free(string);
 }

◆ ws_utf8_free_data()

void ws_utf8_free_data ( unsigned char * data )

Definition at line 368 of file wsutf8.c.

References ws_free().

Referenced by pragma_meta(), and ws_bc_encode().

 {
     if (data)
         ws_free(data);
 }

◆ ws_utf8_get_char()

int ws_utf8_get_char	(	const WsUtf8String *	string,
		unsigned long *	ch_return,
		size_t *	posp
	)

Definition at line 293 of file wsutf8.c.

References WsUtf8StringRec::len, utf8_hidata_masks, WS_UTF8_CONT_DATA_MASK, and WS_UTF8_DEC_TYPE.

Referenced by main(), and ws_utf8_to_latin1().

 {
     size_t pos = *posp;
     unsigned int num_bytes, i;
     unsigned char *data;
     unsigned long ch;
 
     if (pos < 0 || pos >= string->len)
         /* Index out range. */
         return 0;
 
     data = string->data + pos;
 
     num_bytes = WS_UTF8_DEC_TYPE(*data);
     if (num_bytes == 0)
         /* Invalid position. */
         return 0;
 
     if (pos + num_bytes > string->len)
         /* Truncated data. */
         return 0;
 
     /* Get the first byte. */
     ch = data[0] & utf8_hidata_masks[num_bytes];
 
     /* Add the continuation bytes. */
     for (i = 1; i < num_bytes; i++) {
         ch <<= 6;
         ch |= data[i] & WS_UTF8_CONT_DATA_MASK;
     }
 
     *ch_return = ch;
     *posp = pos + num_bytes;
 
     return 1;
 }

◆ ws_utf8_set_data()

int ws_utf8_set_data	(	WsUtf8String *	string,
		const unsigned char *	data,
		size_t	len
	)

Definition at line 266 of file wsutf8.c.

References WsUtf8StringRec::data, ws_free(), ws_memdup(), and ws_utf8_verify().

Referenced by ws_bc_encode().

 {
     size_t num_chars;
 
     if (!ws_utf8_verify(data, len, &num_chars))
         /* Malformed data. */
         return 0;
 
     /* Init `string' to empty. */
     ws_free(string->data);
     string->data = NULL;
     string->len = 0;
     string->num_chars = 0;
 
     /* Set the new data. */
     string->data = ws_memdup(data, len);
     if (string->data == NULL)
         return 0;
 
     string->len = len;
     string->num_chars = num_chars;
 
     return 1;
 }

◆ ws_utf8_to_latin1()

unsigned char* ws_utf8_to_latin1	(	const WsUtf8String *	string,
		unsigned char	unknown_char,
		size_t *	len_return
	)

Definition at line 332 of file wsutf8.c.

References WsUtf8StringRec::num_chars, ws_fatal(), ws_malloc(), and ws_utf8_get_char().

Referenced by pragma_meta(), and ws_bc_encode().

 {
     unsigned char *cstr;
     size_t i;
     size_t pos = 0;
 
     if (string == NULL)
         return NULL;
 
     cstr = ws_malloc(string->num_chars + 1);
     if (cstr == NULL)
         return NULL;
 
     for (i = 0; i < string->num_chars; i++) {
         unsigned long ch;
 
         if (!ws_utf8_get_char(string, &ch, &pos))
             ws_fatal("ws_utf8_to_latin1_cstr(): internal inconsistency");
 
         if (ch > 0xff)
             cstr[i] = unknown_char;
         else
             cstr[i] = (unsigned char) ch;
     }
 
     cstr[i] = '\0';
 
     if (len_return)
         *len_return = string->num_chars;
 
     return cstr;
 }

◆ ws_utf8_verify()

int ws_utf8_verify	(	const unsigned char *	data,
		size_t	len,
		size_t *	strlen_return
	)

Definition at line 233 of file wsutf8.c.

References WS_UTF8_DEC_C_P, and WS_UTF8_DEC_TYPE.

Referenced by ws_bc_decode(), and ws_utf8_set_data().

 {
     unsigned int num_bytes, i;
     size_t strlen = 0;
 
     while (len > 0) {
         num_bytes = WS_UTF8_DEC_TYPE(*data);
         if (num_bytes == 0)
             /* Not a valid beginning. */
             return 0;
 
         if (len < num_bytes)
             /* The data is truncated. */
             return 0;
 
         for (i = 1; i < num_bytes; i++)
             if (!WS_UTF8_DEC_C_P(data[i]))
                 /* Not a valid continuation byte. */
                 return 0;
 
         len -= num_bytes;
         data += num_bytes;
         strlen++;
     }
 
     if (strlen_return)
         *strlen_return = strlen;
 
     return 1;
 }

Variable Documentation

◆ utf8_hibits

unsigned char utf8_hibits[7]

static

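Initial value:

= {
    0x00,   /* unused */
    0x00,   /* 1 byte  (WS_UTF8_DEC_1_V) */
    0xc0,   /* 2 bytes (WS_UTF8_DEC_2_V) */
    0xe0,   /* 3 bytes (WS_UTF8_DEC_3_V) */
    0xf0,   /* 4 bytes (WS_UTF8_DEC_4_V) */
    0xf8,   /* 5 bytes (WS_UTF8_DEC_5_V) */
    0xfc,   /* 6 bytes (WS_UTF8_DEC_6_V) */
}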

Definition at line 87 of file wsutf8.c.

Referenced by ws_utf8_append_char().

◆ utf8_hidata_masks

unsigned char utf8_hidata_masks[7]

static

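Initial value:

= {
    0x00,   /* unused */
    0x7f,   /* 1 byte  (complement of WS_UTF8_DEC_1_M) */
    0x1f,   /* 2 bytes (complement of WS_UTF8_DEC_2_M) */
    0x0f,   /* 3 bytes (complement of WS_UTF8_DEC_3_M) */
    0x07,   /* 4 bytes (complement of WS_UTF8_DEC_4_M) */
    0x03,   /* 5 bytes (complement of WS_UTF8_DEC_5_M) */
    0x01,   /* 6 bytes (complement of WS_UTF8_DEC_6_M) */
}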

Definition at line 142 of file wsutf8.c.

Referenced by ws_utf8_get_char().

Macros

Functions

Variables

Macro Definition Documentation

◆ WS_UTF8_CONT_DATA_MASK

◆ WS_UTF8_DEC_1_M

◆ WS_UTF8_DEC_1_V

◆ WS_UTF8_DEC_2_M

◆ WS_UTF8_DEC_2_V

◆ WS_UTF8_DEC_3_M

◆ WS_UTF8_DEC_3_V

◆ WS_UTF8_DEC_4_M

◆ WS_UTF8_DEC_4_V

◆ WS_UTF8_DEC_5_M

◆ WS_UTF8_DEC_5_V

◆ WS_UTF8_DEC_6_M

◆ WS_UTF8_DEC_6_V

◆ WS_UTF8_DEC_C_M

◆ WS_UTF8_DEC_C_P

◆ WS_UTF8_DEC_C_V

◆ WS_UTF8_DEC_TYPE

◆ WS_UTF8_ENC_1_M

◆ WS_UTF8_ENC_2_M

◆ WS_UTF8_ENC_3_M

◆ WS_UTF8_ENC_4_M

◆ WS_UTF8_ENC_5_M

◆ WS_UTF8_ENC_6_M

◆ WS_UTF8_ENC_C_BITS

◆ WS_UTF8_ENC_TYPE

Function Documentation

◆ ws_utf8_alloc()

◆ ws_utf8_append_char()

◆ ws_utf8_free()

◆ ws_utf8_free_data()

◆ ws_utf8_get_char()

◆ ws_utf8_set_data()

◆ ws_utf8_to_latin1()

◆ ws_utf8_verify()

Variable Documentation

◆ utf8_hibits

◆ utf8_hidata_masks