Main Page | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals

html.c

Go to the documentation of this file.
00001 /* ==================================================================== 
00002  * The Kannel Software License, Version 1.0 
00003  * 
00004  * Copyright (c) 2001-2008 Kannel Group  
00005  * Copyright (c) 1998-2001 WapIT Ltd.   
00006  * All rights reserved. 
00007  * 
00008  * Redistribution and use in source and binary forms, with or without 
00009  * modification, are permitted provided that the following conditions 
00010  * are met: 
00011  * 
00012  * 1. Redistributions of source code must retain the above copyright 
00013  *    notice, this list of conditions and the following disclaimer. 
00014  * 
00015  * 2. Redistributions in binary form must reproduce the above copyright 
00016  *    notice, this list of conditions and the following disclaimer in 
00017  *    the documentation and/or other materials provided with the 
00018  *    distribution. 
00019  * 
00020  * 3. The end-user documentation included with the redistribution, 
00021  *    if any, must include the following acknowledgment: 
00022  *       "This product includes software developed by the 
00023  *        Kannel Group (http://www.kannel.org/)." 
00024  *    Alternately, this acknowledgment may appear in the software itself, 
00025  *    if and wherever such third-party acknowledgments normally appear. 
00026  * 
00027  * 4. The names "Kannel" and "Kannel Group" must not be used to 
00028  *    endorse or promote products derived from this software without 
00029  *    prior written permission. For written permission, please  
00030  *    contact org@kannel.org. 
00031  * 
00032  * 5. Products derived from this software may not be called "Kannel", 
00033  *    nor may "Kannel" appear in their name, without prior written 
00034  *    permission of the Kannel Group. 
00035  * 
00036  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 
00037  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
00038  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
00039  * DISCLAIMED.  IN NO EVENT SHALL THE KANNEL GROUP OR ITS CONTRIBUTORS 
00040  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,  
00041  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT  
00042  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR  
00043  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,  
00044  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE  
00045  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  
00046  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
00047  * ==================================================================== 
00048  * 
00049  * This software consists of voluntary contributions made by many 
00050  * individuals on behalf of the Kannel Group.  For more information on  
00051  * the Kannel Group, please see <http://www.kannel.org/>. 
00052  * 
00053  * Portions of this software are based upon software originally written at  
00054  * WapIT Ltd., Helsinki, Finland for the Kannel project.  
00055  */ 
00056 
00057 /*
00058  * html.c - routines for manipulating HTML.
00059  *
00060  * Lars Wirzenius
00061  */
00062 
00063 
00064 #include <ctype.h>
00065 #include <stdio.h>
00066 #include <string.h>
00067 
00068 #include "html.h"
00069 #include "gwlib/gwlib.h"
00070 
00071 #define SMS_MAX 161
00072 
00073 
00074 /* Is there a comment beginning at offset `pos'? */
00075 static int html_comment_begins(Octstr *html, long pos)
00076 {
00077     char buf[10];
00078 
00079     octstr_get_many_chars(buf, html, pos, 4);
00080     buf[5] = '\0';
00081     return strcmp(buf, "<!--") == 0;
00082 }
00083 
00084 
00085 /* Skip a comment in HTML. */
00086 static void skip_html_comment(Octstr *html, long *pos)
00087 {
00088     long i;
00089 
00090     *pos += 4;  /* Skip "<!--" at beginning of comment. */
00091     i = octstr_search(html, octstr_imm("-->"), *pos);
00092     if (i == -1)
00093         *pos = octstr_len(html);
00094     else
00095         *pos = i;
00096 }
00097 
00098 
00099 /* Skip a beginning or ending tag in HTML, including any attributes. */
00100 static void skip_html_tag(Octstr *html, long *pos)
00101 {
00102     long i, len;
00103     int c;
00104 
00105     /* Skip leading '<'. */
00106     ++(*pos);
00107 
00108     /* Skip name of tag and attributes with values. */
00109     len = octstr_len(html);
00110     while (*pos < len && (c = octstr_get_char(html, *pos)) != '>') {
00111         if (c == '"' || c == '\'') {
00112             i = octstr_search_char(html, c, *pos + 1);
00113             if (i == -1)
00114                 *pos = len;
00115             else
00116                 *pos = i + 1;
00117         } else
00118             ++(*pos);
00119     }
00120 
00121     /* Skip trailing '>' if it is there. */
00122     if (octstr_get_char(html, *pos) == '>')
00123         ++(*pos);
00124 }
00125 
00126 
00127 /* Convert an HTML entity into a single character and advance `*html' past
00128    the entity. */
00129 static void convert_html_entity(Octstr *sms, Octstr *html, long *pos)
00130 {
00131     static struct {
00132         char *entity;
00133         int latin1;
00134     }
00135     tab[] = {
00136         { "&amp;", '&' },
00137         { "&lt;", '<' },
00138         { "&gt;", '>' },
00139 
00140         /* The following is copied from
00141 
00142             http://www.hut.fi/~jkorpela/HTML3.2/latin1.html
00143 
00144            by Jukka Korpela. Hand and script edited to form this
00145            table. */
00146 
00147         { "&nbsp;", ' ' },
00148         { "&iexcl;", 161 },
00149         { "&cent;", 162 },
00150         { "&pound;", 163 },
00151         { "&curren;", 164 },
00152         { "&yen;", 165 },
00153         { "&brvbar;", 166 },
00154         { "&sect;", 167 },
00155         { "&uml;", 168 },
00156         { "&copy;", 169 },
00157         { "&ordf;", 170 },
00158         { "&laquo;", 171 },
00159         { "&not;", 172 },
00160         { "&shy;", 173 },
00161         { "&reg;", 174 },
00162         { "&macr;", 175 },
00163         { "&deg;", 176 },
00164         { "&plusmn;", 177 },
00165         { "&sup2;", 178 },
00166         { "&sup3;", 179 },
00167         { "&acute;", 180 },
00168         { "&micro;", 181 },
00169         { "&para;", 182 },
00170         { "&middot;", 183 },
00171         { "&cedil;", 184 },
00172         { "&sup1;", 185 },
00173         { "&ordm;", 186 },
00174         { "&raquo;", 187 },
00175         { "&frac14;", 188 },
00176         { "&frac12;", 189 },
00177         { "&frac34;", 190 },
00178         { "&iquest;", 191 },
00179         { "&Agrave;", 192 },
00180         { "&Aacute;", 193 },
00181         { "&Acirc;", 194 },
00182         { "&Atilde;", 195 },
00183         { "&Auml;", 196 },
00184         { "&Aring;", 197 },
00185         { "&AElig;", 198 },
00186         { "&Ccedil;", 199 },
00187         { "&Egrave;", 200 },
00188         { "&Eacute;", 201 },
00189         { "&Ecirc;", 202 },
00190         { "&Euml;", 203 },
00191         { "&Igrave;", 204 },
00192         { "&Iacute;", 205 },
00193         { "&Icirc;", 206 },
00194         { "&Iuml;", 207 },
00195         { "&ETH;", 208 },
00196         { "&Ntilde;", 209 },
00197         { "&Ograve;", 210 },
00198         { "&Oacute;", 211 },
00199         { "&Ocirc;", 212 },
00200         { "&Otilde;", 213 },
00201         { "&Ouml;", 214 },
00202         { "&times;", 215 },
00203         { "&Oslash;", 216 },
00204         { "&Ugrave;", 217 },
00205         { "&Uacute;", 218 },
00206         { "&Ucirc;", 219 },
00207         { "&Uuml;", 220 },
00208         { "&Yacute;", 221 },
00209         { "&THORN;", 222 },
00210         { "&szlig;", 223 },
00211         { "&agrave;", 224 },
00212         { "&aacute;", 225 },
00213         { "&acirc;", 226 },
00214         { "&atilde;", 227 },
00215         { "&auml;", 228 },
00216         { "&aring;", 229 },
00217         { "&aelig;", 230 },
00218         { "&ccedil;", 231 },
00219         { "&egrave;", 232 },
00220         { "&eacute;", 233 },
00221         { "&ecirc;", 234 },
00222         { "&euml;", 235 },
00223         { "&igrave;", 236 },
00224         { "&iacute;", 237 },
00225         { "&icirc;", 238 },
00226         { "&iuml;", 239 },
00227         { "&eth;", 240 },
00228         { "&ntilde;", 241 },
00229         { "&ograve;", 242 },
00230         { "&oacute;", 243 },
00231         { "&ocirc;", 244 },
00232         { "&otilde;", 245 },
00233         { "&ouml;", 246 },
00234         { "&divide;", 247 },
00235         { "&oslash;", 248 },
00236         { "&ugrave;", 249 },
00237         { "&uacute;", 250 },
00238         { "&ucirc;", 251 },
00239         { "&uuml;", 252 },
00240         { "&yacute;", 253 },
00241         { "&thorn;", 254 },
00242         { "&yuml;", 255 },
00243     };
00244     int num_tab = sizeof(tab) / sizeof(tab[0]);
00245     long i, code;
00246     size_t len;
00247     char buf[1024];
00248 
00249     if (octstr_get_char(html, *pos + 1) == '#') {
00250         if (octstr_get_char(html, *pos + 2) == 'x' || octstr_get_char(html, *pos + 2) == 'X')
00251             i = octstr_parse_long(&code, html, *pos + 3, 16); /* hex */
00252         else
00253             i = octstr_parse_long(&code, html, *pos + 2, 10); /* decimal */
00254         if (i > 0) {
00255             if (code < 256)
00256                 octstr_append_char(sms, code);
00257             *pos = i + 1;
00258             if (octstr_get_char(html, *pos) == ';')
00259                 ++(*pos);
00260         } else {
00261             ++(*pos);
00262             octstr_append_char(sms, '&');
00263         }
00264     } else {
00265         for (i = 0; i < num_tab; ++i) {
00266             len = strlen(tab[i].entity);
00267             octstr_get_many_chars(buf, html, *pos, len);
00268             buf[len] = '\0';
00269             if (strcmp(buf, tab[i].entity) == 0) {
00270                 *pos += len;
00271                 octstr_append_char(sms, tab[i].latin1);
00272                 break;
00273             }
00274         }
00275         if (i == num_tab) {
00276             ++(*pos);
00277             octstr_append_char(sms, '&');
00278         }
00279     }
00280 }
00281 
00282 
00283 Octstr *html_to_sms(Octstr *html)
00284 {
00285     long i, len;
00286     int c;
00287     Octstr *sms;
00288 
00289     sms = octstr_create("");
00290     len = octstr_len(html);
00291     i = 0;
00292     while (i < len) {
00293         c = octstr_get_char(html, i);
00294         switch (c) {
00295         case '<':
00296             if (html_comment_begins(html, i))
00297                 skip_html_comment(html, &i);
00298             else
00299                 skip_html_tag(html, &i);
00300             break;
00301         case '&':
00302             convert_html_entity(sms, html, &i);
00303             break;
00304         default:
00305             octstr_append_char(sms, c);
00306             ++i;
00307             break;
00308         }
00309     }
00310     octstr_shrink_blanks(sms);
00311     octstr_strip_blanks(sms);
00312     return sms;
00313 }
See file LICENSE for details about the license agreement for using, modifying, copying or deriving work from this software.