/*
 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
 * Copyright (C) 1999-2007 Hiroyuki Yamamoto and the Claws Mail team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
29 #define SC_HTMLBUFSIZE 8192
30 #define HR_STR "------------------------------------------------"
32 typedef struct _SC_HTMLSymbol SC_HTMLSymbol;
40 static SC_HTMLSymbol symbol_list[] = {
49 {"«", "\302\253"},
50 {"»", "\302\273"},
52 {"™", "\50\124\115\51"},
53 {"…", "\56\56\56"},
56 {"—", "\55\55"},
57 {"€", "\105\125\122"},
58 {"¢", "\302\242"},
59 {"£", "\302\243"},
60 {"¤", "\302\244"},
61 {"¥", "\302\245"},
62 {"©", "\302\251"},
63 {"®", "\302\256"},
64 {"¿", "\302\277"},
65 {"¡", "\302\241"}
68 static SC_HTMLSymbol ascii_symbol_list[] = {
69 {"¡" , "\302\241"},
70 {"¦", "\302\246"},
71 {"©" , "\302\251"},
72 {"«" , "\302\253"},
73 {"®" , "\302\256"},
75 {"²" , "\302\262"},
76 {"³" , "\302\263"},
77 {"´" , "\302\264"},
78 {"¸" , "\302\270"},
79 {"¹" , "\302\271"},
80 {"»" , "\302\273"},
81 {"¼", "\302\274"},
82 {"½", "\302\275"},
83 {"¾", "\302\276"},
84 {"¿", "\302\277"},
86 {"À", "\303\200"},
87 {"Á", "\303\201"},
88 {"Â" , "\303\202"},
89 {"Ã", "\303\203"},
90 {"Æ" , "\303\206"},
91 {"È", "\303\210"},
92 {"É", "\303\211"},
93 {"Ê" , "\303\212"},
94 {"Ì", "\303\214"},
95 {"Í", "\303\215"},
96 {"Î" , "\303\216"},
98 {"Ñ", "\303\221"},
99 {"Ò", "\303\222"},
100 {"Ó", "\303\223"},
101 {"Ô" , "\303\224"},
102 {"Õ", "\303\225"},
103 {"Ù", "\303\231"},
104 {"Ú", "\303\232"},
105 {"Û" , "\303\233"},
106 {"Ý", "\303\235"},
108 {"à", "\303\240"},
109 {"á", "\303\241"},
110 {"â" , "\303\242"},
111 {"ã", "\303\243"},
112 {"æ" , "\303\246"},
113 {"è", "\303\250"},
114 {"é", "\303\251"},
115 {"ê" , "\303\252"},
116 {"ì", "\303\254"},
117 {"í", "\303\255"},
118 {"î" , "\303\256"},
120 {"ñ", "\303\261"},
121 {"ò", "\303\262"},
122 {"ó", "\303\263"},
123 {"ô" , "\303\264"},
124 {"õ", "\303\265"},
125 {"ù", "\303\271"},
126 {"ú", "\303\272"},
127 {"û" , "\303\273"},
128 {"ý", "\303\275"}
131 typedef struct _SC_HTMLAltSymbol SC_HTMLAltSymbol;
133 struct _SC_HTMLAltSymbol
139 /* http://www.w3schools.com/html/html_entitiesref.asp */
140 static SC_HTMLAltSymbol alternate_symbol_list[] = {
141 { 96, "\140"}, /* backtick */
142 { 153, "\50\124\115\51"}, /* trademark */
143 { 161, "\302\241"}, /* inverted exclamation mark ¡ */
144 { 162, "\302\242"}, /* cent (currency) ¢ */
145 { 163, "\302\243"}, /* pound (currency) £ */
146 { 164, "\342\202\254"}, /* currency sign ¤ */
147 { 165, "\302\245"}, /* yen (currency) ¥ */
148 { 169, "\302\251"}, /* copyright sign © */
149 { 174, "\302\256"}, /* registered sign ® */
150 { 191, "\302\277"}, /* inverted question mark ¿ */
151 { 338, "\117\105"}, /* capital ligature OE &OElig */
152 { 339, "\157\145"}, /* small ligature OE &oelig */
153 { 352, NULL}, /* capital S w/caron &Scaron */
154 { 353, NULL}, /* small S w/caron &scaron */
155 { 376, NULL}, /* cap Y w/ diaeres &Yuml */
156 { 710, "\136"}, /* circumflex accent &circ */
157 { 732, "\176"}, /* small tilde &tilde */
158 {8194, "\40"}, /* en space &ensp */
159 {8195, "\40"}, /* em space &emsp */
160 {8201, "\40"}, /* thin space &thinsp */
161 {8204, NULL}, /* zero width non-joiner &zwnj */
162 {8205, NULL}, /* zero width joiner &zwj */
163 {8206, NULL}, /* l-t-r mark &lrm */
164 {8207, NULL}, /* r-t-l mark &rlm */
165 {8211, "\55"}, /* en dash &ndash */
166 {8212, "\55\55"}, /* em dash &mdash */
167 {8216, "\47"}, /* l single quot mark &lsquo */
168 {8217, "\47"}, /* r single quot mark &rsquo */
169 {8218, "\54"}, /* single low-9 quot &sbquo */
170 {8220, "\134"}, /* l double quot mark &ldquo */
171 {8221, "\134"}, /* r double quot mark &rdquo */
172 {8222, "\42"}, /* double low-9 quot &bdquo */
173 {8224, NULL}, /* dagger &dagger */
174 {8225, NULL}, /* double dagger &Dagger */
175 {8226, "\52"}, /* bullet &bull */
176 {8230, "\56\56\56"}, /* horizontal ellipsis &hellip */
177 {8240, "\45\157"}, /* per mile &permil */
178 {8249, "\74"}, /* l-pointing angle quot &lsaquo */
179 {8250, "\76"}, /* r-pointing angle quot &rsaquo */
180 {8364, "\105\125\122"}, /* euro &euro */
181 {8482, "\50\124\115\51"} /* trademark &trade */
184 static GHashTable *default_symbol_table;
185 static GHashTable *alternate_symbol_table;
187 static SC_HTMLState sc_html_read_line (SC_HTMLParser *parser);
188 static void sc_html_append_char (SC_HTMLParser *parser,
190 static void sc_html_append_str (SC_HTMLParser *parser,
193 static SC_HTMLState sc_html_parse_tag (SC_HTMLParser *parser);
194 static void sc_html_parse_special (SC_HTMLParser *parser);
195 static void sc_html_get_parenthesis (SC_HTMLParser *parser,
200 SC_HTMLParser *sc_html_parser_new(FILE *fp, CodeConverter *conv)
202 SC_HTMLParser *parser;
204 g_return_val_if_fail(fp != NULL, NULL);
205 g_return_val_if_fail(conv != NULL, NULL);
207 parser = g_new0(SC_HTMLParser, 1);
210 parser->str = g_string_new(NULL);
211 parser->buf = g_string_new(NULL);
212 parser->bufp = parser->buf->str;
213 parser->state = SC_HTML_NORMAL;
215 parser->newline = TRUE;
216 parser->empty_line = TRUE;
217 parser->space = FALSE;
220 #define SYMBOL_TABLE_ADD(table, list) \
224 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
225 g_hash_table_insert(table, list[i].key, list[i].val); \
227 #define SYMBOL_TABLE_REF_ADD(table, list) \
231 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
232 g_hash_table_insert(table, &list[i].key, list[i].val); \
235 if (!default_symbol_table) {
236 default_symbol_table =
237 g_hash_table_new(g_str_hash, g_str_equal);
238 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
239 SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
241 if (!alternate_symbol_table) {
242 alternate_symbol_table =
243 g_hash_table_new(g_int_hash, g_int_equal);
244 SYMBOL_TABLE_REF_ADD(alternate_symbol_table, alternate_symbol_list);
247 #undef SYMBOL_TABLE_ADD
248 #undef SYMBOL_TABLE_REF_ADD
250 parser->symbol_table = default_symbol_table;
251 parser->alt_symbol_table = alternate_symbol_table;
256 void sc_html_parser_destroy(SC_HTMLParser *parser)
258 g_string_free(parser->str, TRUE);
259 g_string_free(parser->buf, TRUE);
260 g_free(parser->href);
264 gchar *sc_html_parse(SC_HTMLParser *parser)
266 parser->state = SC_HTML_NORMAL;
267 g_string_truncate(parser->str, 0);
269 if (*parser->bufp == '\0') {
270 g_string_truncate(parser->buf, 0);
271 parser->bufp = parser->buf->str;
272 if (sc_html_read_line(parser) == SC_HTML_EOF)
276 while (*parser->bufp != '\0') {
277 switch (*parser->bufp) {
280 st = sc_html_parse_tag(parser);
281 /* when we see an href, we need to flush the str
282 * buffer. Then collect all the chars until we
283 * see the end anchor tag
285 if (SC_HTML_HREF_BEG == st || SC_HTML_HREF == st)
286 return parser->str->str;
290 sc_html_parse_special(parser);
296 if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
300 if (!parser->newline)
301 parser->space = TRUE;
308 sc_html_append_char(parser, *parser->bufp++);
312 return parser->str->str;
315 static SC_HTMLState sc_html_read_line(SC_HTMLParser *parser)
317 gchar buf[SC_HTMLBUFSIZE];
318 gchar buf2[SC_HTMLBUFSIZE];
321 if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
322 parser->state = SC_HTML_EOF;
326 if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
327 index = parser->bufp - parser->buf->str;
329 conv_utf8todisp(buf2, sizeof(buf2), buf);
330 g_string_append(parser->buf, buf2);
332 parser->bufp = parser->buf->str + index;
334 return SC_HTML_CONV_FAILED;
337 index = parser->bufp - parser->buf->str;
339 g_string_append(parser->buf, buf2);
341 parser->bufp = parser->buf->str + index;
343 return SC_HTML_NORMAL;
346 static void sc_html_append_char(SC_HTMLParser *parser, gchar ch)
348 GString *str = parser->str;
350 if (!parser->pre && parser->space) {
351 g_string_append_c(str, ' ');
352 parser->space = FALSE;
355 g_string_append_c(str, ch);
357 parser->empty_line = FALSE;
359 parser->newline = TRUE;
360 if (str->len > 1 && str->str[str->len - 2] == '\n')
361 parser->empty_line = TRUE;
363 parser->newline = FALSE;
366 static void sc_html_append_str(SC_HTMLParser *parser, const gchar *str, gint len)
368 GString *string = parser->str;
370 if (!parser->pre && parser->space) {
371 g_string_append_c(string, ' ');
372 parser->space = FALSE;
375 if (len == 0) return;
377 g_string_append(string, str);
380 Xstrndup_a(s, str, len, return);
381 g_string_append(string, s);
384 parser->empty_line = FALSE;
385 if (string->len > 0 && string->str[string->len - 1] == '\n') {
386 parser->newline = TRUE;
387 if (string->len > 1 && string->str[string->len - 2] == '\n')
388 parser->empty_line = TRUE;
390 parser->newline = FALSE;
393 static SC_HTMLTag *sc_html_get_tag(const gchar *str)
399 g_return_val_if_fail(str != NULL, NULL);
401 if (*str == '\0' || *str == '!') return NULL;
403 Xstrdup_a(tmp, str, return NULL);
405 tag = g_new0(SC_HTMLTag, 1);
407 for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++)
412 tag->name = g_strdup(tmp);
417 tag->name = g_strdup(tmp);
420 while (*tmpp != '\0') {
427 while (g_ascii_isspace(*tmpp)) tmpp++;
430 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) &&
433 if (*tmpp != '\0' && *tmpp != '=') {
435 while (g_ascii_isspace(*tmpp)) tmpp++;
440 while (g_ascii_isspace(*tmpp)) tmpp++;
442 if (*tmpp == '"' || *tmpp == '\'') {
447 if ((p = strchr(attr_value, quote)) == NULL) {
448 g_warning("sc_html_get_tag(): syntax error in tag: '%s'\n", str);
453 while (g_ascii_isspace(*tmpp)) tmpp++;
457 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++;
464 g_strchomp(attr_name);
465 g_strdown(attr_name);
466 attr = g_new(SC_HTMLAttr, 1);
467 attr->name = g_strdup(attr_name);
468 attr->value = g_strdup(attr_value);
469 tag->attr = g_list_append(tag->attr, attr);
475 static void sc_html_free_tag(SC_HTMLTag *tag)
480 while (tag->attr != NULL) {
481 SC_HTMLAttr *attr = (SC_HTMLAttr *)tag->attr->data;
485 tag->attr = g_list_remove(tag->attr, tag->attr->data);
490 static SC_HTMLState sc_html_parse_tag(SC_HTMLParser *parser)
492 gchar buf[SC_HTMLBUFSIZE];
495 sc_html_get_parenthesis(parser, buf, sizeof(buf));
497 tag = sc_html_get_tag(buf);
499 parser->state = SC_HTML_UNKNOWN;
500 if (!tag) return SC_HTML_UNKNOWN;
502 if (!strcmp(tag->name, "br")) {
503 parser->space = FALSE;
504 sc_html_append_char(parser, '\n');
505 parser->state = SC_HTML_BR;
506 } else if (!strcmp(tag->name, "a")) {
508 for (cur = tag->attr; cur != NULL; cur = cur->next) {
509 if (cur->data && !strcmp(((SC_HTMLAttr *)cur->data)->name, "href")) {
510 g_free(parser->href);
511 parser->href = g_strdup(((SC_HTMLAttr *)cur->data)->value);
512 parser->state = SC_HTML_HREF_BEG;
516 } else if (!strcmp(tag->name, "/a")) {
517 parser->state = SC_HTML_HREF;
518 } else if (!strcmp(tag->name, "p")) {
519 parser->space = FALSE;
520 if (!parser->empty_line) {
521 parser->space = FALSE;
522 if (!parser->newline) sc_html_append_char(parser, '\n');
523 sc_html_append_char(parser, '\n');
525 parser->state = SC_HTML_PAR;
526 } else if (!strcmp(tag->name, "pre")) {
528 parser->state = SC_HTML_PRE;
529 } else if (!strcmp(tag->name, "/pre")) {
531 parser->state = SC_HTML_NORMAL;
532 } else if (!strcmp(tag->name, "hr")) {
533 if (!parser->newline) {
534 parser->space = FALSE;
535 sc_html_append_char(parser, '\n');
537 sc_html_append_str(parser, HR_STR "\n", -1);
538 parser->state = SC_HTML_HR;
539 } else if (!strcmp(tag->name, "div") ||
540 !strcmp(tag->name, "ul") ||
541 !strcmp(tag->name, "li") ||
542 !strcmp(tag->name, "table") ||
543 !strcmp(tag->name, "tr") ||
544 (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) {
545 if (!parser->newline) {
546 parser->space = FALSE;
547 sc_html_append_char(parser, '\n');
549 parser->state = SC_HTML_NORMAL;
550 } else if (!strcmp(tag->name, "/table") ||
551 (tag->name[0] == '/' &&
552 tag->name[1] == 'h' &&
553 g_ascii_isdigit(tag->name[1]))) {
554 if (!parser->empty_line) {
555 parser->space = FALSE;
556 if (!parser->newline) sc_html_append_char(parser, '\n');
557 sc_html_append_char(parser, '\n');
559 parser->state = SC_HTML_NORMAL;
560 } else if (!strcmp(tag->name, "/div") ||
561 !strcmp(tag->name, "/ul") ||
562 !strcmp(tag->name, "/li")) {
563 if (!parser->newline) {
564 parser->space = FALSE;
565 sc_html_append_char(parser, '\n');
567 parser->state = SC_HTML_NORMAL;
570 sc_html_free_tag(tag);
572 return parser->state;
575 static void sc_html_parse_special(SC_HTMLParser *parser)
577 gchar symbol_name[9];
581 parser->state = SC_HTML_UNKNOWN;
582 g_return_if_fail(*parser->bufp == '&');
585 for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
587 if (n > 7 || parser->bufp[n] != ';') {
588 /* output literal `&' */
589 sc_html_append_char(parser, *parser->bufp++);
590 parser->state = SC_HTML_NORMAL;
593 strncpy2(symbol_name, parser->bufp, n + 2);
594 parser->bufp += n + 1;
596 if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
598 sc_html_append_str(parser, val, -1);
599 parser->state = SC_HTML_NORMAL;
601 } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) {
604 ch = atoi(symbol_name + 2);
605 if ((ch > 0 && ch <= 127) ||
606 (ch >= 128 && ch <= 255 &&
607 parser->conv->charset == C_ISO_8859_1)) {
608 sc_html_append_char(parser, ch);
609 parser->state = SC_HTML_NORMAL;
612 const gchar *symb = g_hash_table_lookup(parser->alt_symbol_table, &ch);
614 sc_html_append_str(parser, symb, -1);
615 parser->state = SC_HTML_NORMAL;
621 sc_html_append_str(parser, symbol_name, -1);
624 static void sc_html_get_parenthesis(SC_HTMLParser *parser, gchar *buf, gint len)
629 g_return_if_fail(*parser->bufp == '<');
631 /* ignore comment / CSS / script stuff */
632 if (!strncmp(parser->bufp, "<!--", 4)) {
634 while ((p = strstr(parser->bufp, "-->")) == NULL)
635 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
636 parser->bufp = p + 3;
639 if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) {
641 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
642 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
643 parser->bufp = p + 8;
646 if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) {
648 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
649 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
650 parser->bufp = p + 9;
655 while ((p = strchr(parser->bufp, '>')) == NULL)
656 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
658 strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
660 parser->bufp = p + 1;