/*
 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
 * Copyright (C) 1999-2003 Hiroyuki Yamamoto
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
29 #define HTMLBUFSIZE 8192
30 #define HR_STR "------------------------------------------------"
32 typedef struct _HTMLSymbol HTMLSymbol;
/* Entity replacements shared by every charset: html_parser_new() loads this
 * list into the default, EUC-JP and Latin-1 symbol tables alike. */
40 static HTMLSymbol symbol_list[] = {
/* Entity replacements that html_parser_new() loads only into the default
 * symbol table. */
51 static HTMLSymbol ascii_symbol_list[] = {
114 static HTMLSymbol eucjp_symbol_list[] = {
116 {"¢" , "\xa1\xf1"},
117 {"£" , "\xa1\xf2"},
118 {"¥" , "\xa1\xef"},
120 {"§" , "\xa1\xf8"},
121 {"¨" , "\xa1\xaf"},
126 {"°" , "\xa1\xeb"},
127 {"±", "\xa1\xde"},
131 {"µ" , "\xa6\xcc"},
132 {"¶" , "\xa2\xf9"},
133 {"·", "\xa1\xa6"},
146 {"Ä" , "A\xa1\xaf"},
147 {"Å" , "A\xa1\xeb"},
152 {"Ë" , "E\xa1\xaf"},
156 {"Ï" , "I\xa1\xaf"},
163 {"Ö" , "O\xa1\xaf"},
164 {"×" , "\xa1\xdf"},
168 {"Ü" , "U\xa1\xaf"},
175 {"ä" , "a\xa1\xaf"},
176 {"å" , "a\xa1\xeb"},
181 {"ë" , "e\xa1\xaf"},
185 {"ï" , "i\xa1\xaf"},
187 {"ð" , "\xa2\xdf"},
193 {"ö" , "o\xa1\xaf"},
194 {"÷", "\xa1\xe0"},
198 {"ü" , "u\xa1\xaf"},
200 {"ÿ" , "y\xa1\xaf"},
203 static HTMLSymbol latin_symbol_list[] = {
204 {"¡" , "\xa1"},
206 {"£" , "\xa3"},
207 {"¤", "\xa4"},
209 {"¦", "\xa6"},
214 {"«" , "\xab"},
221 {"±", "\xb1"},
224 {"´" , "\xb4"},
225 {"µ" , "\xb5"},
227 {"·", "\xb7"},
228 {"¸" , "\xb8"},
231 {"»" , "\xbb"},
232 {"¼", "\xbc"},
233 {"½", "\xbd"},
234 {"¾", "\xbe"},
235 {"¿", "\xbf"},
237 {"À", "\xc0"},
238 {"Á", "\xc1"},
239 {"Â" , "\xc2"},
240 {"Ã", "\xc3"},
242 {"Å" , "\xc5"},
243 {"Æ" , "\xc6"},
244 {"Ç", "\xc7"},
245 {"È", "\xc8"},
246 {"É", "\xc9"},
247 {"Ê" , "\xca"},
249 {"Ì", "\xcc"},
250 {"Í", "\xcd"},
251 {"Î" , "\xce"},
255 {"Ñ", "\xd1"},
256 {"Ò", "\xd2"},
257 {"Ó", "\xd3"},
258 {"Ô" , "\xd4"},
259 {"Õ", "\xd5"},
261 {"×" , "\xd7"},
262 {"Ø", "\xd8"},
263 {"Ù", "\xd9"},
264 {"Ú", "\xda"},
265 {"Û" , "\xdb"},
267 {"Ý", "\xdd"},
268 {"Þ" , "\xde"},
269 {"ß" , "\xdf"},
271 {"à", "\xe0"},
272 {"á", "\xe1"},
273 {"â" , "\xe2"},
274 {"ã", "\xe3"},
276 {"å" , "\xe5"},
277 {"æ" , "\xe6"},
278 {"ç", "\xe7"},
279 {"è", "\xe8"},
280 {"é", "\xe9"},
281 {"ê" , "\xea"},
283 {"ì", "\xec"},
284 {"í", "\xed"},
285 {"î" , "\xee"},
289 {"ñ", "\xf1"},
290 {"ò", "\xf2"},
291 {"ó", "\xf3"},
292 {"ô" , "\xf4"},
293 {"õ", "\xf5"},
295 {"÷", "\xf7"},
296 {"ø", "\xf8"},
297 {"ù", "\xf9"},
298 {"ú", "\xfa"},
299 {"û" , "\xfb"},
301 {"ý", "\xfd"},
302 {"þ" , "\xfe"},
306 static GHashTable *default_symbol_table;
307 static GHashTable *eucjp_symbol_table;
308 static GHashTable *latin_symbol_table;
310 static HTMLState html_read_line (HTMLParser *parser);
311 static void html_append_char (HTMLParser *parser,
313 static void html_append_str (HTMLParser *parser,
316 static HTMLState html_parse_tag (HTMLParser *parser);
317 static void html_parse_special (HTMLParser *parser);
318 static void html_get_parenthesis (HTMLParser *parser,
323 HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
327 g_return_val_if_fail(fp != NULL, NULL);
328 g_return_val_if_fail(conv != NULL, NULL);
330 parser = g_new0(HTMLParser, 1);
333 parser->str = g_string_new(NULL);
334 parser->buf = g_string_new(NULL);
335 parser->bufp = parser->buf->str;
336 parser->state = HTML_NORMAL;
338 parser->newline = TRUE;
339 parser->empty_line = TRUE;
340 parser->space = FALSE;
343 #define SYMBOL_TABLE_ADD(table, list) \
347 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
348 g_hash_table_insert(table, list[i].key, list[i].val); \
351 if (!default_symbol_table) {
352 default_symbol_table =
353 g_hash_table_new(g_str_hash, g_str_equal);
354 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
355 SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
357 if (!eucjp_symbol_table) {
359 g_hash_table_new(g_str_hash, g_str_equal);
360 SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list);
361 SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list);
363 if (!latin_symbol_table) {
365 g_hash_table_new(g_str_hash, g_str_equal);
366 SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list);
367 SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list);
370 #undef SYMBOL_TABLE_ADD
372 if (conv->charset == C_ISO_8859_1)
373 parser->symbol_table = latin_symbol_table;
374 else if ((conv->charset == C_ISO_2022_JP ||
375 conv->charset == C_ISO_2022_JP_2 ||
376 conv->charset == C_EUC_JP ||
377 conv->charset == C_SHIFT_JIS) &&
378 conv_get_current_charset() == C_EUC_JP)
379 parser->symbol_table = eucjp_symbol_table;
381 parser->symbol_table = default_symbol_table;
386 void html_parser_destroy(HTMLParser *parser)
388 g_string_free(parser->str, TRUE);
389 g_string_free(parser->buf, TRUE);
390 g_free(parser->href);
394 gchar *html_parse(HTMLParser *parser)
396 parser->state = HTML_NORMAL;
397 g_string_truncate(parser->str, 0);
399 if (*parser->bufp == '\0') {
400 g_string_truncate(parser->buf, 0);
401 parser->bufp = parser->buf->str;
402 if (html_read_line(parser) == HTML_EOF)
406 while (*parser->bufp != '\0') {
407 switch (*parser->bufp) {
410 st = html_parse_tag(parser);
411 /* when we see an href, we need to flush the str
412 * buffer. Then collect all the chars until we
413 * see the end anchor tag
415 if (HTML_HREF_BEG == st || HTML_HREF == st)
416 return parser->str->str;
419 html_parse_special(parser);
425 if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
429 if (!parser->newline)
430 parser->space = TRUE;
437 html_append_char(parser, *parser->bufp++);
441 return parser->str->str;
444 static HTMLState html_read_line(HTMLParser *parser)
446 gchar buf[HTMLBUFSIZE];
447 gchar buf2[HTMLBUFSIZE];
450 if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
451 parser->state = HTML_EOF;
455 if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
456 index = parser->bufp - parser->buf->str;
458 conv_localetodisp(buf2, sizeof(buf2), buf);
459 g_string_append(parser->buf, buf2);
461 parser->bufp = parser->buf->str + index;
463 return HTML_CONV_FAILED;
466 index = parser->bufp - parser->buf->str;
468 g_string_append(parser->buf, buf2);
470 parser->bufp = parser->buf->str + index;
475 static void html_append_char(HTMLParser *parser, gchar ch)
477 GString *str = parser->str;
479 if (!parser->pre && parser->space) {
480 g_string_append_c(str, ' ');
481 parser->space = FALSE;
484 g_string_append_c(str, ch);
486 parser->empty_line = FALSE;
488 parser->newline = TRUE;
489 if (str->len > 1 && str->str[str->len - 2] == '\n')
490 parser->empty_line = TRUE;
492 parser->newline = FALSE;
495 static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
497 GString *string = parser->str;
499 if (!parser->pre && parser->space) {
500 g_string_append_c(string, ' ');
501 parser->space = FALSE;
504 if (len == 0) return;
506 g_string_append(string, str);
509 Xstrndup_a(s, str, len, return);
510 g_string_append(string, s);
513 parser->empty_line = FALSE;
514 if (string->len > 0 && string->str[string->len - 1] == '\n') {
515 parser->newline = TRUE;
516 if (string->len > 1 && string->str[string->len - 2] == '\n')
517 parser->empty_line = TRUE;
519 parser->newline = FALSE;
522 static HTMLTag *html_get_tag(const gchar *str)
528 g_return_val_if_fail(str != NULL, NULL);
530 if (*str == '\0' || *str == '!') return NULL;
532 Xstrdup_a(tmp, str, return NULL);
534 tag = g_new0(HTMLTag, 1);
536 for (tmpp = tmp; *tmpp != '\0' && !isspace(*tmpp); tmpp++)
541 tag->name = g_strdup(tmp);
546 tag->name = g_strdup(tmp);
549 while (*tmpp != '\0') {
556 while (isspace(*tmpp)) tmpp++;
559 while (*tmpp != '\0' && !isspace(*tmpp) && *tmpp != '=') tmpp++;
560 if (*tmpp != '\0' && *tmpp != '=') {
562 while (isspace(*tmpp)) tmpp++;
567 while (isspace(*tmpp)) tmpp++;
569 if (*tmpp == '"' || *tmpp == '\'') {
574 if ((p = strchr(attr_value, quote)) == NULL) {
575 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
580 while (isspace(*tmpp)) tmpp++;
584 while (*tmpp != '\0' && !isspace(*tmpp)) tmpp++;
591 g_strchomp(attr_name);
592 g_strdown(attr_name);
593 attr = g_new(HTMLAttr, 1);
594 attr->name = g_strdup(attr_name);
595 attr->value = g_strdup(attr_value);
596 tag->attr = g_list_append(tag->attr, attr);
602 static void html_free_tag(HTMLTag *tag)
607 while (tag->attr != NULL) {
608 HTMLAttr *attr = (HTMLAttr *)tag->attr->data;
612 tag->attr = g_list_remove(tag->attr, tag->attr->data);
617 static HTMLState html_parse_tag(HTMLParser *parser)
619 gchar buf[HTMLBUFSIZE];
622 html_get_parenthesis(parser, buf, sizeof(buf));
624 tag = html_get_tag(buf);
626 parser->state = HTML_UNKNOWN;
627 if (!tag) return HTML_UNKNOWN;
629 if (!strcmp(tag->name, "br")) {
630 parser->space = FALSE;
631 html_append_char(parser, '\n');
632 parser->state = HTML_BR;
633 } else if (!strcmp(tag->name, "a")) {
634 if (tag->attr && tag->attr->data &&
635 !strcmp(((HTMLAttr *)tag->attr->data)->name, "href")) {
636 g_free(parser->href);
638 g_strdup(((HTMLAttr *)tag->attr->data)->value);
639 parser->state = HTML_HREF_BEG;
641 } else if (!strcmp(tag->name, "/a")) {
642 parser->state = HTML_HREF;
643 } else if (!strcmp(tag->name, "p")) {
644 parser->space = FALSE;
645 if (!parser->empty_line) {
646 parser->space = FALSE;
647 if (!parser->newline) html_append_char(parser, '\n');
648 html_append_char(parser, '\n');
650 parser->state = HTML_PAR;
651 } else if (!strcmp(tag->name, "pre")) {
653 parser->state = HTML_PRE;
654 } else if (!strcmp(tag->name, "/pre")) {
656 parser->state = HTML_NORMAL;
657 } else if (!strcmp(tag->name, "hr")) {
658 if (!parser->newline) {
659 parser->space = FALSE;
660 html_append_char(parser, '\n');
662 html_append_str(parser, HR_STR "\n", -1);
663 parser->state = HTML_HR;
664 } else if (!strcmp(tag->name, "div") ||
665 !strcmp(tag->name, "ul") ||
666 !strcmp(tag->name, "li") ||
667 !strcmp(tag->name, "table") ||
668 !strcmp(tag->name, "tr") ||
669 (tag->name[0] == 'h' && isdigit((guchar)tag->name[1]))) {
670 if (!parser->newline) {
671 parser->space = FALSE;
672 html_append_char(parser, '\n');
674 parser->state = HTML_NORMAL;
675 } else if (!strcmp(tag->name, "/table") ||
676 (tag->name[0] == '/' &&
677 tag->name[1] == 'h' &&
678 isdigit((guchar)tag->name[1]))) {
679 if (!parser->empty_line) {
680 parser->space = FALSE;
681 if (!parser->newline) html_append_char(parser, '\n');
682 html_append_char(parser, '\n');
684 parser->state = HTML_NORMAL;
685 } else if (!strcmp(tag->name, "/div") ||
686 !strcmp(tag->name, "/ul") ||
687 !strcmp(tag->name, "/li")) {
688 if (!parser->newline) {
689 parser->space = FALSE;
690 html_append_char(parser, '\n');
692 parser->state = HTML_NORMAL;
697 return parser->state;
/*
 * html_parse_special: handle the character reference starting at the `&'
 * that parser->bufp points to.
 *
 * The reference is first looked up by name in parser->symbol_table; failing
 * that, numeric references ("&#NNN;") are handled: codes 1-127 (and 128-255
 * when the source charset is ISO-8859-1) are appended as a single byte, and
 * a fixed set of higher codes is mapped to replacement strings through the
 * switch below.  The last statement appends the reference text itself.
 *
 * NOTE(review): the replacement strings assigned for most case labels are
 * not visible in this listing; only those for 8364 and 8482 are.
 */
700 static void html_parse_special(HTMLParser *parser)
/* holds "&name;" plus terminator; the n > 7 test below keeps n + 2 <= 9 */
702 gchar symbol_name[9];
706 parser->state = HTML_UNKNOWN;
707 g_return_if_fail(*parser->bufp == '&');
/* n becomes the offset of the first ';', or of the end of the buffer */
710 for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
/* no ';' found, or it is more than 7 bytes after the '&' */
712 if (n > 7 || parser->bufp[n] != ';') {
713 /* output literal `&' */
714 html_append_char(parser, *parser->bufp++);
715 parser->state = HTML_NORMAL;
/* presumably copies the n + 1 bytes "&name;" and NUL-terminates them;
 * confirm against strncpy2() */
718 strncpy2(symbol_name, parser->bufp, n + 2);
/* skip the whole reference including the ';' */
719 parser->bufp += n + 1;
/* named (or otherwise listed) reference: the table key is the full text */
721 if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
723 html_append_str(parser, val, -1);
724 parser->state = HTML_NORMAL;
/* numeric reference: "&#" followed by at least one decimal digit */
726 } else if (symbol_name[1] == '#' && isdigit((guchar)symbol_name[2])) {
729 ch = atoi(symbol_name + 2);
/* ASCII is always emitted as is; 128-255 only for ISO-8859-1 sources */
730 if ((ch > 0 && ch <= 127) ||
731 (ch >= 128 && ch <= 255 &&
732 parser->conv->charset == C_ISO_8859_1)) {
733 html_append_char(parser, ch);
734 parser->state = HTML_NORMAL;
739 /* http://www.w3schools.com/html/html_entitiesref.asp */
740 case 338: /* capital ligature OE (&OElig;) */
742 case 339: /* small ligature oe (&oelig;) */
744 case 352: /* capital S with caron (&Scaron;) */
745 case 353: /* small s with caron (&scaron;) */
746 case 376: /* capital Y with diaeresis (&Yuml;) */
748 case 710: /* circumflex accent (&circ;) */
750 case 732: /* small tilde (&tilde;) */
752 case 8194: /* en space (&ensp;) */
753 case 8195: /* em space (&emsp;) */
754 case 8201: /* thin space (&thinsp;) */
756 case 8204: /* zero width non-joiner (&zwnj;) */
757 case 8205: /* zero width joiner (&zwj;) */
758 case 8206: /* left-to-right mark (&lrm;) */
759 case 8207: /* right-to-left mark (&rlm;) */
761 case 8211: /* en dash (&ndash;) */
763 case 8212: /* em dash (&mdash;) */
765 case 8216: /* left single quotation mark (&lsquo;) */
767 case 8217: /* right single quotation mark (&rsquo;) */
769 case 8218: /* single low-9 quotation mark (&sbquo;) */
771 case 8220: /* left double quotation mark (&ldquo;) */
773 case 8221: /* right double quotation mark (&rdquo;) */
775 case 8222: /* double low-9 quotation mark (&bdquo;) */
777 case 8224: /* dagger (&dagger;) */
778 case 8225: /* double dagger (&Dagger;) */
780 case 8230: /* horizontal ellipsis (&hellip;) */
782 case 8240: /* per mille sign (&permil;) */
784 case 8249: /* single left-pointing angle quotation mark (&lsaquo;) */
786 case 8250: /* single right-pointing angle quotation mark (&rsaquo;) */
788 case 8364: /* euro sign (&euro;) */
789 symb = "&euro"; break;
790 case 8482: /* trade mark sign (&trade;) */
791 symb = "(TM)"; break;
795 html_append_str(parser, symb, -1);
796 parser->state = HTML_NORMAL;
/* appends the reference text unchanged; parser->state was set to
 * HTML_UNKNOWN at the top */
802 html_append_str(parser, symbol_name, -1);
805 static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
810 g_return_if_fail(*parser->bufp == '<');
812 /* ignore comment / CSS / script stuff */
813 if (!strncmp(parser->bufp, "<!--", 4)) {
815 while ((p = strstr(parser->bufp, "-->")) == NULL)
816 if (html_read_line(parser) == HTML_EOF) return;
817 parser->bufp = p + 3;
820 if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) {
822 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
823 if (html_read_line(parser) == HTML_EOF) return;
824 parser->bufp = p + 8;
827 if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) {
829 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
830 if (html_read_line(parser) == HTML_EOF) return;
831 parser->bufp = p + 9;
836 while ((p = strchr(parser->bufp, '>')) == NULL)
837 if (html_read_line(parser) == HTML_EOF) return;
839 strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
841 parser->bufp = p + 1;