2 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3 * Copyright (C) 1999-2003 Hiroyuki Yamamoto
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
/* Size in bytes of the fixed line and tag buffers declared in
 * html_read_line() and html_parse_tag(). */
29 #define HTMLBUFSIZE 8192
/* Text appended, followed by a newline, when an "hr" tag is parsed. */
30 #define HR_STR "------------------------------------------------"
/* Key/value pair type of the symbol lists below; html_parser_new() reads the
 * .key and .val members of each element. */
32 typedef struct _HTMLSymbol HTMLSymbol;
/* Symbols shared by all tables: html_parser_new() inserts this list into the
 * default, EUC-JP and Latin symbol tables alike. */
40 static HTMLSymbol symbol_list[] = {
/* html_parser_new() inserts this list into default_symbol_table only. */
51 static HTMLSymbol ascii_symbol_list[] = {
/*
 * Symbol replacements inserted into eucjp_symbol_table by html_parser_new().
 * That table is selected when the message charset is one of the Japanese
 * charsets tested there and conv_get_current_charset() returns C_EUC_JP.
 *
 * Values are raw byte sequences written as hex escapes; accented letters are
 * written as the base ASCII letter followed by a two-byte sequence
 * (presumably an EUC-JP combining-style mark -- TODO confirm).
 *
 * NOTE(review): the keys visible here are literal characters, but
 * html_parse_special() looks up a string copied from the input starting at
 * '&' and ending at ';'. Keys in this form cannot match such a lookup --
 * verify that the keys were not entity-decoded by an export step and should
 * read as "&name;" strings.
 */
114 static HTMLSymbol eucjp_symbol_list[] = {
116 {"¢" , "\xa1\xf1"},
117 {"£" , "\xa1\xf2"},
118 {"¥" , "\xa1\xef"},
120 {"§" , "\xa1\xf8"},
121 {"¨" , "\xa1\xaf"},
126 {"°" , "\xa1\xeb"},
127 {"±", "\xa1\xde"},
131 {"µ" , "\xa6\xcc"},
132 {"¶" , "\xa2\xf9"},
133 {"·", "\xa1\xa6"},
/* Accented capitals: base letter plus the same trailing bytes used for the
 * stand-alone mark above. */
146 {"Ä" , "A\xa1\xaf"},
147 {"Å" , "A\xa1\xeb"},
152 {"Ë" , "E\xa1\xaf"},
156 {"Ï" , "I\xa1\xaf"},
163 {"Ö" , "O\xa1\xaf"},
164 {"×" , "\xa1\xdf"},
168 {"Ü" , "U\xa1\xaf"},
/* Accented lower-case letters, same scheme. */
175 {"ä" , "a\xa1\xaf"},
176 {"å" , "a\xa1\xeb"},
181 {"ë" , "e\xa1\xaf"},
185 {"ï" , "i\xa1\xaf"},
187 {"ð" , "\xa2\xdf"},
193 {"ö" , "o\xa1\xaf"},
194 {"÷", "\xa1\xe0"},
198 {"ü" , "u\xa1\xaf"},
200 {"ÿ" , "y\xa1\xaf"},
/*
 * Symbol replacements inserted into latin_symbol_table by html_parser_new();
 * that table is selected when conv->charset == C_ISO_8859_1.
 *
 * Every visible value is a single byte written as a hex escape in the range
 * 0xa1..0xfe.
 *
 * NOTE(review): the keys visible here are literal characters, but
 * html_parse_special() looks up a string copied from the input starting at
 * '&' and ending at ';'. Keys in this form cannot match such a lookup --
 * verify that the keys were not entity-decoded by an export step and should
 * read as "&name;" strings.
 */
203 static HTMLSymbol latin_symbol_list[] = {
/* Punctuation, currency and fraction symbols (0xa1..0xbf). */
204 {"¡" , "\xa1"},
206 {"£" , "\xa3"},
207 {"¤", "\xa4"},
209 {"¦", "\xa6"},
214 {"«" , "\xab"},
221 {"±", "\xb1"},
224 {"´" , "\xb4"},
225 {"µ" , "\xb5"},
227 {"·", "\xb7"},
228 {"¸" , "\xb8"},
231 {"»" , "\xbb"},
232 {"¼", "\xbc"},
233 {"½", "\xbd"},
234 {"¾", "\xbe"},
235 {"¿", "\xbf"},
/* Upper-case accented letters and related symbols (0xc0..0xdf). */
237 {"À", "\xc0"},
238 {"Á", "\xc1"},
239 {"Â" , "\xc2"},
240 {"Ã", "\xc3"},
242 {"Å" , "\xc5"},
243 {"Æ" , "\xc6"},
244 {"Ç", "\xc7"},
245 {"È", "\xc8"},
246 {"É", "\xc9"},
247 {"Ê" , "\xca"},
249 {"Ì", "\xcc"},
250 {"Í", "\xcd"},
251 {"Î" , "\xce"},
255 {"Ñ", "\xd1"},
256 {"Ò", "\xd2"},
257 {"Ó", "\xd3"},
258 {"Ô" , "\xd4"},
259 {"Õ", "\xd5"},
261 {"×" , "\xd7"},
262 {"Ø", "\xd8"},
263 {"Ù", "\xd9"},
264 {"Ú", "\xda"},
265 {"Û" , "\xdb"},
267 {"Ý", "\xdd"},
268 {"Þ" , "\xde"},
269 {"ß" , "\xdf"},
/* Lower-case accented letters and related symbols (0xe0..0xfe). */
271 {"à", "\xe0"},
272 {"á", "\xe1"},
273 {"â" , "\xe2"},
274 {"ã", "\xe3"},
276 {"å" , "\xe5"},
277 {"æ" , "\xe6"},
278 {"ç", "\xe7"},
279 {"è", "\xe8"},
280 {"é", "\xe9"},
281 {"ê" , "\xea"},
283 {"ì", "\xec"},
284 {"í", "\xed"},
285 {"î" , "\xee"},
289 {"ñ", "\xf1"},
290 {"ò", "\xf2"},
291 {"ó", "\xf3"},
292 {"ô" , "\xf4"},
293 {"õ", "\xf5"},
295 {"÷", "\xf7"},
296 {"ø", "\xf8"},
297 {"ù", "\xf9"},
298 {"ú", "\xfa"},
299 {"û" , "\xfb"},
301 {"ý", "\xfd"},
302 {"þ" , "\xfe"},
/*
 * File-scope symbol tables. Each one is created by html_parser_new() the
 * first time it finds the pointer NULL and is then reused by later parsers;
 * no code in view destroys them.
 */
306 static GHashTable *default_symbol_table;
307 static GHashTable *eucjp_symbol_table;
308 static GHashTable *latin_symbol_table;
/* Forward declarations of the file-local helpers defined further down. */
310 static HTMLState html_read_line (HTMLParser *parser);
311 static void html_append_char (HTMLParser *parser,
313 static void html_append_str (HTMLParser *parser,
316 static HTMLState html_parse_tag (HTMLParser *parser);
317 static void html_parse_special (HTMLParser *parser);
318 static void html_get_parenthesis (HTMLParser *parser,
/* Case-insensitive key helpers for GHashTable. */
323 static gint g_str_case_equal (gconstpointer v,
325 static guint g_str_case_hash (gconstpointer key);
/*
 * Create a parser that reads HTML from fp and converts it with conv.
 *
 * Returns NULL (via g_return_val_if_fail) when fp or conv is NULL; otherwise
 * a zero-initialised HTMLParser with fresh output/input GStrings. The three
 * file-scope symbol tables are built on first use, and one of them is chosen
 * for this parser according to conv->charset.
 */
328 HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
332 g_return_val_if_fail(fp != NULL, NULL);
333 g_return_val_if_fail(conv != NULL, NULL);
/* g_new0 zero-fills the struct, so fields not assigned below start at 0/NULL. */
335 parser = g_new0(HTMLParser, 1);
/* str collects the text returned by html_parse(); buf holds unread input. */
338 parser->str = g_string_new(NULL);
339 parser->buf = g_string_new(NULL);
/* bufp is the read cursor into buf; it starts at the (empty) beginning. */
340 parser->bufp = parser->buf->str;
341 parser->state = HTML_NORMAL;
/* Start as if positioned at the beginning of an empty line. */
343 parser->newline = TRUE;
344 parser->empty_line = TRUE;
345 parser->space = FALSE;
/*
 * Helper macro: insert every element of a static HTMLSymbol array into a
 * hash table. The element count comes from sizeof, so `list' must be an
 * actual array. Keys and values are inserted by pointer, not copied. The
 * tables use g_str_hash/g_str_equal, so lookups are case-sensitive.
 */
348 #define SYMBOL_TABLE_ADD(table, list) \
352 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
353 g_hash_table_insert(table, list[i].key, list[i].val); \
356 if (!default_symbol_table) {
357 default_symbol_table =
358 g_hash_table_new(g_str_hash, g_str_equal);
359 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
360 SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
/* EUC-JP table: common symbols plus the EUC-JP specific list. */
362 if (!eucjp_symbol_table) {
364 g_hash_table_new(g_str_hash, g_str_equal);
365 SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list);
366 SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list);
/* Latin table: common symbols plus the single-byte Latin list. */
368 if (!latin_symbol_table) {
370 g_hash_table_new(g_str_hash, g_str_equal);
371 SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list);
372 SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list);
375 #undef SYMBOL_TABLE_ADD
/*
 * Table selection: ISO-8859-1 input uses the Latin table; Japanese input
 * uses the EUC-JP table only when the current charset reported by
 * conv_get_current_charset() is also EUC-JP; anything else uses the default.
 */
377 if (conv->charset == C_ISO_8859_1)
378 parser->symbol_table = latin_symbol_table;
379 else if ((conv->charset == C_ISO_2022_JP ||
380 conv->charset == C_ISO_2022_JP_2 ||
381 conv->charset == C_EUC_JP ||
382 conv->charset == C_SHIFT_JIS) &&
383 conv_get_current_charset() == C_EUC_JP)
384 parser->symbol_table = eucjp_symbol_table;
386 parser->symbol_table = default_symbol_table;
/*
 * Release the resources owned by a parser: both GStrings together with their
 * character data (second argument TRUE) and the saved href string.
 * The shared symbol tables are not touched here.
 * NOTE(review): the release of the HTMLParser struct itself is not visible
 * in this listing -- confirm it is freed.
 */
391 void html_parser_destroy(HTMLParser *parser)
393 g_string_free(parser->str, TRUE);
394 g_string_free(parser->buf, TRUE);
395 g_free(parser->href);
/*
 * Produce the next chunk of plain text from the HTML input.
 *
 * The returned pointer is parser->str->str: it is owned by the parser and is
 * overwritten by the next call, which truncates parser->str first.
 * NOTE(review): the `case' labels of the switch and the value returned on
 * HTML_EOF are not visible in this listing; comments below are limited to
 * the statements that are shown.
 */
399 gchar *html_parse(HTMLParser *parser)
/* Every call starts from a clean state and an empty output string. */
401 parser->state = HTML_NORMAL;
402 g_string_truncate(parser->str, 0);
/* Input buffer fully consumed: discard it and read another line. */
404 if (*parser->bufp == '\0') {
405 g_string_truncate(parser->buf, 0);
406 parser->bufp = parser->buf->str;
407 if (html_read_line(parser) == HTML_EOF)
411 while (*parser->bufp != '\0') {
412 switch (*parser->bufp) {
/* A tag is parsed only when no text has been collected yet in this call;
 * the return below hands already collected text back to the caller
 * (presumably the `else' branch -- TODO confirm). */
414 if (parser->str->len == 0)
415 html_parse_tag(parser);
417 return parser->str->str;
/* Character reference handling (presumably the '&' case). */
420 html_parse_special(parser);
/* CR LF pair detected at the cursor. */
426 if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
/* Not at the start of a line: remember a pending space instead of emitting
 * one immediately. */
430 if (!parser->newline)
431 parser->space = TRUE;
/* Copy the current character to the output and advance the cursor. */
438 html_append_char(parser, *parser->bufp++);
442 return parser->str->str;
/*
 * Read one line (at most HTMLBUFSIZE - 1 bytes) from parser->fp, convert it
 * and append it to parser->buf.
 *
 * Sets parser->state to HTML_EOF when fgets() returns NULL. Returns
 * HTML_CONV_FAILED when conv_convert() reports an error, after appending the
 * output of conv_localetodisp() instead. The success return value is not
 * visible in this listing.
 */
445 static HTMLState html_read_line(HTMLParser *parser)
447 gchar buf[HTMLBUFSIZE];
448 gchar buf2[HTMLBUFSIZE];
451 if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
452 parser->state = HTML_EOF;
456 if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
/* Save the cursor as an offset: g_string_append() may move buf->str, so a
 * raw pointer into it would not survive the append. */
457 index = parser->bufp - parser->buf->str;
/* Fallback path: use conv_localetodisp() on the raw line instead. */
459 conv_localetodisp(buf2, sizeof(buf2), buf);
460 g_string_append(parser->buf, buf2);
462 parser->bufp = parser->buf->str + index;
464 return HTML_CONV_FAILED;
/* Normal path: same offset save / append / cursor rebuild sequence. */
467 index = parser->bufp - parser->buf->str;
469 g_string_append(parser->buf, buf2);
471 parser->bufp = parser->buf->str + index;
/*
 * Append one character to the output string and update the line tracking
 * flags (newline, empty_line).
 *
 * A pending space (parser->space) is emitted first unless parser->pre is set.
 * NOTE(review): the condition separating the newline/non-newline branches is
 * not visible in this listing (presumably ch == '\n').
 */
476 static void html_append_char(HTMLParser *parser, gchar ch)
478 GString *str = parser->str;
/* Flush the deferred space before the new character. */
480 if (!parser->pre && parser->space) {
481 g_string_append_c(str, ' ');
482 parser->space = FALSE;
485 g_string_append_c(str, ch);
487 parser->empty_line = FALSE;
489 parser->newline = TRUE;
/* The character before the one just appended is also '\n': the output now
 * ends with an empty line. */
490 if (str->len > 1 && str->str[str->len - 2] == '\n')
491 parser->empty_line = TRUE;
493 parser->newline = FALSE;
/*
 * Append a string to the output and update the line tracking flags from the
 * last one or two characters of the output.
 *
 * A pending space is flushed first (unless parser->pre is set), even when
 * len is 0 and nothing else is appended. Two append paths are visible: the
 * whole of str, or a temporary copy limited to len characters made with
 * Xstrndup_a. The condition choosing between them is not visible in this
 * listing; the callers in this file pass -1.
 */
496 static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
498 GString *string = parser->str;
500 if (!parser->pre && parser->space) {
501 g_string_append_c(string, ' ');
502 parser->space = FALSE;
505 if (len == 0) return;
507 g_string_append(string, str);
/* Xstrndup_a returns from this function if the temporary copy fails. */
510 Xstrndup_a(s, str, len, return);
511 g_string_append(string, s);
514 parser->empty_line = FALSE;
/* Output ends with '\n': we are at the start of a line; two consecutive
 * '\n' characters mean the output ends with an empty line. */
515 if (string->len > 0 && string->str[string->len - 1] == '\n') {
516 parser->newline = TRUE;
517 if (string->len > 1 && string->str[string->len - 2] == '\n')
518 parser->empty_line = TRUE;
520 parser->newline = FALSE;
/*
 * Parse the text of a tag (as delivered by html_get_parenthesis()) into a
 * newly allocated HTMLTag with a name and a list of HTMLAttr name/value
 * pairs. All strings stored in the tag are g_strdup() copies.
 *
 * Returns NULL for a NULL or empty string, for text starting with '!', or
 * when the temporary copy made by Xstrdup_a fails. Handling after the
 * g_warning() calls and the final return are not visible in this listing.
 *
 * NOTE(review): isspace() receives plain gchar values; bytes >= 0x80 are
 * negative where char is signed, which is outside the domain of the
 * <ctype.h> functions -- consider casting to guchar.
 */
523 static HTMLTag *html_get_tag(const gchar *str)
529 g_return_val_if_fail(str != NULL, NULL);
531 if (*str == '\0' || *str == '!') return NULL;
/* Work on a temporary copy so the input string is left unmodified. */
533 Xstrdup_a(tmp, str, return NULL);
535 tag = g_new0(HTMLTag, 1);
537 for (tmpp = tmp; *tmpp != '\0' && !isspace(*tmpp); tmpp++)
542 tag->name = g_strdup(tmp);
547 tag->name = g_strdup(tmp);
/* Attribute loop: runs until the end of the copied tag text. */
550 while (*tmpp != '\0') {
557 while (isspace(*tmpp)) tmpp++;
/* An attribute without '=' is reported as a syntax error. */
559 if ((p = strchr(attr_name, '=')) == NULL) {
560 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
565 while (isspace(*tmpp)) tmpp++;
568 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
/* Quoted value: either quote character is accepted, and the matching
 * closing quote must be present. */
570 } else if (*tmpp == '"' || *tmpp == '\'') {
575 if ((p = strchr(attr_value, quote)) == NULL) {
576 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
581 while (isspace(*tmpp)) tmpp++;
/* Unquoted value: runs up to the next whitespace character. */
585 while (*tmpp != '\0' && !isspace(*tmpp)) tmpp++;
/* Normalise the attribute name: strip trailing whitespace, lower-case it.
 * NOTE(review): g_strdown() is deprecated in GLib 2.x -- confirm the GLib
 * version this file targets. */
590 g_strchomp(attr_name);
591 g_strdown(attr_name);
592 attr = g_new(HTMLAttr, 1);
593 attr->name = g_strdup(attr_name);
594 attr->value = g_strdup(attr_value);
595 tag->attr = g_list_append(tag->attr, attr);
/*
 * Release a tag created by html_get_tag().
 * The visible lines walk the attribute list from its head, removing one
 * element per iteration until the list is empty. The statements that free
 * each HTMLAttr and the tag itself are not visible in this listing.
 */
601 static void html_free_tag(HTMLTag *tag)
606 while (tag->attr != NULL) {
607 HTMLAttr *attr = (HTMLAttr *)tag->attr->data;
/* g_list_remove() returns the new head, which becomes the next element. */
611 tag->attr = g_list_remove(tag->attr, tag->attr->data);
/*
 * Consume one tag at the read cursor and translate it into plain-text
 * layout (line breaks, horizontal rule) and parser state.
 *
 * The tag text is fetched with html_get_parenthesis() and parsed with
 * html_get_tag(). Returns HTML_UNKNOWN when no tag could be parsed;
 * otherwise returns parser->state, which is HTML_UNKNOWN unless one of the
 * branches below recognises the tag name. Tag names are compared with
 * strcmp() against lower-case literals.
 */
616 static HTMLState html_parse_tag(HTMLParser *parser)
618 gchar buf[HTMLBUFSIZE];
621 html_get_parenthesis(parser, buf, sizeof(buf));
623 tag = html_get_tag(buf);
625 parser->state = HTML_UNKNOWN;
626 if (!tag) return HTML_UNKNOWN;
/* <br>: drop any pending space and force a line break. */
628 if (!strcmp(tag->name, "br")) {
629 parser->space = FALSE;
630 html_append_char(parser, '\n');
631 parser->state = HTML_BR;
/* <a href=...>: only the FIRST attribute is inspected; its value replaces
 * any previously stored href. */
632 } else if (!strcmp(tag->name, "a")) {
633 if (tag->attr && tag->attr->data &&
634 !strcmp(((HTMLAttr *)tag->attr->data)->name, "href")) {
635 g_free(parser->href);
637 g_strdup(((HTMLAttr *)tag->attr->data)->value);
638 parser->state = HTML_HREF;
640 } else if (!strcmp(tag->name, "/a")) {
641 g_free(parser->href);
643 parser->state = HTML_NORMAL;
/* <p>: make sure the output ends with an empty line. */
644 } else if (!strcmp(tag->name, "p")) {
645 parser->space = FALSE;
646 if (!parser->empty_line) {
647 parser->space = FALSE;
648 if (!parser->newline) html_append_char(parser, '\n');
649 html_append_char(parser, '\n');
651 parser->state = HTML_PAR;
652 } else if (!strcmp(tag->name, "pre")) {
654 parser->state = HTML_PRE;
655 } else if (!strcmp(tag->name, "/pre")) {
657 parser->state = HTML_NORMAL;
/* <hr>: start a fresh line, then emit the rule text and a newline. */
658 } else if (!strcmp(tag->name, "hr")) {
659 if (!parser->newline) {
660 parser->space = FALSE;
661 html_append_char(parser, '\n');
663 html_append_str(parser, HR_STR "\n", -1);
664 parser->state = HTML_HR;
/* Block-level openers (including h followed by a digit): start on a new
 * line. */
665 } else if (!strcmp(tag->name, "div") ||
666 !strcmp(tag->name, "ul") ||
667 !strcmp(tag->name, "li") ||
668 !strcmp(tag->name, "table") ||
669 !strcmp(tag->name, "tr") ||
670 (tag->name[0] == 'h' && isdigit(tag->name[1]))) {
671 if (!parser->newline) {
672 parser->space = FALSE;
673 html_append_char(parser, '\n');
675 parser->state = HTML_NORMAL;
/* </table> and closing headings: end with an empty line.
 * For a closing heading the name is "/h" followed by a digit, so the digit
 * sits at index 2. The && chain guarantees indices 0 and 1 are non-NUL
 * before index 2 is read. */
676 } else if (!strcmp(tag->name, "/table") ||
677 (tag->name[0] == '/' &&
678 tag->name[1] == 'h' &&
679 isdigit(tag->name[2]))) {
680 if (!parser->empty_line) {
681 parser->space = FALSE;
682 if (!parser->newline) html_append_char(parser, '\n');
683 html_append_char(parser, '\n');
685 parser->state = HTML_NORMAL;
/* Other block-level closers: just make sure we are on a new line. */
686 } else if (!strcmp(tag->name, "/div") ||
687 !strcmp(tag->name, "/ul") ||
688 !strcmp(tag->name, "/li")) {
689 if (!parser->newline) {
690 parser->space = FALSE;
691 html_append_char(parser, '\n');
693 parser->state = HTML_NORMAL;
698 return parser->state;
/*
 * Handle a character reference starting at the read cursor, which must
 * point at '&'.
 *
 * The reference is copied from the '&' up to the ';' into symbol_name and
 * looked up in parser->symbol_table. If that fails and it has the numeric
 * form "&#<digits>", the decimal value is appended as a single character
 * when it is 1..127, or 128..255 with an ISO-8859-1 converter. The last
 * visible statement appends the reference text itself.
 */
701 static void html_parse_special(HTMLParser *parser)
/* Room for '&' + up to 6 name characters + ';' + NUL, given the n <= 7
 * limit below (assuming strncpy2 NUL-terminates within its size argument --
 * TODO confirm). */
703 gchar symbol_name[9];
707 parser->state = HTML_UNKNOWN;
708 g_return_if_fail(*parser->bufp == '&');
/* n becomes the offset of the terminating ';' (or of the end of buffer). */
711 for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
/* Too long for symbol_name, or no ';' in the buffered input: not a
 * reference. */
713 if (n > 7 || parser->bufp[n] != ';') {
714 /* output literal `&' */
715 html_append_char(parser, *parser->bufp++);
716 parser->state = HTML_NORMAL;
/* Copy the reference and move the cursor past the ';'. */
719 strncpy2(symbol_name, parser->bufp, n + 2);
720 parser->bufp += n + 1;
722 if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
724 html_append_str(parser, val, -1);
725 parser->state = HTML_NORMAL;
/* symbol_name[0] is '&', so [1] == '#' marks a numeric reference. */
727 } else if (symbol_name[1] == '#' && isdigit(symbol_name[2])) {
/* atoi() parses decimal only; hexadecimal forms are rejected earlier by the
 * isdigit() test on the first character after '#'. */
730 ch = atoi(symbol_name + 2);
731 if ((ch > 0 && ch <= 127) ||
732 (ch >= 128 && ch <= 255 &&
733 parser->conv->charset == C_ISO_8859_1)) {
734 html_append_char(parser, ch);
735 parser->state = HTML_NORMAL;
740 html_append_str(parser, symbol_name, -1);
/*
 * Extract the text of the tag at the read cursor (which must point at '<')
 * into buf, reading further input lines until the closing '>' is buffered.
 *
 * HTML comments and <style>/<script> elements are skipped as a whole: the
 * cursor is moved past their terminator.
 * NOTE(review): in the visible lines buf is not written on the paths that
 * return at HTML_EOF, while html_parse_tag() passes buf on to html_get_tag()
 * -- confirm buf is initialised before any early return.
 */
743 static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
748 g_return_if_fail(*parser->bufp == '<');
750 /* ignore comment / CSS / script stuff */
751 if (!strncmp(parser->bufp, "<!--", 4)) {
/* Keep reading lines until the comment terminator is buffered. */
753 while ((p = strstr(parser->bufp, "-->")) == NULL)
754 if (html_read_line(parser) == HTML_EOF) return;
/* 3 == length of "-->". */
755 parser->bufp = p + 3;
758 if (!g_strncasecmp(parser->bufp, "<style", 6)) {
760 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
761 if (html_read_line(parser) == HTML_EOF) return;
/* 8 == length of "</style>". */
762 parser->bufp = p + 8;
765 if (!g_strncasecmp(parser->bufp, "<script", 7)) {
767 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
768 if (html_read_line(parser) == HTML_EOF) return;
/* 9 == length of "</script>". */
769 parser->bufp = p + 9;
/* Ordinary tag: find the closing '>'. */
774 while ((p = strchr(parser->bufp, '>')) == NULL)
775 if (html_read_line(parser) == HTML_EOF) return;
/* Copy the text before '>', bounded by the caller's buffer size. */
777 strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
/* Continue parsing right after the '>'. */
779 parser->bufp = p + 1;
782 /* these hash functions were taken from gstring.c in glib */
/*
 * Case-insensitive string equality with the argument shape of a GHashTable
 * key-equality callback: returns non-zero when strcasecmp() reports the two
 * strings equal.
 * NOTE(review): the tables built in html_parser_new() are created with
 * g_str_hash/g_str_equal; no use of this function is visible in this file.
 */
784 static gint g_str_case_equal(gconstpointer v, gconstpointer v2)
786 return strcasecmp((const gchar *)v, (const gchar *)v2) == 0;
789 static guint g_str_case_hash(gconstpointer key)
791 const gchar *p = key;
796 for (p += 1; *p != '\0'; p++)
797 h = (h << 5) - h + tolower(*p);