2 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3 * Copyright (C) 1999-2003 Hiroyuki Yamamoto
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
/* Size in bytes of the fixed line and tag buffers declared in this file. */
29 #define HTMLBUFSIZE 8192
/* Rule text emitted for an hr tag; a newline is appended at the point of use. */
30 #define HR_STR "------------------------------------------------"
/* Entry type of the entity tables below; entries are read through their key and val members. */
32 typedef struct _HTMLSymbol HTMLSymbol;
/* Entries of this list are inserted into every lookup table built by html_parser_new(). */
40 static HTMLSymbol symbol_list[] = {
/* Entries of this list are inserted into the default lookup table only. */
51 static HTMLSymbol ascii_symbol_list[] = {
/*
 * Entity table selected by html_parser_new() for Japanese source charsets
 * when the current charset is C_EUC_JP.
 *
 * Each key is the complete entity reference, from the leading ampersand
 * through the trailing semicolon, because html_parse_special() looks up
 * exactly that text.  A raw character as key can never match and also
 * makes the table depend on the encoding of this source file.
 *
 * Values are byte strings (presumably EUC-JP encoded - confirm).  Letters
 * with a diaeresis are written as the base ASCII letter followed by the
 * same bytes used for &uml;, and A/a with ring as the base letter followed
 * by the bytes used for &deg;.
 */
114 static HTMLSymbol eucjp_symbol_list[] = {
116 {"&cent;" , "\xa1\xf1"},
117 {"&pound;" , "\xa1\xf2"},
118 {"&yen;" , "\xa1\xef"},
120 {"&sect;" , "\xa1\xf8"},
121 {"&uml;" , "\xa1\xaf"},
126 {"&deg;" , "\xa1\xeb"},
127 {"&plusmn;", "\xa1\xde"},
131 {"&micro;" , "\xa6\xcc"},
132 {"&para;" , "\xa2\xf9"},
133 {"&middot;", "\xa1\xa6"},
146 {"&Auml;" , "A\xa1\xaf"},
147 {"&Aring;" , "A\xa1\xeb"},
152 {"&Euml;" , "E\xa1\xaf"},
156 {"&Iuml;" , "I\xa1\xaf"},
163 {"&Ouml;" , "O\xa1\xaf"},
164 {"&times;" , "\xa1\xdf"},
168 {"&Uuml;" , "U\xa1\xaf"},
175 {"&auml;" , "a\xa1\xaf"},
176 {"&aring;" , "a\xa1\xeb"},
181 {"&euml;" , "e\xa1\xaf"},
185 {"&iuml;" , "i\xa1\xaf"},
187 {"&eth;" , "\xa2\xdf"},
193 {"&ouml;" , "o\xa1\xaf"},
194 {"&divide;", "\xa1\xe0"},
198 {"&uuml;" , "u\xa1\xaf"},
200 {"&yuml;" , "y\xa1\xaf"},
/*
 * Entity table selected by html_parser_new() when the source charset is
 * C_ISO_8859_1.
 *
 * Each key is the complete entity reference, from the leading ampersand
 * through the trailing semicolon, because html_parse_special() looks up
 * exactly that text.  A raw character as key can never match and also
 * makes the table depend on the encoding of this source file.
 *
 * Each value is a single byte written as a hex escape; it is the
 * ISO-8859-1 code of the character the entity names.
 */
203 static HTMLSymbol latin_symbol_list[] = {
204 {"&iexcl;" , "\xa1"},
206 {"&pound;" , "\xa3"},
207 {"&curren;", "\xa4"},
209 {"&brvbar;", "\xa6"},
214 {"&laquo;" , "\xab"},
221 {"&plusmn;", "\xb1"},
224 {"&acute;" , "\xb4"},
225 {"&micro;" , "\xb5"},
227 {"&middot;", "\xb7"},
228 {"&cedil;" , "\xb8"},
231 {"&raquo;" , "\xbb"},
232 {"&frac14;", "\xbc"},
233 {"&frac12;", "\xbd"},
234 {"&frac34;", "\xbe"},
235 {"&iquest;", "\xbf"},
237 {"&Agrave;", "\xc0"},
238 {"&Aacute;", "\xc1"},
239 {"&Acirc;" , "\xc2"},
240 {"&Atilde;", "\xc3"},
242 {"&Aring;" , "\xc5"},
243 {"&AElig;" , "\xc6"},
244 {"&Ccedil;", "\xc7"},
245 {"&Egrave;", "\xc8"},
246 {"&Eacute;", "\xc9"},
247 {"&Ecirc;" , "\xca"},
249 {"&Igrave;", "\xcc"},
250 {"&Iacute;", "\xcd"},
251 {"&Icirc;" , "\xce"},
255 {"&Ntilde;", "\xd1"},
256 {"&Ograve;", "\xd2"},
257 {"&Oacute;", "\xd3"},
258 {"&Ocirc;" , "\xd4"},
259 {"&Otilde;", "\xd5"},
261 {"&times;" , "\xd7"},
262 {"&Oslash;", "\xd8"},
263 {"&Ugrave;", "\xd9"},
264 {"&Uacute;", "\xda"},
265 {"&Ucirc;" , "\xdb"},
267 {"&Yacute;", "\xdd"},
268 {"&THORN;" , "\xde"},
269 {"&szlig;" , "\xdf"},
271 {"&agrave;", "\xe0"},
272 {"&aacute;", "\xe1"},
273 {"&acirc;" , "\xe2"},
274 {"&atilde;", "\xe3"},
276 {"&aring;" , "\xe5"},
277 {"&aelig;" , "\xe6"},
278 {"&ccedil;", "\xe7"},
279 {"&egrave;", "\xe8"},
280 {"&eacute;", "\xe9"},
281 {"&ecirc;" , "\xea"},
283 {"&igrave;", "\xec"},
284 {"&iacute;", "\xed"},
285 {"&icirc;" , "\xee"},
289 {"&ntilde;", "\xf1"},
290 {"&ograve;", "\xf2"},
291 {"&oacute;", "\xf3"},
292 {"&ocirc;" , "\xf4"},
293 {"&otilde;", "\xf5"},
295 {"&divide;", "\xf7"},
296 {"&oslash;", "\xf8"},
297 {"&ugrave;", "\xf9"},
298 {"&uacute;", "\xfa"},
299 {"&ucirc;" , "\xfb"},
301 {"&yacute;", "\xfd"},
302 {"&thorn;" , "\xfe"},
/*
 * Entity lookup tables.  Each one is created by html_parser_new() the
 * first time it is found to be NULL and is then reused by every parser.
 */
306 static GHashTable *default_symbol_table;
307 static GHashTable *eucjp_symbol_table;
308 static GHashTable *latin_symbol_table;
/* Forward declarations of the file-local parsing helpers defined below. */
310 static HTMLState html_read_line (HTMLParser *parser);
311 static void html_append_char (HTMLParser *parser,
313 static void html_append_str (HTMLParser *parser,
316 static HTMLState html_parse_tag (HTMLParser *parser);
317 static void html_parse_special (HTMLParser *parser);
318 static void html_get_parenthesis (HTMLParser *parser,
/*
 * Allocate and initialise an HTML parser.
 *
 * fp and conv must both be non-NULL; otherwise the g_return_val_if_fail
 * checks make the function return NULL.  The new parser is zero-filled,
 * gets empty output (str) and input (buf) strings, and starts in
 * HTML_NORMAL state at the beginning of an empty line.
 *
 * The shared entity tables are built here the first time they are found
 * to be NULL.  The table used by this parser depends on conv->charset:
 * the Latin table for C_ISO_8859_1, the EUC-JP table for the Japanese
 * charsets when conv_get_current_charset() is C_EUC_JP, and the default
 * table otherwise.
 */
323 HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
327 g_return_val_if_fail(fp != NULL, NULL);
328 g_return_val_if_fail(conv != NULL, NULL);
330 parser = g_new0(HTMLParser, 1);
/* str collects output text; buf holds unread input and bufp is the read cursor into it. */
333 parser->str = g_string_new(NULL);
334 parser->buf = g_string_new(NULL);
335 parser->bufp = parser->buf->str;
336 parser->state = HTML_NORMAL;
338 parser->newline = TRUE;
339 parser->empty_line = TRUE;
340 parser->space = FALSE;
/* Helper macro: insert every key/val pair of a static list into a hash table. */
343 #define SYMBOL_TABLE_ADD(table, list) \
347 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
348 g_hash_table_insert(table, list[i].key, list[i].val); \
/* Build each shared table once; all of them start from symbol_list. */
351 if (!default_symbol_table) {
352 default_symbol_table =
353 g_hash_table_new(g_str_hash, g_str_equal);
354 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
355 SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
357 if (!eucjp_symbol_table) {
359 g_hash_table_new(g_str_hash, g_str_equal);
360 SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list);
361 SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list);
363 if (!latin_symbol_table) {
365 g_hash_table_new(g_str_hash, g_str_equal);
366 SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list);
367 SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list);
370 #undef SYMBOL_TABLE_ADD
/* Pick the table matching the source charset of the message. */
372 if (conv->charset == C_ISO_8859_1)
373 parser->symbol_table = latin_symbol_table;
374 else if ((conv->charset == C_ISO_2022_JP ||
375 conv->charset == C_ISO_2022_JP_2 ||
376 conv->charset == C_EUC_JP ||
377 conv->charset == C_SHIFT_JIS) &&
378 conv_get_current_charset() == C_EUC_JP)
379 parser->symbol_table = eucjp_symbol_table;
381 parser->symbol_table = default_symbol_table;
/*
 * Release the resources held by parser: both GString buffers, including
 * their character data, and the href string.
 */
386 void html_parser_destroy(HTMLParser *parser)
388 g_string_free(parser->str, TRUE);
389 g_string_free(parser->buf, TRUE);
390 g_free(parser->href);
/*
 * Extract the next chunk of text from the input.
 *
 * On entry the state is reset to HTML_NORMAL and the output string is
 * emptied.  If the input buffer has been fully consumed it is cleared
 * and refilled with html_read_line().  The input is then scanned one
 * character at a time, dispatching to html_parse_tag() and
 * html_parse_special() and appending ordinary characters to the output.
 *
 * The returned pointer is parser->str->str: it belongs to the parser and
 * its contents are discarded by the next call.
 */
394 gchar *html_parse(HTMLParser *parser)
396 parser->state = HTML_NORMAL;
397 g_string_truncate(parser->str, 0);
/* Input consumed: drop it and fetch another line. */
399 if (*parser->bufp == '\0') {
400 g_string_truncate(parser->buf, 0);
401 parser->bufp = parser->buf->str;
402 if (html_read_line(parser) == HTML_EOF)
406 while (*parser->bufp != '\0') {
407 switch (*parser->bufp) {
/* NOTE(review): a tag appears to be parsed only while no text is pending; otherwise the pending text is returned first - confirm. */
409 if (parser->str->len == 0)
410 html_parse_tag(parser);
412 return parser->str->str;
415 html_parse_special(parser);
/* A CR LF pair is recognised here as a unit. */
421 if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
/* Whitespace is not copied; unless the output is at the start of a line it only sets the pending-space flag. */
425 if (!parser->newline)
426 parser->space = TRUE;
433 html_append_char(parser, *parser->bufp++);
437 return parser->str->str;
/*
 * Read one line from parser->fp and append it, converted, to parser->buf.
 *
 * fgets() reads at most HTMLBUFSIZE - 1 bytes.  When it returns NULL the
 * parser state is set to HTML_EOF.  The raw line is converted with
 * conv_convert(); if that fails the line is passed through
 * conv_localetodisp() instead, appended, and HTML_CONV_FAILED is
 * returned.
 *
 * bufp is saved as an offset before appending and rebuilt afterwards,
 * so it stays valid if g_string_append() moves the string storage.
 */
440 static HTMLState html_read_line(HTMLParser *parser)
442 gchar buf[HTMLBUFSIZE];
443 gchar buf2[HTMLBUFSIZE];
446 if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
447 parser->state = HTML_EOF;
/* Conversion failure path: fall back to a display-safe copy of the raw line. */
451 if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
452 index = parser->bufp - parser->buf->str;
454 conv_localetodisp(buf2, sizeof(buf2), buf);
455 g_string_append(parser->buf, buf2);
457 parser->bufp = parser->buf->str + index;
459 return HTML_CONV_FAILED;
/* Normal path: append the converted line, keeping the read cursor at the same offset. */
462 index = parser->bufp - parser->buf->str;
464 g_string_append(parser->buf, buf2);
466 parser->bufp = parser->buf->str + index;
/*
 * Append the single character ch to the output string.
 *
 * Outside pre mode a pending space (parser->space) is written first and
 * the flag is cleared.  Afterwards the newline and empty_line flags are
 * updated from the tail of the output string.
 */
471 static void html_append_char(HTMLParser *parser, gchar ch)
473 GString *str = parser->str;
/* Flush the deferred space before the character itself. */
475 if (!parser->pre && parser->space) {
476 g_string_append_c(str, ' ');
477 parser->space = FALSE;
480 g_string_append_c(str, ch);
/* NOTE(review): newline is presumably set only when ch is a line feed; empty_line additionally needs the preceding character to be one - confirm. */
482 parser->empty_line = FALSE;
484 parser->newline = TRUE;
485 if (str->len > 1 && str->str[str->len - 2] == '\n')
486 parser->empty_line = TRUE;
488 parser->newline = FALSE;
/*
 * Append text from str to the output string.
 *
 * Outside pre mode a pending space (parser->space) is written first and
 * the flag is cleared, even when len is 0.  A len of 0 appends nothing
 * more.  Callers in this file pass -1 to append the whole string; a
 * positive len appears to limit the copy to the first len characters
 * via Xstrndup_a - confirm.
 *
 * Afterwards newline is TRUE if the output ends in a line feed, and
 * empty_line is TRUE if it ends in two.
 */
491 static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
493 GString *string = parser->str;
/* Flush the deferred space before the text itself. */
495 if (!parser->pre && parser->space) {
496 g_string_append_c(string, ' ');
497 parser->space = FALSE;
500 if (len == 0) return;
502 g_string_append(string, str);
505 Xstrndup_a(s, str, len, return);
506 g_string_append(string, s);
/* Recompute the line-state flags from the last one or two characters written. */
509 parser->empty_line = FALSE;
510 if (string->len > 0 && string->str[string->len - 1] == '\n') {
511 parser->newline = TRUE;
512 if (string->len > 1 && string->str[string->len - 2] == '\n')
513 parser->empty_line = TRUE;
515 parser->newline = FALSE;
/*
 * Parse tag text, as produced by html_get_parenthesis(), into a newly
 * allocated HTMLTag.
 *
 * Returns NULL when str is NULL, empty, or starts with an exclamation
 * mark.  The tag name is the text up to the first whitespace character.
 * Attributes follow as name=value pairs; a value may be wrapped in
 * single or double quotes.  Attribute names are chomped and lower-cased
 * before being stored, and each attribute is appended to tag->attr.
 * A g_warning is logged when an attribute is malformed.
 *
 * Every character handed to isspace() is cast to unsigned char first.
 * gchar may be a signed type, and passing a negative value other than
 * EOF to a ctype function is undefined behaviour; 8-bit mail content
 * would trigger it.
 */
518 static HTMLTag *html_get_tag(const gchar *str)
524 g_return_val_if_fail(str != NULL, NULL);
526 if (*str == '\0' || *str == '!') return NULL;
/* Work on a private copy so the text can be cut up in place. */
528 Xstrdup_a(tmp, str, return NULL);
530 tag = g_new0(HTMLTag, 1);
/* Find the end of the tag name. */
532 for (tmpp = tmp; *tmpp != '\0' && !isspace((unsigned char)*tmpp); tmpp++)
537 tag->name = g_strdup(tmp);
542 tag->name = g_strdup(tmp);
/* Consume attributes until the end of the text. */
545 while (*tmpp != '\0') {
552 while (isspace((unsigned char)*tmpp)) tmpp++;
554 if ((p = strchr(attr_name, '=')) == NULL) {
555 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
560 while (isspace((unsigned char)*tmpp)) tmpp++;
563 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
/* Quoted value: it ends at the matching quote character. */
565 } else if (*tmpp == '"' || *tmpp == '\'') {
570 if ((p = strchr(attr_value, quote)) == NULL) {
571 g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
576 while (isspace((unsigned char)*tmpp)) tmpp++;
/* Unquoted value: it ends at the next whitespace character. */
580 while (*tmpp != '\0' && !isspace((unsigned char)*tmpp)) tmpp++;
585 g_strchomp(attr_name);
586 g_strdown(attr_name);
587 attr = g_new(HTMLAttr, 1);
588 attr->name = g_strdup(attr_name);
589 attr->value = g_strdup(attr_value);
590 tag->attr = g_list_append(tag->attr, attr);
/*
 * Dispose of the attribute list of tag, handling one list node per loop
 * iteration until tag->attr is NULL.
 */
596 static void html_free_tag(HTMLTag *tag)
601 while (tag->attr != NULL) {
602 HTMLAttr *attr = (HTMLAttr *)tag->attr->data;
/* Unlink the node holding this attribute; g_list_remove returns the new list head. */
606 tag->attr = g_list_remove(tag->attr, tag->attr->data);
/*
 * Consume the tag at parser->bufp and apply its effect on the output.
 *
 * The tag text is fetched with html_get_parenthesis() and parsed with
 * html_get_tag().  The state is HTML_UNKNOWN unless one of the branches
 * below recognises the tag; an unparsable tag returns HTML_UNKNOWN.
 *
 *   br                      line feed, state HTML_BR
 *   a (href first)          remember the link target, state HTML_HREF
 *   /a                      forget the link target
 *   p                       blank line unless already on one, HTML_PAR
 *   pre, /pre               state HTML_PRE / HTML_NORMAL
 *   hr                      rule line HR_STR on its own line, HTML_HR
 *   div ul li table tr hN   start a new line
 *   /table /hN              end with a blank line
 *   /div /ul /li            end the current line
 *
 * Returns the resulting parser->state.
 *
 * Fix: the closing-heading test used to call isdigit() on name[1], the
 * character just compared against h, so it was never true and closing
 * heading tags were ignored.  The digit is at index 2.  Reading index 2
 * is safe because the preceding comparisons guarantee that indexes 0
 * and 1 are not the terminator.  Arguments of isdigit() are cast to
 * unsigned char because gchar may be signed.
 */
611 static HTMLState html_parse_tag(HTMLParser *parser)
613 gchar buf[HTMLBUFSIZE];
616 html_get_parenthesis(parser, buf, sizeof(buf));
618 tag = html_get_tag(buf);
620 parser->state = HTML_UNKNOWN;
621 if (!tag) return HTML_UNKNOWN;
623 if (!strcmp(tag->name, "br")) {
624 parser->space = FALSE;
625 html_append_char(parser, '\n');
626 parser->state = HTML_BR;
627 } else if (!strcmp(tag->name, "a")) {
/* Only the first attribute is inspected for href. */
628 if (tag->attr && tag->attr->data &&
629 !strcmp(((HTMLAttr *)tag->attr->data)->name, "href")) {
630 g_free(parser->href);
632 g_strdup(((HTMLAttr *)tag->attr->data)->value);
633 parser->state = HTML_HREF;
635 } else if (!strcmp(tag->name, "/a")) {
636 g_free(parser->href);
638 parser->state = HTML_NORMAL;
639 } else if (!strcmp(tag->name, "p")) {
640 parser->space = FALSE;
/* Finish the current line if needed, then add one empty line. */
641 if (!parser->empty_line) {
642 parser->space = FALSE;
643 if (!parser->newline) html_append_char(parser, '\n');
644 html_append_char(parser, '\n');
646 parser->state = HTML_PAR;
647 } else if (!strcmp(tag->name, "pre")) {
649 parser->state = HTML_PRE;
650 } else if (!strcmp(tag->name, "/pre")) {
652 parser->state = HTML_NORMAL;
653 } else if (!strcmp(tag->name, "hr")) {
654 if (!parser->newline) {
655 parser->space = FALSE;
656 html_append_char(parser, '\n');
658 html_append_str(parser, HR_STR "\n", -1);
659 parser->state = HTML_HR;
660 } else if (!strcmp(tag->name, "div") ||
661 !strcmp(tag->name, "ul") ||
662 !strcmp(tag->name, "li") ||
663 !strcmp(tag->name, "table") ||
664 !strcmp(tag->name, "tr") ||
665 (tag->name[0] == 'h' && isdigit((unsigned char)tag->name[1]))) {
666 if (!parser->newline) {
667 parser->space = FALSE;
668 html_append_char(parser, '\n');
670 parser->state = HTML_NORMAL;
671 } else if (!strcmp(tag->name, "/table") ||
672 (tag->name[0] == '/' &&
673 tag->name[1] == 'h' &&
674 isdigit((unsigned char)tag->name[2]))) {
675 if (!parser->empty_line) {
676 parser->space = FALSE;
677 if (!parser->newline) html_append_char(parser, '\n');
678 html_append_char(parser, '\n');
680 parser->state = HTML_NORMAL;
681 } else if (!strcmp(tag->name, "/div") ||
682 !strcmp(tag->name, "/ul") ||
683 !strcmp(tag->name, "/li")) {
684 if (!parser->newline) {
685 parser->space = FALSE;
686 html_append_char(parser, '\n');
688 parser->state = HTML_NORMAL;
693 return parser->state;
/*
 * Handle an entity reference at parser->bufp, which must point at an
 * ampersand.
 *
 * The input is scanned for the terminating semicolon.  If there is none,
 * or it lies more than 7 bytes after the ampersand, the ampersand is
 * emitted literally and parsing continues right after it.  Otherwise
 * the reference, from the ampersand through the semicolon, is copied
 * into symbol_name (9 bytes cover the longest accepted reference plus
 * the terminator) and looked up in parser->symbol_table.
 *
 * A decimal character reference is emitted as a single character for
 * codes 1 to 127, and for codes 128 to 255 only when the source charset
 * is C_ISO_8859_1.  The final call emits the reference text itself
 * (presumably only when nothing above matched - confirm).
 *
 * The argument of isdigit() is cast to unsigned char: gchar may be
 * signed and the byte after the hash sign comes from untrusted input,
 * so a negative value would be undefined behaviour.
 */
696 static void html_parse_special(HTMLParser *parser)
698 gchar symbol_name[9];
702 parser->state = HTML_UNKNOWN;
703 g_return_if_fail(*parser->bufp == '&');
706 for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
708 if (n > 7 || parser->bufp[n] != ';') {
709 /* output literal `&' */
710 html_append_char(parser, *parser->bufp++);
711 parser->state = HTML_NORMAL;
/* n + 2 leaves room for the n + 1 reference bytes and the terminator. */
714 strncpy2(symbol_name, parser->bufp, n + 2);
715 parser->bufp += n + 1;
717 if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
719 html_append_str(parser, val, -1);
720 parser->state = HTML_NORMAL;
722 } else if (symbol_name[1] == '#' && isdigit((unsigned char)symbol_name[2])) {
725 ch = atoi(symbol_name + 2);
726 if ((ch > 0 && ch <= 127) ||
727 (ch >= 128 && ch <= 255 &&
728 parser->conv->charset == C_ISO_8859_1)) {
729 html_append_char(parser, ch);
730 parser->state = HTML_NORMAL;
735 html_append_str(parser, symbol_name, -1);
/*
 * Fetch the tag starting at parser->bufp, which must point at a
 * less-than sign.
 *
 * Comments, style elements and script elements are skipped: bufp is
 * moved just past the matching terminator.  For any other tag the text
 * up to the closing greater-than sign is copied into buf, limited by
 * len, and bufp is moved past that sign.
 *
 * Whenever the terminator is not in the buffer yet, more input is pulled
 * in with html_read_line(); the function returns early if that reports
 * HTML_EOF.
 */
738 static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
743 g_return_if_fail(*parser->bufp == '<');
745 /* ignore comment / CSS / script stuff */
746 if (!strncmp(parser->bufp, "<!--", 4)) {
748 while ((p = strstr(parser->bufp, "-->")) == NULL)
749 if (html_read_line(parser) == HTML_EOF) return;
/* 3 is the length of the comment terminator. */
750 parser->bufp = p + 3;
/* The element names are matched without regard to case. */
753 if (!g_strncasecmp(parser->bufp, "<style", 6)) {
755 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
756 if (html_read_line(parser) == HTML_EOF) return;
/* 8 is the length of the closing style tag. */
757 parser->bufp = p + 8;
760 if (!g_strncasecmp(parser->bufp, "<script", 7)) {
762 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
763 if (html_read_line(parser) == HTML_EOF) return;
/* 9 is the length of the closing script tag. */
764 parser->bufp = p + 9;
/* Ordinary tag: keep reading lines until the closing sign is available. */
769 while ((p = strchr(parser->bufp, '>')) == NULL)
770 if (html_read_line(parser) == HTML_EOF) return;
/* NOTE(review): strncpy2 is a project helper; the size argument presumably includes the terminator - confirm. */
772 strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
774 parser->bufp = p + 1;