2 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3 * Copyright (C) 1999,2000 Hiroyuki Yamamoto
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
/* Size in bytes of the fixed stack buffers used when reading a line of
 * input (html_read_line) and when extracting a tag (html_parse_tag). */
29 #define HTMLBUFSIZE 8192
/* Text emitted (followed by a newline) in place of an <hr> tag. */
30 #define HR_STR "------------------------------------------------"
/* Entity table record; the initialisers in this file give each record two
 * strings, accessed as .key and .val by html_parser_new(). */
32 typedef struct _HTMLSymbol HTMLSymbol;
/* Entries loaded into all three lookup tables by html_parser_new().
 * NOTE(review): the entries themselves are not visible in this listing. */
40 static HTMLSymbol symbol_list[] = {
/* Entries loaded only into the default lookup table.
 * NOTE(review): the entries themselves are not visible in this listing. */
51 static HTMLSymbol ascii_symbol_list[] = {
/*
 * Entity table used when the message charset is Japanese and the current
 * charset is EUC-JP (see html_parser_new()).  Each value is a byte string,
 * written as escapes, optionally preceded by a plain ASCII base letter.
 *
 * Keys must be the complete entity text, including the leading '&' and the
 * trailing ';', because html_parse_special() looks up exactly that text.
 * The keys had been replaced by the already-decoded characters, which can
 * never match such a lookup; they are spelled out as entity names again.
 */
114 static HTMLSymbol eucjp_symbol_list[] = {
116 {"&cent;" , "\xa1\xf1"},
117 {"&pound;" , "\xa1\xf2"},
118 {"&yen;" , "\xa1\xef"},
120 {"&sect;" , "\xa1\xf8"},
121 {"&uml;" , "\xa1\xaf"},
126 {"&deg;" , "\xa1\xeb"},
127 {"&plusmn;", "\xa1\xde"},
131 {"&micro;" , "\xa6\xcc"},
132 {"&para;" , "\xa2\xf9"},
133 {"&middot;", "\xa1\xa6"},
146 {"&Auml;" , "A\xa1\xaf"},
147 {"&Aring;" , "A\xa1\xeb"},
152 {"&Euml;" , "E\xa1\xaf"},
156 {"&Iuml;" , "I\xa1\xaf"},
163 {"&Ouml;" , "O\xa1\xaf"},
164 {"&times;" , "\xa1\xdf"},
168 {"&Uuml;" , "U\xa1\xaf"},
175 {"&auml;" , "a\xa1\xaf"},
176 {"&aring;" , "a\xa1\xeb"},
181 {"&euml;" , "e\xa1\xaf"},
185 {"&iuml;" , "i\xa1\xaf"},
187 {"&eth;" , "\xa2\xdf"},
193 {"&ouml;" , "o\xa1\xaf"},
194 {"&divide;", "\xa1\xe0"},
198 {"&uuml;" , "u\xa1\xaf"},
200 {"&yuml;" , "y\xa1\xaf"},
/*
 * Entity table used when the message charset is ISO-8859-1 (see
 * html_parser_new()).  Each value is the single Latin-1 byte for the
 * named character.
 *
 * Keys must be the complete entity text, including the leading '&' and the
 * trailing ';', because html_parse_special() looks up exactly that text.
 * The keys had been replaced by the already-decoded characters, which can
 * never match such a lookup; they are spelled out as entity names again
 * (each name is the HTML Latin-1 entity for the byte in the value column).
 */
203 static HTMLSymbol latin_symbol_list[] = {
204 {"&iexcl;" , "\xa1"},
206 {"&pound;" , "\xa3"},
207 {"&curren;", "\xa4"},
209 {"&brvbar;", "\xa6"},
214 {"&laquo;" , "\xab"},
221 {"&plusmn;", "\xb1"},
224 {"&acute;" , "\xb4"},
225 {"&micro;" , "\xb5"},
227 {"&middot;", "\xb7"},
228 {"&cedil;" , "\xb8"},
231 {"&raquo;" , "\xbb"},
232 {"&frac14;", "\xbc"},
233 {"&frac12;", "\xbd"},
234 {"&frac34;", "\xbe"},
235 {"&iquest;", "\xbf"},
237 {"&Agrave;", "\xc0"},
238 {"&Aacute;", "\xc1"},
239 {"&Acirc;" , "\xc2"},
240 {"&Atilde;", "\xc3"},
242 {"&Aring;" , "\xc5"},
243 {"&AElig;" , "\xc6"},
244 {"&Ccedil;", "\xc7"},
245 {"&Egrave;", "\xc8"},
246 {"&Eacute;", "\xc9"},
247 {"&Ecirc;" , "\xca"},
249 {"&Igrave;", "\xcc"},
250 {"&Iacute;", "\xcd"},
251 {"&Icirc;" , "\xce"},
255 {"&Ntilde;", "\xd1"},
256 {"&Ograve;", "\xd2"},
257 {"&Oacute;", "\xd3"},
258 {"&Ocirc;" , "\xd4"},
259 {"&Otilde;", "\xd5"},
261 {"&times;" , "\xd7"},
262 {"&Oslash;", "\xd8"},
263 {"&Ugrave;", "\xd9"},
264 {"&Uacute;", "\xda"},
265 {"&Ucirc;" , "\xdb"},
267 {"&Yacute;", "\xdd"},
268 {"&THORN;" , "\xde"},
269 {"&szlig;" , "\xdf"},
271 {"&agrave;", "\xe0"},
272 {"&aacute;", "\xe1"},
273 {"&acirc;" , "\xe2"},
274 {"&atilde;", "\xe3"},
276 {"&aring;" , "\xe5"},
277 {"&aelig;" , "\xe6"},
278 {"&ccedil;", "\xe7"},
279 {"&egrave;", "\xe8"},
280 {"&eacute;", "\xe9"},
281 {"&ecirc;" , "\xea"},
283 {"&igrave;", "\xec"},
284 {"&iacute;", "\xed"},
285 {"&icirc;" , "\xee"},
289 {"&ntilde;", "\xf1"},
290 {"&ograve;", "\xf2"},
291 {"&oacute;", "\xf3"},
292 {"&ocirc;" , "\xf4"},
293 {"&otilde;", "\xf5"},
295 {"&divide;", "\xf7"},
296 {"&oslash;", "\xf8"},
297 {"&ugrave;", "\xf9"},
298 {"&uacute;", "\xfa"},
299 {"&ucirc;" , "\xfb"},
301 {"&yacute;", "\xfd"},
302 {"&thorn;" , "\xfe"},
/* Entity lookup tables shared by every parser.  html_parser_new() creates
 * each one the first time it finds the pointer still NULL; nothing in the
 * visible code destroys them. */
306 static GHashTable *default_symbol_table;
307 static GHashTable *eucjp_symbol_table;
308 static GHashTable *latin_symbol_table;
/* Prototypes of the file-local helpers defined below.
 * NOTE(review): the continuation lines of the multi-line prototypes are
 * not visible in this listing. */
310 static HTMLState html_read_line (HTMLParser *parser);
311 static void html_append_char (HTMLParser *parser,
313 static void html_append_str (HTMLParser *parser,
316 static HTMLState html_parse_tag (HTMLParser *parser);
317 static void html_parse_special (HTMLParser *parser);
318 static void html_get_parenthesis (HTMLParser *parser,
/* Case-insensitive string equality/hash callbacks.
 * NOTE(review): the table creation visible in html_parser_new() passes
 * g_str_hash/g_str_equal, not these two; check whether they are used. */
323 static gint g_str_case_equal (gconstpointer v,
325 static guint g_str_case_hash (gconstpointer key);
/*
 * html_parser_new:
 * @fp:   input stream; must not be NULL.
 * @conv: code converter; must not be NULL.  Its charset selects which
 *        entity table the new parser uses.
 *
 * Allocates a zero-filled HTMLParser, creates its two GString buffers,
 * sets the initial whitespace flags and, when they do not exist yet,
 * builds the three file-scope entity hash tables.
 *
 * g_return_val_if_fail() makes the function return NULL when either
 * argument is NULL.
 *
 * NOTE(review): several lines of this function (declarations, closing
 * braces, the final return, ...) are not visible in this listing.
 */
328 HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
332 g_return_val_if_fail(fp != NULL, NULL);
333 g_return_val_if_fail(conv != NULL, NULL);
335 parser = g_new0(HTMLParser, 1);
/* str collects the converted output; buf holds raw input lines and bufp
 * is the read cursor into buf. */
338 parser->str = g_string_new(NULL);
339 parser->buf = g_string_new(NULL);
340 parser->bufp = parser->buf->str;
/* Start as if at the beginning of an empty line with no pending space. */
341 parser->newline = TRUE;
342 parser->empty_line = TRUE;
343 parser->space = FALSE;
/* Helper macro: insert every key/val pair of a static HTMLSymbol array
 * into a hash table.  The strings are inserted by pointer, not copied. */
346 #define SYMBOL_TABLE_ADD(table, list) \
350 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
351 g_hash_table_insert(table, list[i].key, list[i].val); \
354 if (!default_symbol_table) {
/* Default table: common symbols plus the ASCII substitutions. */
355 default_symbol_table =
356 g_hash_table_new(g_str_hash, g_str_equal);
357 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
358 SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
360 if (!eucjp_symbol_table) {
/* EUC-JP table: common symbols plus the EUC-JP substitutions. */
362 g_hash_table_new(g_str_hash, g_str_equal);
363 SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list);
364 SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list);
366 if (!latin_symbol_table) {
/* Latin-1 table: common symbols plus the ISO-8859-1 bytes. */
368 g_hash_table_new(g_str_hash, g_str_equal);
369 SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list);
370 SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list);
373 #undef SYMBOL_TABLE_ADD
/* Pick the table: Latin-1 for ISO-8859-1 input; the EUC-JP table for
 * Japanese input charsets, but only when the current charset reported by
 * conv_get_current_charset() is EUC-JP; otherwise the default table. */
375 if (conv->charset == C_ISO_8859_1)
376 parser->symbol_table = latin_symbol_table;
377 else if ((conv->charset == C_ISO_2022_JP ||
378 conv->charset == C_ISO_2022_JP_2 ||
379 conv->charset == C_EUC_JP ||
380 conv->charset == C_SHIFT_JIS) &&
381 conv_get_current_charset() == C_EUC_JP)
382 parser->symbol_table = eucjp_symbol_table;
384 parser->symbol_table = default_symbol_table;
/*
 * html_parser_destroy:
 * @parser: parser created by html_parser_new().
 *
 * Releases the parser's output and input GStrings together with their
 * character data (second argument TRUE).  The shared entity tables are
 * not touched here.
 *
 * NOTE(review): whether the HTMLParser structure itself is freed is not
 * visible in this listing -- confirm against the full file.
 */
389 void html_parser_destroy(HTMLParser *parser)
391 g_string_free(parser->str, TRUE);
392 g_string_free(parser->buf, TRUE);
/*
 * html_parse:
 * @parser: parser created by html_parser_new().
 *
 * Produces the next chunk of plain text from the HTML input.  The output
 * string is emptied on entry, so the pointer returned by a previous call
 * (parser->str->str) is only valid until the next call.  parser->state is
 * reset to HTML_NORMAL first and may then be changed by the tag/entity
 * helpers.
 *
 * NOTE(review): the case labels of the switch, the `else`/`break` lines
 * and the value returned at end of input are not visible in this listing.
 */
396 gchar *html_parse(HTMLParser *parser)
398 parser->state = HTML_NORMAL;
399 g_string_truncate(parser->str, 0);
/* Input buffer fully consumed: reset it and read the next line. */
401 if (*parser->bufp == '\0') {
402 g_string_truncate(parser->buf, 0);
403 parser->bufp = parser->buf->str;
404 if (html_read_line(parser) == HTML_EOF)
/* Consume the buffered input one character at a time. */
408 while (*parser->bufp != '\0') {
409 switch (*parser->bufp) {
/* A tag is parsed only while no text has been collected yet; the visible
 * alternative hands the text collected so far back to the caller. */
411 if (parser->str->len == 0)
412 html_parse_tag(parser);
414 return parser->str->str;
/* Character entity reference. */
417 html_parse_special(parser);
/* Whitespace and line breaks (a CR LF pair is recognised explicitly);
 * unless output is at the start of a line a pending space is recorded,
 * to be emitted before the next appended text. */
423 if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
427 if (!parser->newline)
428 parser->space = TRUE;
/* Any other character is copied to the output. */
435 html_append_char(parser, *parser->bufp++);
439 return parser->str->str;
/*
 * html_read_line:
 * @parser: parser whose input buffer is to be extended.
 *
 * Reads one line (at most HTMLBUFSIZE - 1 bytes) from parser->fp with
 * fgets(), converts it with conv_convert() and appends the result to
 * parser->buf.  When fgets() returns NULL the parser state is set to
 * HTML_EOF.  When the conversion fails a warning is logged and the
 * unconverted line is appended instead.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * callers compare the result against HTML_EOF.
 */
442 static HTMLState html_read_line(HTMLParser *parser)
444 gchar buf[HTMLBUFSIZE];
445 gchar buf2[HTMLBUFSIZE];
448 if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
449 parser->state = HTML_EOF;
453 if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
454 g_warning("html_read_line(): code conversion failed\n");
/* Appending may move the GString's storage, so the cursor is saved as an
 * offset and re-derived from the new base afterwards. */
456 index = parser->bufp - parser->buf->str;
458 g_string_append(parser->buf, buf);
460 parser->bufp = parser->buf->str + index;
/* Same cursor-preserving append, this time for the converted text. */
465 index = parser->bufp - parser->buf->str;
467 g_string_append(parser->buf, buf2);
469 parser->bufp = parser->buf->str + index;
/*
 * html_append_char:
 * @parser: parser whose output string receives the character.
 * @ch:     character to append.
 *
 * Appends @ch to the output.  Outside preformatted mode a pending space
 * is written first and the pending flag cleared.  Afterwards the
 * newline / empty_line flags are brought up to date.
 *
 * NOTE(review): the condition selecting between the newline = TRUE and
 * newline = FALSE branches is not visible in this listing.
 */
474 static void html_append_char(HTMLParser *parser, gchar ch)
476 GString *str = parser->str;
478 if (!parser->pre && parser->space) {
479 g_string_append_c(str, ' ');
480 parser->space = FALSE;
483 g_string_append_c(str, ch);
485 parser->empty_line = FALSE;
487 parser->newline = TRUE;
/* The character before the one just appended is also a newline, so the
 * output now ends with an empty line. */
488 if (str->len > 1 && str->str[str->len - 2] == '\n')
489 parser->empty_line = TRUE;
491 parser->newline = FALSE;
/*
 * html_append_str:
 * @parser: parser whose output string receives the text.
 * @str:    text to append.
 * @len:    number of bytes of @str to append; callers in this file pass
 *          -1 for NUL-terminated strings.
 *
 * Outside preformatted mode a pending space is written first; this
 * happens even when @len is 0, in which case nothing else is done.
 * After appending, the newline / empty_line flags are recomputed from
 * the last one or two characters of the output.
 *
 * NOTE(review): the condition choosing between appending the whole
 * string and appending a @len-byte copy is not visible in this listing.
 */
494 static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
496 GString *string = parser->str;
498 if (!parser->pre && parser->space) {
499 g_string_append_c(string, ' ');
500 parser->space = FALSE;
503 if (len == 0) return;
505 g_string_append(string, str);
/* Bounded variant: make a temporary copy of the first len bytes (the
 * macro's last argument is run if the copy fails) and append that. */
508 Xstrndup_a(s, str, len, return);
509 g_string_append(string, s);
512 parser->empty_line = FALSE;
513 if (string->len > 0 && string->str[string->len - 1] == '\n') {
514 parser->newline = TRUE;
515 if (string->len > 1 && string->str[string->len - 2] == '\n')
516 parser->empty_line = TRUE;
518 parser->newline = FALSE;
/*
 * html_parse_tag:
 * @parser: parser positioned on a '<'.
 *
 * Extracts the tag at the cursor with html_get_parenthesis() and turns
 * the tags it knows into plain-text layout (line breaks, blank lines, a
 * rule for <hr>, the URL of a link).  Returns the resulting parser state,
 * which is also stored in parser->state; an empty tag yields
 * HTML_UNKNOWN.
 *
 * Changes against the previous revision:
 *  - closing heading tags ("/h1".."/h6") tested buf[1] for a digit right
 *    after requiring buf[1] == 'h', so they never matched; the digit is
 *    buf[2];
 *  - strtok() returns NULL when nothing follows "href"; that result is no
 *    longer passed to strlen();
 *  - isdigit() receives an unsigned char value, as <ctype.h> requires.
 *
 * NOTE(review): parts of this function are not visible in this listing
 * (the body of the first loop, any assignments to is_in_href, closing
 * braces).  `++p` in the <a> branch relies on what that loop leaves in p;
 * verify that it cannot step past the end of buf for a bare "<a>".
 * strtok() also keeps hidden static state and modifies buf in place.
 */
521 static HTMLState html_parse_tag(HTMLParser *parser)
523 gchar buf[HTMLBUFSIZE];
/* Remembers across calls whether the parser is between <a> and </a>. */
525 static gboolean is_in_href = FALSE;
527 html_get_parenthesis(parser, buf, sizeof(buf));
529 for (p = buf; *p != '\0'; p++) {
536 parser->state = HTML_UNKNOWN;
537 if (buf[0] == '\0') return parser->state;
541 if (!strcmp(buf, "br")) {
/* Line break: drop any pending space and end the line. */
542 parser->space = FALSE;
543 html_append_char(parser, '\n');
544 parser->state = HTML_BR;
545 } else if (!strcmp(buf, "a")) {
546 /* look for tokens separated by space or = */
547 char* href_token = strtok(++p, " =");
548 parser->state = HTML_NORMAL;
549 while (href_token != NULL) {
551 if (!strcmp(href_token, "href")) {
552 /* the next token is the url, between double quotes */
554 char* url = strtok(NULL, "\"");
/* No token after "href": there is nothing to print, and strlen(NULL)
 * would be undefined behaviour. */
555 if (url != NULL) html_append_str(parser, url, strlen(url));
556 html_append_char(parser, ' ');
557 /* start enforcing html link */
558 parser->state = HTML_HREF;
562 /* or get next token */
563 href_token = strtok(NULL, " =");
565 } else if (!strcmp(buf, "/a")) {
566 /* stop enforcing html link */
567 parser->state = HTML_NORMAL;
569 } else if (!strcmp(buf, "p")) {
/* Paragraph: make sure the output ends with one empty line. */
570 parser->space = FALSE;
571 if (!parser->empty_line) {
572 parser->space = FALSE;
573 if (!parser->newline) html_append_char(parser, '\n');
574 html_append_char(parser, '\n');
576 parser->state = HTML_PAR;
577 } else if (!strcmp(buf, "pre")) {
579 parser->state = HTML_PRE;
580 } else if (!strcmp(buf, "/pre")) {
582 parser->state = HTML_NORMAL;
583 } else if (!strcmp(buf, "hr")) {
/* Horizontal rule: start on a fresh line, then print the rule text. */
584 if (!parser->newline) {
585 parser->space = FALSE;
586 html_append_char(parser, '\n');
588 html_append_str(parser, HR_STR "\n", -1);
589 parser->state = HTML_HR;
590 } else if (!strcmp(buf, "div") ||
591 !strcmp(buf, "ul") ||
592 !strcmp(buf, "li") ||
593 !strcmp(buf, "table") ||
594 !strcmp(buf, "tr") ||
595 (buf[0] == 'h' && isdigit((guchar)buf[1]))) {
/* Block-level opening tags start on a fresh line. */
596 if (!parser->newline) {
597 parser->space = FALSE;
598 html_append_char(parser, '\n');
600 parser->state = HTML_NORMAL;
601 } else if (!strcmp(buf, "/table") ||
602 (buf[0] == '/' && buf[1] == 'h' && isdigit((guchar)buf[2]))) {
/* End of a table or heading: leave one empty line after it. */
603 if (!parser->empty_line) {
604 parser->space = FALSE;
605 if (!parser->newline) html_append_char(parser, '\n');
606 html_append_char(parser, '\n');
608 parser->state = HTML_NORMAL;
609 } else if (!strcmp(buf, "/div") ||
610 !strcmp(buf, "/ul") ||
611 !strcmp(buf, "/li")) {
/* Other block-level closing tags just end the current line. */
612 if (!parser->newline) {
613 parser->space = FALSE;
614 html_append_char(parser, '\n');
616 parser->state = HTML_NORMAL;
619 if (is_in_href == TRUE) {
620 /* when inside a link, everything will be written as
621 * clickable (see textview_show_html in textview.c) */
622 parser->state = HTML_HREF;
625 return parser->state;
/*
 * html_parse_special:
 * @parser: parser positioned on a '&'.
 *
 * Handles a character entity reference at the cursor.
 *  - If no ';' follows, or more than seven characters (counting the '&')
 *    precede it, the '&' is output literally and the state becomes
 *    HTML_NORMAL.
 *  - Otherwise the whole "&...;" text is copied into symbol_name (at most
 *    eight characters plus the terminator, hence the 9-byte array), the
 *    cursor is moved past the ';', and the text is looked up in the
 *    parser's symbol table; a hit appends the mapped string.
 *  - A miss of the form "&#<digits>;" is emitted as a single character
 *    when the value is 1..127, or 128..255 with an ISO-8859-1 converter.
 *  - Anything else is appended verbatim, leaving the state HTML_UNKNOWN.
 *
 * Change against the previous revision: isdigit() is given an unsigned
 * char value; the byte comes straight from the message and may be
 * negative as a plain gchar, which is undefined behaviour for <ctype.h>.
 *
 * NOTE(review): the early `return` statements and some declarations are
 * not visible in this listing.
 */
628 static void html_parse_special(HTMLParser *parser)
630 gchar symbol_name[9];
634 parser->state = HTML_UNKNOWN;
635 g_return_if_fail(*parser->bufp == '&');
/* n becomes the offset of the terminating ';' (or of the end of the
 * buffered input when there is none). */
638 for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
640 if (n > 7 || parser->bufp[n] != ';') {
641 /* output literal `&' */
642 html_append_char(parser, *parser->bufp++);
643 parser->state = HTML_NORMAL;
/* Size n + 2 covers the n + 1 characters "&...;" and the terminator. */
646 strncpy2(symbol_name, parser->bufp, n + 2);
647 parser->bufp += n + 1;
649 if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
651 html_append_str(parser, val, -1);
652 parser->state = HTML_NORMAL;
654 } else if (symbol_name[1] == '#' && isdigit((guchar)symbol_name[2])) {
/* Decimal numeric reference; symbol_name + 2 skips the "&#". */
657 ch = atoi(symbol_name + 2);
658 if ((ch > 0 && ch <= 127) ||
659 (ch >= 128 && ch <= 255 &&
660 parser->conv->charset == C_ISO_8859_1)) {
661 html_append_char(parser, ch);
662 parser->state = HTML_NORMAL;
/* Unknown entity: pass the original text through unchanged. */
667 html_append_str(parser, symbol_name, -1);
/*
 * html_get_parenthesis:
 * @parser: parser positioned on a '<'.
 * @buf:    receives the text of the tag.
 * @len:    size of @buf in bytes.
 *
 * Skips HTML comments, <style> blocks and <script> blocks, then copies
 * the tag text up to the next '>' into @buf and moves the cursor past
 * that '>'.  Whenever the wanted terminator is not yet in the buffered
 * input, further lines are read; the function returns early if the input
 * ends first.
 *
 * The search pointer p is recomputed on every loop iteration, after
 * html_read_line() has appended to (and possibly moved) the buffer.
 *
 * NOTE(review): presumably strncpy2() copies at most size - 1 bytes and
 * NUL-terminates -- confirm in its definition.  Some lines (declarations,
 * cursor adjustments between the sections) are not visible here.
 */
670 static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
675 g_return_if_fail(*parser->bufp == '<');
677 /* ignore comment / CSS / script stuff */
678 if (!strncmp(parser->bufp, "<!--", 4)) {
680 while ((p = strstr(parser->bufp, "-->")) == NULL)
681 if (html_read_line(parser) == HTML_EOF) return;
/* 3 == strlen("-->") */
682 parser->bufp = p + 3;
685 if (!g_strncasecmp(parser->bufp, "<style", 6)) {
687 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
688 if (html_read_line(parser) == HTML_EOF) return;
/* 8 == strlen("</style>") */
689 parser->bufp = p + 8;
692 if (!g_strncasecmp(parser->bufp, "<script", 7)) {
694 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
695 if (html_read_line(parser) == HTML_EOF) return;
/* 9 == strlen("</script>") */
696 parser->bufp = p + 9;
/* Ordinary tag: find its closing '>'. */
701 while ((p = strchr(parser->bufp, '>')) == NULL)
702 if (html_read_line(parser) == HTML_EOF) return;
/* The copy size is capped at len so an over-long tag cannot overrun buf. */
704 strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
705 parser->bufp = p + 1;
708 /* these hash functions were taken from gstring.c in glib */
/*
 * g_str_case_equal:
 * @v, @v2: NUL-terminated strings to compare.
 *
 * Returns non-zero when the two strings are equal under strcasecmp(),
 * i.e. ignoring case, and zero otherwise.
 */
710 static gint g_str_case_equal(gconstpointer v, gconstpointer v2)
712 return strcasecmp((const gchar *)v, (const gchar *)v2) == 0;
715 static guint g_str_case_hash(gconstpointer key)
717 const gchar *p = key;
722 for (p += 1; *p != '\0'; p++)
723 h = (h << 5) - h + tolower(*p);