2007-01-03 [paul] 2.6.1cvs96
[claws.git] / src / html.c
1 /*
2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 1999-2007 Hiroyuki Yamamoto and the Claws Mail team
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  */
19
20 #include <glib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <ctype.h>
24
25 #include "html.h"
26 #include "codeconv.h"
27 #include "utils.h"
28
29 #define SC_HTMLBUFSIZE  8192
30 #define HR_STR          "------------------------------------------------"
31
32 typedef struct _SC_HTMLSymbol   SC_HTMLSymbol;
33
34 struct _SC_HTMLSymbol
35 {
36         gchar *const key;
37         gchar *const val;
38 };
39
40 static SC_HTMLSymbol symbol_list[] = {
41         {"&lt;"    , "<"},
42         {"&gt;"    , ">"},
43         {"&amp;"   , "&"},
44         {"&quot;"  , "\""},
45         {"&lsquo;",  "'"},
46         {"&rsquo;",  "'"},
47         {"&ldquo;",  "\""},
48         {"&rdquo;",  "\""},
49         {"&nbsp;"  , " "},
50         {"&trade;" , "(TM)"},
51         {"&hellip;", "..."},
52         {"&bull;", "*"},
53         {"&ndash;", "-"},
54         {"&mdash;", "--"},
55 };
56
57 static SC_HTMLSymbol ascii_symbol_list[] = {
58         {"&iexcl;" , "\302\241"},
59         {"&brvbar;", "\302\246"},
60         {"&copy;"  , "\302\251"},
61         {"&laquo;" , "\302\253"},
62         {"&reg;"   , "\302\256"},
63
64         {"&sup2;"  , "\302\262"},
65         {"&sup3;"  , "\302\263"},
66         {"&acute;" , "\302\264"},
67         {"&cedil;" , "\302\270"},
68         {"&sup1;"  , "\302\271"},
69         {"&raquo;" , "\302\273"},
70         {"&frac14;", "\302\274"},
71         {"&frac12;", "\302\275"},
72         {"&frac34;", "\302\276"},
73         {"&iquest;", "\302\277"},
74
75         {"&Agrave;", "\303\200"},
76         {"&Aacute;", "\303\201"},
77         {"&Acirc;" , "\303\202"},
78         {"&Atilde;", "\303\203"},
79         {"&AElig;" , "\303\206"},
80         {"&Egrave;", "\303\210"},
81         {"&Eacute;", "\303\211"},
82         {"&Ecirc;" , "\303\212"},
83         {"&Igrave;", "\303\214"},
84         {"&Iacute;", "\303\215"},
85         {"&Icirc;" , "\303\216"},
86
87         {"&Ntilde;", "\303\221"},
88         {"&Ograve;", "\303\222"},
89         {"&Oacute;", "\303\223"},
90         {"&Ocirc;" , "\303\224"},
91         {"&Otilde;", "\303\225"},
92         {"&Ugrave;", "\303\231"},
93         {"&Uacute;", "\303\232"},
94         {"&Ucirc;" , "\303\233"},
95         {"&Yacute;", "\303\235"},
96
97         {"&agrave;", "\303\240"},
98         {"&aacute;", "\303\241"},
99         {"&acirc;" , "\303\242"},
100         {"&atilde;", "\303\243"},
101         {"&aelig;" , "\303\246"},
102         {"&egrave;", "\303\250"},
103         {"&eacute;", "\303\251"},
104         {"&ecirc;" , "\303\252"},
105         {"&igrave;", "\303\254"},
106         {"&iacute;", "\303\255"},
107         {"&icirc;" , "\303\256"},
108
109         {"&ntilde;", "\303\261"},
110         {"&ograve;", "\303\262"},
111         {"&oacute;", "\303\263"},
112         {"&ocirc;" , "\303\264"},
113         {"&otilde;", "\303\265"},
114         {"&ugrave;", "\303\271"},
115         {"&uacute;", "\303\272"},
116         {"&ucirc;" , "\303\273"},
117         {"&yacute;", "\303\275"},
118 };
119
120 static GHashTable *default_symbol_table;
121
122 static SC_HTMLState sc_html_read_line   (SC_HTMLParser  *parser);
123 static void sc_html_append_char                 (SC_HTMLParser  *parser,
124                                          gchar           ch);
125 static void sc_html_append_str                  (SC_HTMLParser  *parser,
126                                          const gchar    *str,
127                                          gint            len);
128 static SC_HTMLState sc_html_parse_tag   (SC_HTMLParser  *parser);
129 static void sc_html_parse_special               (SC_HTMLParser  *parser);
130 static void sc_html_get_parenthesis             (SC_HTMLParser  *parser,
131                                          gchar          *buf,
132                                          gint            len);
133
134
135 SC_HTMLParser *sc_html_parser_new(FILE *fp, CodeConverter *conv)
136 {
137         SC_HTMLParser *parser;
138
139         g_return_val_if_fail(fp != NULL, NULL);
140         g_return_val_if_fail(conv != NULL, NULL);
141
142         parser = g_new0(SC_HTMLParser, 1);
143         parser->fp = fp;
144         parser->conv = conv;
145         parser->str = g_string_new(NULL);
146         parser->buf = g_string_new(NULL);
147         parser->bufp = parser->buf->str;
148         parser->state = SC_HTML_NORMAL;
149         parser->href = NULL;
150         parser->newline = TRUE;
151         parser->empty_line = TRUE;
152         parser->space = FALSE;
153         parser->pre = FALSE;
154
155 #define SYMBOL_TABLE_ADD(table, list) \
156 { \
157         gint i; \
158  \
159         for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
160                 g_hash_table_insert(table, list[i].key, list[i].val); \
161 }
162
163         if (!default_symbol_table) {
164                 default_symbol_table =
165                         g_hash_table_new(g_str_hash, g_str_equal);
166                 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
167                 SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
168         }
169
170 #undef SYMBOL_TABLE_ADD
171
172         parser->symbol_table = default_symbol_table;
173
174         return parser;
175 }
176
177 void sc_html_parser_destroy(SC_HTMLParser *parser)
178 {
179         g_string_free(parser->str, TRUE);
180         g_string_free(parser->buf, TRUE);
181         g_free(parser->href);
182         g_free(parser);
183 }
184
185 gchar *sc_html_parse(SC_HTMLParser *parser)
186 {
187         parser->state = SC_HTML_NORMAL;
188         g_string_truncate(parser->str, 0);
189
190         if (*parser->bufp == '\0') {
191                 g_string_truncate(parser->buf, 0);
192                 parser->bufp = parser->buf->str;
193                 if (sc_html_read_line(parser) == SC_HTML_EOF)
194                         return NULL;
195         }
196
197         while (*parser->bufp != '\0') {
198                 switch (*parser->bufp) {
199                 case '<': {
200                         SC_HTMLState st;
201                         st = sc_html_parse_tag(parser);
202                         /* when we see an href, we need to flush the str
203                          * buffer.  Then collect all the chars until we
204                          * see the end anchor tag
205                          */
206                         if (SC_HTML_HREF_BEG == st || SC_HTML_HREF == st)
207                                 return parser->str->str;
208                         } 
209                         break;
210                 case '&':
211                         sc_html_parse_special(parser);
212                         break;
213                 case ' ':
214                 case '\t':
215                 case '\r':
216                 case '\n':
217                         if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
218                                 parser->bufp++;
219
220                         if (!parser->pre) {
221                                 if (!parser->newline)
222                                         parser->space = TRUE;
223
224                                 parser->bufp++;
225                                 break;
226                         }
227                         /* fallthrough */
228                 default:
229                         sc_html_append_char(parser, *parser->bufp++);
230                 }
231         }
232
233         return parser->str->str;
234 }
235
236 static SC_HTMLState sc_html_read_line(SC_HTMLParser *parser)
237 {
238         gchar buf[SC_HTMLBUFSIZE];
239         gchar buf2[SC_HTMLBUFSIZE];
240         gint index;
241
242         if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
243                 parser->state = SC_HTML_EOF;
244                 return SC_HTML_EOF;
245         }
246
247         if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
248                 index = parser->bufp - parser->buf->str;
249
250                 conv_utf8todisp(buf2, sizeof(buf2), buf);
251                 g_string_append(parser->buf, buf2);
252
253                 parser->bufp = parser->buf->str + index;
254
255                 return SC_HTML_CONV_FAILED;
256         }
257
258         index = parser->bufp - parser->buf->str;
259
260         g_string_append(parser->buf, buf2);
261
262         parser->bufp = parser->buf->str + index;
263
264         return SC_HTML_NORMAL;
265 }
266
267 static void sc_html_append_char(SC_HTMLParser *parser, gchar ch)
268 {
269         GString *str = parser->str;
270
271         if (!parser->pre && parser->space) {
272                 g_string_append_c(str, ' ');
273                 parser->space = FALSE;
274         }
275
276         g_string_append_c(str, ch);
277
278         parser->empty_line = FALSE;
279         if (ch == '\n') {
280                 parser->newline = TRUE;
281                 if (str->len > 1 && str->str[str->len - 2] == '\n')
282                         parser->empty_line = TRUE;
283         } else
284                 parser->newline = FALSE;
285 }
286
287 static void sc_html_append_str(SC_HTMLParser *parser, const gchar *str, gint len)
288 {
289         GString *string = parser->str;
290
291         if (!parser->pre && parser->space) {
292                 g_string_append_c(string, ' ');
293                 parser->space = FALSE;
294         }
295
296         if (len == 0) return;
297         if (len < 0)
298                 g_string_append(string, str);
299         else {
300                 gchar *s;
301                 Xstrndup_a(s, str, len, return);
302                 g_string_append(string, s);
303         }
304
305         parser->empty_line = FALSE;
306         if (string->len > 0 && string->str[string->len - 1] == '\n') {
307                 parser->newline = TRUE;
308                 if (string->len > 1 && string->str[string->len - 2] == '\n')
309                         parser->empty_line = TRUE;
310         } else
311                 parser->newline = FALSE;
312 }
313
314 static SC_HTMLTag *sc_html_get_tag(const gchar *str)
315 {
316         SC_HTMLTag *tag;
317         gchar *tmp;
318         guchar *tmpp;
319
320         g_return_val_if_fail(str != NULL, NULL);
321
322         if (*str == '\0' || *str == '!') return NULL;
323
324         Xstrdup_a(tmp, str, return NULL);
325
326         tag = g_new0(SC_HTMLTag, 1);
327
328         for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++)
329                 ;
330
331         if (*tmpp == '\0') {
332                 g_strdown(tmp);
333                 tag->name = g_strdup(tmp);
334                 return tag;
335         } else {
336                 *tmpp++ = '\0';
337                 g_strdown(tmp);
338                 tag->name = g_strdup(tmp);
339         }
340
341         while (*tmpp != '\0') {
342                 SC_HTMLAttr *attr;
343                 gchar *attr_name;
344                 gchar *attr_value;
345                 gchar *p;
346                 gchar quote;
347
348                 while (g_ascii_isspace(*tmpp)) tmpp++;
349                 attr_name = tmpp;
350
351                 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) &&
352                        *tmpp != '=')
353                         tmpp++;
354                 if (*tmpp != '\0' && *tmpp != '=') {
355                         *tmpp++ = '\0';
356                         while (g_ascii_isspace(*tmpp)) tmpp++;
357                 }
358
359                 if (*tmpp == '=') {
360                         *tmpp++ = '\0';
361                         while (g_ascii_isspace(*tmpp)) tmpp++;
362
363                         if (*tmpp == '"' || *tmpp == '\'') {
364                                 /* name="value" */
365                                 quote = *tmpp;
366                                 tmpp++;
367                                 attr_value = tmpp;
368                                 if ((p = strchr(attr_value, quote)) == NULL) {
369                                         g_warning("sc_html_get_tag(): syntax error in tag: '%s'\n", str);
370                                         return tag;
371                                 }
372                                 tmpp = p;
373                                 *tmpp++ = '\0';
374                                 while (g_ascii_isspace(*tmpp)) tmpp++;
375                         } else {
376                                 /* name=value */
377                                 attr_value = tmpp;
378                                 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++;
379                                 if (*tmpp != '\0')
380                                         *tmpp++ = '\0';
381                         }
382                 } else
383                         attr_value = "";
384
385                 g_strchomp(attr_name);
386                 g_strdown(attr_name);
387                 attr = g_new(SC_HTMLAttr, 1);
388                 attr->name = g_strdup(attr_name);
389                 attr->value = g_strdup(attr_value);
390                 tag->attr = g_list_append(tag->attr, attr);
391         }
392
393         return tag;
394 }
395
396 static void sc_html_free_tag(SC_HTMLTag *tag)
397 {
398         if (!tag) return;
399
400         g_free(tag->name);
401         while (tag->attr != NULL) {
402                 SC_HTMLAttr *attr = (SC_HTMLAttr *)tag->attr->data;
403                 g_free(attr->name);
404                 g_free(attr->value);
405                 g_free(attr);
406                 tag->attr = g_list_remove(tag->attr, tag->attr->data);
407         }
408         g_free(tag);
409 }
410
411 static SC_HTMLState sc_html_parse_tag(SC_HTMLParser *parser)
412 {
413         gchar buf[SC_HTMLBUFSIZE];
414         SC_HTMLTag *tag;
415
416         sc_html_get_parenthesis(parser, buf, sizeof(buf));
417
418         tag = sc_html_get_tag(buf);
419
420         parser->state = SC_HTML_UNKNOWN;
421         if (!tag) return SC_HTML_UNKNOWN;
422
423         if (!strcmp(tag->name, "br")) {
424                 parser->space = FALSE;
425                 sc_html_append_char(parser, '\n');
426                 parser->state = SC_HTML_BR;
427         } else if (!strcmp(tag->name, "a")) {
428                 GList *cur;
429                 for (cur = tag->attr; cur != NULL; cur = cur->next) {
430                         if (cur->data && !strcmp(((SC_HTMLAttr *)cur->data)->name, "href")) {
431                                 g_free(parser->href);
432                                 parser->href = g_strdup(((SC_HTMLAttr *)cur->data)->value);
433                                 parser->state = SC_HTML_HREF_BEG;
434                                 break;
435                         }
436                 }
437         } else if (!strcmp(tag->name, "/a")) {
438                 parser->state = SC_HTML_HREF;
439         } else if (!strcmp(tag->name, "p")) {
440                 parser->space = FALSE;
441                 if (!parser->empty_line) {
442                         parser->space = FALSE;
443                         if (!parser->newline) sc_html_append_char(parser, '\n');
444                         sc_html_append_char(parser, '\n');
445                 }
446                 parser->state = SC_HTML_PAR;
447         } else if (!strcmp(tag->name, "pre")) {
448                 parser->pre = TRUE;
449                 parser->state = SC_HTML_PRE;
450         } else if (!strcmp(tag->name, "/pre")) {
451                 parser->pre = FALSE;
452                 parser->state = SC_HTML_NORMAL;
453         } else if (!strcmp(tag->name, "hr")) {
454                 if (!parser->newline) {
455                         parser->space = FALSE;
456                         sc_html_append_char(parser, '\n');
457                 }
458                 sc_html_append_str(parser, HR_STR "\n", -1);
459                 parser->state = SC_HTML_HR;
460         } else if (!strcmp(tag->name, "div")    ||
461                    !strcmp(tag->name, "ul")     ||
462                    !strcmp(tag->name, "li")     ||
463                    !strcmp(tag->name, "table")  ||
464                    !strcmp(tag->name, "tr")     ||
465                    (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) {
466                 if (!parser->newline) {
467                         parser->space = FALSE;
468                         sc_html_append_char(parser, '\n');
469                 }
470                 parser->state = SC_HTML_NORMAL;
471         } else if (!strcmp(tag->name, "/table") ||
472                    (tag->name[0] == '/' &&
473                     tag->name[1] == 'h' &&
474                     g_ascii_isdigit(tag->name[1]))) {
475                 if (!parser->empty_line) {
476                         parser->space = FALSE;
477                         if (!parser->newline) sc_html_append_char(parser, '\n');
478                         sc_html_append_char(parser, '\n');
479                 }
480                 parser->state = SC_HTML_NORMAL;
481         } else if (!strcmp(tag->name, "/div")   ||
482                    !strcmp(tag->name, "/ul")    ||
483                    !strcmp(tag->name, "/li")) {
484                 if (!parser->newline) {
485                         parser->space = FALSE;
486                         sc_html_append_char(parser, '\n');
487                 }
488                 parser->state = SC_HTML_NORMAL;
489                         }
490
491         sc_html_free_tag(tag);
492
493         return parser->state;
494 }
495
496 static void sc_html_parse_special(SC_HTMLParser *parser)
497 {
498         gchar symbol_name[9];
499         gint n;
500         const gchar *val;
501
502         parser->state = SC_HTML_UNKNOWN;
503         g_return_if_fail(*parser->bufp == '&');
504
505         /* &foo; */
506         for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
507                 ;
508         if (n > 7 || parser->bufp[n] != ';') {
509                 /* output literal `&' */
510                 sc_html_append_char(parser, *parser->bufp++);
511                 parser->state = SC_HTML_NORMAL;
512                 return;
513         }
514         strncpy2(symbol_name, parser->bufp, n + 2);
515         parser->bufp += n + 1;
516
517         if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
518             != NULL) {
519                 sc_html_append_str(parser, val, -1);
520                 parser->state = SC_HTML_NORMAL;
521                 return;
522         } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) {
523                 gint ch;
524
525                 ch = atoi(symbol_name + 2);
526                 if ((ch > 0 && ch <= 127) ||
527                     (ch >= 128 && ch <= 255 &&
528                      parser->conv->charset == C_ISO_8859_1)) {
529                         sc_html_append_char(parser, ch);
530                         parser->state = SC_HTML_NORMAL;
531                         return;
532                 } else {
533                         char *symb = NULL;
534                         switch (ch) {
535                         /* http://www.w3schools.com/html/html_entitiesref.asp */
536                         case 96:        /* backtick  */
537                                 symb = "`";
538                                 break;
539                         case 153:       /* trademark */
540                                 symb = "(TM)";
541                                 break;
542                         case 338:       /* capital ligature OE  &OElig;  */
543                                 symb = "OE";  
544                                 break;
545                         case 339:       /* small ligature OE    &oelig;  */
546                                 symb = "oe";  
547                                 break;
548                         case 352:       /* capital S w/caron    &Scaron; */
549                         case 353:       /* small S w/caron      &scaron; */
550                         case 376:       /* cap Y w/ diaeres     &Yuml;   */
551                                 break;
552                         case 710:       /* circumflex accent    &circ;   */
553                                 symb = "^";  
554                                 break;
555                         case 732:       /* small tilde          &tilde;  */
556                                 symb = "~";  
557                                 break;
558                         case 8194:      /* en space             &ensp;   */
559                         case 8195:      /* em space             &emsp;   */
560                         case 8201:      /* thin space           &thinsp; */
561                                 symb = " ";  
562                                 break;
563                         case 8204:      /* zero width non-joiner &zwnj;  */
564                         case 8205:      /* zero width joiner    &zwj;   */
565                         case 8206:      /* l-t-r mark           &lrm;   */
566                         case 8207:      /* r-t-l mark           &rlm     */
567                                 break;
568                         case 8211:      /* en dash              &ndash;  */
569                                 symb = "-";  
570                                 break;
571                         case 8212:      /* em dash              &mdash;  */
572                                 symb = "--";  
573                                 break;
574                         case 8216:      /* l single quot mark   &lsquo;  */
575                         case 8217:      /* r single quot mark   &rsquo;  */
576                                 symb = "'";  
577                                 break;
578                         case 8218:      /* single low-9 quot    &sbquo;  */
579                                 symb = ",";  
580                                 break;
581                         case 8220:      /* l double quot mark   &ldquo;  */
582                         case 8221:      /* r double quot mark   &rdquo;  */
583                                 symb = "\"";  
584                                 break;
585                         case 8222:      /* double low-9 quot    &bdquo;  */
586                                 symb = ",,";  
587                                 break;
588                         case 8224:      /* dagger               &dagger; */
589                         case 8225:      /* double dagger        &Dagger; */
590                                 break;
591                         case 8226:      /* bullet       &bull;  */
592                                 symb = "*";  
593                                 break;
594                         case 8230:      /* horizontal ellipsis  &hellip; */
595                                 symb = "...";  
596                                 break;
597                         case 8240:      /* per mile             &permil; */
598                                 symb = "\%o";  
599                                 break;
600                         case 8249:      /* l-pointing angle quot &lsaquo; */
601                                 symb = "<";  
602                                 break;
603                         case 8250:      /* r-pointing angle quot &rsaquo; */
604                                 symb = ">";  
605                                 break;
606                         case 8364:      /* euro                 &euro;   */
607                                 symb = "EUR";  
608                                 break;
609                         case 8482:      /* trademark            &trade;  */
610                                 symb  = "(TM)";  
611                                 break;
612                         default: 
613                                 break;
614                         }
615                         if (symb) {
616                                 sc_html_append_str(parser, symb, -1);
617                                 parser->state = SC_HTML_NORMAL;
618                                 return;
619                         }
620                 }
621         }
622
623         sc_html_append_str(parser, symbol_name, -1);
624 }
625
626 static void sc_html_get_parenthesis(SC_HTMLParser *parser, gchar *buf, gint len)
627 {
628         gchar *p;
629
630         buf[0] = '\0';
631         g_return_if_fail(*parser->bufp == '<');
632
633         /* ignore comment / CSS / script stuff */
634         if (!strncmp(parser->bufp, "<!--", 4)) {
635                 parser->bufp += 4;
636                 while ((p = strstr(parser->bufp, "-->")) == NULL)
637                         if (sc_html_read_line(parser) == SC_HTML_EOF) return;
638                 parser->bufp = p + 3;
639                 return;
640         }
641         if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) {
642                 parser->bufp += 6;
643                 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
644                         if (sc_html_read_line(parser) == SC_HTML_EOF) return;
645                 parser->bufp = p + 8;
646                 return;
647         }
648         if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) {
649                 parser->bufp += 7;
650                 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
651                         if (sc_html_read_line(parser) == SC_HTML_EOF) return;
652                 parser->bufp = p + 9;
653                 return;
654         }
655
656         parser->bufp++;
657         while ((p = strchr(parser->bufp, '>')) == NULL)
658                 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
659
660         strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
661         g_strstrip(buf);
662         parser->bufp = p + 1;
663 }