Implement image handling
[claws.git] / src / html.c
1 /*
2  * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 1999-2016 Hiroyuki Yamamoto and the Claws Mail team
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #  include "config.h"
21 #include "claws-features.h"
22 #endif
23
24 #include <glib.h>
25 #include <stdio.h>
26 #include <string.h>
27 #include <ctype.h>
28
29 #include "html.h"
30 #include "codeconv.h"
31 #include "utils.h"
32 #include "entity.h"
33 #include "file-utils.h"
34
/* Size of the raw read buffer and of the scratch buffer that holds one tag. */
#define SC_HTMLBUFSIZE  8192

/* Text emitted in place of an <hr> tag. */
#define HR_STR          "────────────────────────────────────────────────"

/* Text emitted in front of every <li> item. */
#define LI_STR          "• "
38
39 static SC_HTMLState sc_html_read_line   (SC_HTMLParser  *parser);
40 static void sc_html_append_char                 (SC_HTMLParser  *parser,
41                                          gchar           ch);
42 static void sc_html_append_str                  (SC_HTMLParser  *parser,
43                                          const gchar    *str,
44                                          gint            len);
45 static SC_HTMLState sc_html_parse_tag   (SC_HTMLParser  *parser);
46 static void sc_html_parse_special               (SC_HTMLParser  *parser);
47 static void sc_html_get_parenthesis             (SC_HTMLParser  *parser,
48                                          gchar          *buf,
49                                          gint            len);
50
51
52 SC_HTMLParser *sc_html_parser_new(FILE *fp, CodeConverter *conv)
53 {
54         SC_HTMLParser *parser;
55
56         cm_return_val_if_fail(fp != NULL, NULL);
57         cm_return_val_if_fail(conv != NULL, NULL);
58
59         parser = g_new0(SC_HTMLParser, 1);
60         parser->fp = fp;
61         parser->conv = conv;
62         parser->str = g_string_new(NULL);
63         parser->buf = g_string_new(NULL);
64         parser->bufp = parser->buf->str;
65         parser->state = SC_HTML_NORMAL;
66         parser->href = NULL;
67         parser->newline = TRUE;
68         parser->empty_line = TRUE;
69         parser->space = FALSE;
70         parser->pre = FALSE;
71         parser->indent = 0;
72
73         return parser;
74 }
75
76 void sc_html_parser_destroy(SC_HTMLParser *parser)
77 {
78         g_string_free(parser->str, TRUE);
79         g_string_free(parser->buf, TRUE);
80         g_free(parser->href);
81         g_free(parser);
82 }
83
84 gchar *sc_html_parse(SC_HTMLParser *parser)
85 {
86         parser->state = SC_HTML_NORMAL;
87         g_string_truncate(parser->str, 0);
88
89         if (*parser->bufp == '\0') {
90                 g_string_truncate(parser->buf, 0);
91                 parser->bufp = parser->buf->str;
92                 if (sc_html_read_line(parser) == SC_HTML_EOF)
93                         return NULL;
94         }
95
96         while (*parser->bufp != '\0') {
97                 switch (*parser->bufp) {
98                 case '<': {
99                         SC_HTMLState st;
100                         st = sc_html_parse_tag(parser);
101                         /* when we see an href, we need to flush the str
102                          * buffer.  Then collect all the chars until we
103                          * see the end anchor tag
104                          */
105                         if (SC_HTML_HREF_BEG == st || SC_HTML_HREF == st)
106                                 return parser->str->str;
107                         } 
108                         break;
109                 case '&':
110                         sc_html_parse_special(parser);
111                         break;
112                 case ' ':
113                 case '\t':
114                 case '\r':
115                 case '\n':
116                         if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
117                                 parser->bufp++;
118
119                         if (!parser->pre) {
120                                 if (!parser->newline)
121                                         parser->space = TRUE;
122
123                                 parser->bufp++;
124                                 break;
125                         }
126                         /* fallthrough */
127                 default:
128                         sc_html_append_char(parser, *parser->bufp++);
129                 }
130         }
131
132         return parser->str->str;
133 }
134
135 static SC_HTMLState sc_html_read_line(SC_HTMLParser *parser)
136 {
137         gchar buf[SC_HTMLBUFSIZE];
138         gchar buf2[SC_HTMLBUFSIZE*4];
139         gint index;
140         gint n;
141
142         if (parser->fp == NULL)
143                 return SC_HTML_EOF;
144
145         n = claws_fread(buf, 1, sizeof(buf) - 1, parser->fp);
146         if (n == 0) {
147                 parser->state = SC_HTML_EOF;
148                 return SC_HTML_EOF;
149         } else
150                 buf[n] = '\0';
151
152         if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
153                 index = parser->bufp - parser->buf->str;
154
155                 conv_utf8todisp(buf2, sizeof(buf2), buf);
156                 g_string_append(parser->buf, buf2);
157
158                 parser->bufp = parser->buf->str + index;
159
160                 return SC_HTML_CONV_FAILED;
161         }
162
163         index = parser->bufp - parser->buf->str;
164
165         g_string_append(parser->buf, buf2);
166
167         parser->bufp = parser->buf->str + index;
168
169         return SC_HTML_NORMAL;
170 }
171
172 static void sc_html_append_char(SC_HTMLParser *parser, gchar ch)
173 {
174         GString *str = parser->str;
175
176         if (!parser->pre && parser->space) {
177                 g_string_append_c(str, ' ');
178                 parser->space = FALSE;
179         }
180
181         g_string_append_c(str, ch);
182
183         parser->empty_line = FALSE;
184         if (ch == '\n') {
185                 parser->newline = TRUE;
186                 if (str->len > 1 && str->str[str->len - 2] == '\n')
187                         parser->empty_line = TRUE;
188                 if (parser->indent > 0) {
189                         gint i, n = parser->indent;
190                         for (i = 0; i < n; i++)
191                                 g_string_append_c(str, '>');
192                         g_string_append_c(str, ' ');
193                 }
194         } else
195                 parser->newline = FALSE;
196 }
197
198 static void sc_html_append_str(SC_HTMLParser *parser, const gchar *str, gint len)
199 {
200         GString *string = parser->str;
201
202         if (!parser->pre && parser->space) {
203                 g_string_append_c(string, ' ');
204                 parser->space = FALSE;
205         }
206
207         if (len == 0) return;
208         if (len < 0)
209                 g_string_append(string, str);
210         else {
211                 gchar *s;
212                 Xstrndup_a(s, str, len, return);
213                 g_string_append(string, s);
214         }
215
216         parser->empty_line = FALSE;
217         if (string->len > 0 && string->str[string->len - 1] == '\n') {
218                 parser->newline = TRUE;
219                 if (string->len > 1 && string->str[string->len - 2] == '\n')
220                         parser->empty_line = TRUE;
221         } else
222                 parser->newline = FALSE;
223 }
224
225 static SC_HTMLTag *sc_html_get_tag(const gchar *str)
226 {
227         SC_HTMLTag *tag;
228         gchar *tmp;
229         guchar *tmpp;
230
231         cm_return_val_if_fail(str != NULL, NULL);
232
233         if (*str == '\0' || *str == '!') return NULL;
234
235         Xstrdup_a(tmp, str, return NULL);
236
237         tag = g_new0(SC_HTMLTag, 1);
238
239         for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++)
240                 ;
241
242         if (*tmpp == '\0') {
243                 tag->name = g_utf8_strdown(tmp, -1);
244                 return tag;
245         } else {
246                 *tmpp++ = '\0';
247                 tag->name = g_utf8_strdown(tmp, -1);
248         }
249
250         while (*tmpp != '\0') {
251                 SC_HTMLAttr *attr;
252                 gchar *attr_name;
253                 gchar *attr_value;
254                 gchar *p;
255                 gchar quote;
256
257                 while (g_ascii_isspace(*tmpp)) tmpp++;
258                 attr_name = tmpp;
259
260                 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) &&
261                        *tmpp != '=')
262                         tmpp++;
263                 if (*tmpp != '\0' && *tmpp != '=') {
264                         *tmpp++ = '\0';
265                         while (g_ascii_isspace(*tmpp)) tmpp++;
266                 }
267
268                 if (*tmpp == '=') {
269                         *tmpp++ = '\0';
270                         while (g_ascii_isspace(*tmpp)) tmpp++;
271
272                         if (*tmpp == '"' || *tmpp == '\'') {
273                                 /* name="value" */
274                                 quote = *tmpp;
275                                 tmpp++;
276                                 attr_value = tmpp;
277                                 if ((p = strchr(attr_value, quote)) == NULL) {
278                                         if (debug_get_mode()) {
279                                                 g_warning("sc_html_get_tag(): syntax error in tag: '%s'",
280                                                                   str);
281                                         } else {
282                                                 gchar *cut = g_strndup(str, 100);
283                                                 g_warning("sc_html_get_tag(): syntax error in tag: '%s%s'",
284                                                                   cut, strlen(str)>100?"...":".");
285                                                 g_free(cut);
286                                         }
287                                         return tag;
288                                 }
289                                 tmpp = p;
290                                 *tmpp++ = '\0';
291                                 while (g_ascii_isspace(*tmpp)) tmpp++;
292                         } else {
293                                 /* name=value */
294                                 attr_value = tmpp;
295                                 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++;
296                                 if (*tmpp != '\0')
297                                         *tmpp++ = '\0';
298                         }
299                 } else
300                         attr_value = "";
301
302                 g_strchomp(attr_name);
303                 attr = g_new(SC_HTMLAttr, 1);
304                 attr->name = g_utf8_strdown(attr_name, -1);
305                 attr->value = g_strdup(attr_value);
306                 tag->attr = g_list_append(tag->attr, attr);
307         }
308
309         return tag;
310 }
311
312 static void sc_html_free_tag(SC_HTMLTag *tag)
313 {
314         if (!tag) return;
315
316         g_free(tag->name);
317         while (tag->attr != NULL) {
318                 SC_HTMLAttr *attr = (SC_HTMLAttr *)tag->attr->data;
319                 g_free(attr->name);
320                 g_free(attr->value);
321                 g_free(attr);
322                 tag->attr = g_list_remove(tag->attr, tag->attr->data);
323         }
324         g_free(tag);
325 }
326
327 static void decode_href(SC_HTMLParser *parser)
328 {
329         gchar *tmp;
330         SC_HTMLParser *tparser = g_new0(SC_HTMLParser, 1);
331
332         tparser->str = g_string_new(NULL);
333         tparser->buf = g_string_new(parser->href);
334         tparser->bufp = tparser->buf->str;
335
336         tmp = sc_html_parse(tparser);
337         
338         g_free(parser->href);
339         parser->href = g_strdup(tmp);
340
341         sc_html_parser_destroy(tparser);
342 }
343
344 static SC_HTMLState sc_html_parse_tag(SC_HTMLParser *parser)
345 {
346         gchar buf[SC_HTMLBUFSIZE];
347         SC_HTMLTag *tag;
348
349         sc_html_get_parenthesis(parser, buf, sizeof(buf));
350
351         tag = sc_html_get_tag(buf);
352
353         parser->state = SC_HTML_UNKNOWN;
354         if (!tag) return SC_HTML_UNKNOWN;
355
356         if (!strcmp(tag->name, "br") || !strcmp(tag->name, "br/")) {
357                 parser->space = FALSE;
358                 sc_html_append_char(parser, '\n');
359                 parser->state = SC_HTML_BR;
360         } else if (!strcmp(tag->name, "a")) {
361                 GList *cur;
362                 if (parser->href != NULL) {
363                         g_free(parser->href);
364                         parser->href = NULL;
365                 }
366                 for (cur = tag->attr; cur != NULL; cur = cur->next) {
367                         if (cur->data && !strcmp(((SC_HTMLAttr *)cur->data)->name, "href")) {
368                                 g_free(parser->href);
369                                 parser->href = g_strdup(((SC_HTMLAttr *)cur->data)->value);
370                                 decode_href(parser);
371                                 parser->state = SC_HTML_HREF_BEG;
372                                 break;
373                         }
374                 }
375                 if (parser->href == NULL)
376                         parser->href = g_strdup("");
377                 parser->state = SC_HTML_HREF_BEG;
378         } else if (!strcmp(tag->name, "/a")) {
379                 parser->state = SC_HTML_HREF;
380         } else if (!strcmp(tag->name, "p")) {
381                 parser->space = FALSE;
382                 if (!parser->empty_line) {
383                         parser->space = FALSE;
384                         if (!parser->newline) sc_html_append_char(parser, '\n');
385                         sc_html_append_char(parser, '\n');
386                 }
387                 parser->state = SC_HTML_PAR;
388         } else if (!strcmp(tag->name, "pre")) {
389                 parser->pre = TRUE;
390                 parser->state = SC_HTML_PRE;
391         } else if (!strcmp(tag->name, "/pre")) {
392                 parser->pre = FALSE;
393                 parser->state = SC_HTML_NORMAL;
394         } else if (!strcmp(tag->name, "hr")) {
395                 if (!parser->newline) {
396                         parser->space = FALSE;
397                         sc_html_append_char(parser, '\n');
398                 }
399                 sc_html_append_str(parser, HR_STR, -1);
400                 sc_html_append_char(parser, '\n');
401                 parser->state = SC_HTML_HR;
402         } else if (!strcmp(tag->name, "div")    ||
403                    !strcmp(tag->name, "ul")     ||
404                    !strcmp(tag->name, "li")     ||
405                    !strcmp(tag->name, "table")  ||
406                    !strcmp(tag->name, "dd")     ||
407                    !strcmp(tag->name, "tr")) {
408                 if (!parser->newline) {
409                         parser->space = FALSE;
410                         sc_html_append_char(parser, '\n');
411                 }
412                 if (!strcmp(tag->name, "li")) {
413                         sc_html_append_str(parser, LI_STR, -1);
414                 }
415                 parser->state = SC_HTML_NORMAL;
416         } else if (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1])) {
417                 if (!parser->newline) {
418                         parser->space = FALSE;
419                         sc_html_append_char(parser, '\n');
420                 }
421                 sc_html_append_char(parser, '\n');
422         } else if (!strcmp(tag->name, "blockquote")) {
423                 parser->state = SC_HTML_NORMAL;
424                 parser->indent++;
425         } else if (!strcmp(tag->name, "/blockquote")) {
426                 parser->state = SC_HTML_NORMAL;
427                 parser->indent--;
428         } else if (!strcmp(tag->name, "/table") ||
429                    (tag->name[0] == '/' &&
430                     tag->name[1] == 'h' &&
431                     g_ascii_isdigit(tag->name[2]))) {
432                 if (!parser->empty_line) {
433                         parser->space = FALSE;
434                         if (!parser->newline) sc_html_append_char(parser, '\n');
435                         sc_html_append_char(parser, '\n');
436                 }
437                 parser->state = SC_HTML_NORMAL;
438         } else if (!strcmp(tag->name, "/div")   ||
439                    !strcmp(tag->name, "/ul")    ||
440                    !strcmp(tag->name, "/li")) {
441                 if (!parser->newline) {
442                         parser->space = FALSE;
443                         sc_html_append_char(parser, '\n');
444                 }
445                 parser->state = SC_HTML_NORMAL;
446                         }
447
448         sc_html_free_tag(tag);
449
450         return parser->state;
451 }
452
453 static void sc_html_parse_special(SC_HTMLParser *parser)
454 {
455         gchar *entity;
456
457         parser->state = SC_HTML_UNKNOWN;
458         cm_return_if_fail(*parser->bufp == '&');
459
460         entity = entity_decode(parser->bufp);
461         if (entity != NULL) {
462                 sc_html_append_str(parser, entity, -1);
463                 g_free(entity);
464                 while (*parser->bufp++ != ';');
465         } else {
466                 /* output literal `&' */
467                 sc_html_append_char(parser, *parser->bufp++);
468         }
469         parser->state = SC_HTML_NORMAL;
470 }
471
472 static gchar *sc_html_find_tag(SC_HTMLParser *parser, const gchar *tag)
473 {
474         gchar *cur = parser->bufp;
475         gint len = strlen(tag);
476
477         if (cur == NULL)
478                 return NULL;
479
480         while ((cur = strstr(cur, "<")) != NULL) {
481                 if (!g_ascii_strncasecmp(cur, tag, len))
482                         return cur;
483                 cur += 2;
484         }
485         return NULL;
486 }
487
488 static void sc_html_get_parenthesis(SC_HTMLParser *parser, gchar *buf, gint len)
489 {
490         gchar *p;
491
492         buf[0] = '\0';
493         cm_return_if_fail(*parser->bufp == '<');
494
495         /* ignore comment / CSS / script stuff */
496         if (!strncmp(parser->bufp, "<!--", 4)) {
497                 parser->bufp += 4;
498                 while ((p = strstr(parser->bufp, "-->")) == NULL)
499                         if (sc_html_read_line(parser) == SC_HTML_EOF) return;
500                 parser->bufp = p + 3;
501                 return;
502         }
503         if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) {
504                 parser->bufp += 6;
505                 while ((p = sc_html_find_tag(parser, "</style>")) == NULL)
506                         if (sc_html_read_line(parser) == SC_HTML_EOF) return;
507                 parser->bufp = p + 8;
508                 return;
509         }
510         if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) {
511                 parser->bufp += 7;
512                 while ((p = sc_html_find_tag(parser, "</script>")) == NULL)
513                         if (sc_html_read_line(parser) == SC_HTML_EOF) return;
514                 parser->bufp = p + 9;
515                 return;
516         }
517
518         parser->bufp++;
519         while ((p = strchr(parser->bufp, '>')) == NULL)
520                 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
521
522         strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
523         g_strstrip(buf);
524         parser->bufp = p + 1;
525 }