/*
* Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
- * Copyright (C) 1999,2000 Hiroyuki Yamamoto
+ * Copyright (C) 1999-2003 Hiroyuki Yamamoto
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
gchar *buf,
gint len);
-#if 0
-static gint g_str_case_equal (gconstpointer v,
- gconstpointer v2);
-static guint g_str_case_hash (gconstpointer key);
-#endif
HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
{
parser->str = g_string_new(NULL);
parser->buf = g_string_new(NULL);
parser->bufp = parser->buf->str;
+ parser->state = HTML_NORMAL;
+ parser->href = NULL;
parser->newline = TRUE;
parser->empty_line = TRUE;
parser->space = FALSE;
{
g_string_free(parser->str, TRUE);
g_string_free(parser->buf, TRUE);
+ g_free(parser->href);
g_free(parser);
}
while (*parser->bufp != '\0') {
switch (*parser->bufp) {
- case '<':
- if (parser->str->len == 0)
- html_parse_tag(parser);
- else
+ case '<': {
+ HTMLState st;
+ st = html_parse_tag(parser);
+ /* when we see an href, we need to flush the str
+ * buffer. Then collect all the chars until we
+ * see the end anchor tag
+ */
+ if (HTML_HREF_BEG == st || HTML_HREF == st)
return parser->str->str;
+ }
break;
case '&':
html_parse_special(parser);
}
if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
- g_warning("html_read_line(): code conversion failed\n");
-
index = parser->bufp - parser->buf->str;
- g_string_append(parser->buf, buf);
+ conv_localetodisp(buf2, sizeof(buf2), buf);
+ g_string_append(parser->buf, buf2);
parser->bufp = parser->buf->str + index;
- return HTML_ERR;
+ return HTML_CONV_FAILED;
}
index = parser->bufp - parser->buf->str;
parser->newline = FALSE;
}
+/*
+ * Split the text found between '<' and '>' into a newly allocated HTMLTag.
+ *
+ * str: the tag text without the angle brackets, e.g. "a href=\"x\"".
+ *
+ * The tag name and every attribute name are lower-cased.  Attribute values
+ * are copied as written, minus the surrounding quotes; an attribute given
+ * without "=value" gets the empty string as its value.
+ *
+ * Returns NULL when str is NULL, empty or starts with '!' (and whenever
+ * Xstrdup_a takes its failure action).  Otherwise returns a tag that the
+ * caller releases with html_free_tag().  If a quoted value is never closed,
+ * a warning is logged and the tag is returned with the attributes collected
+ * up to that point.
+ */
+static HTMLTag *html_get_tag(const gchar *str)
+{
+	HTMLTag *tag;
+	gchar *tmp;
+	gchar *tmpp;
+
+	g_return_val_if_fail(str != NULL, NULL);
+
+	if (*str == '\0' || *str == '!') return NULL;
+
+	/* Private copy of str whose separators get overwritten with NULs.
+	 * NOTE(review): presumably stack-allocated, as it is never freed
+	 * here -- confirm against the Xstrdup_a definition. */
+	Xstrdup_a(tmp, str, return NULL);
+
+	tag = g_new0(HTMLTag, 1);
+
+	/* the tag name ends at the first whitespace character */
+	for (tmpp = tmp; *tmpp != '\0' && !isspace((guchar)*tmpp); tmpp++)
+		;
+	if (*tmpp != '\0')
+		*tmpp++ = '\0';
+	g_strdown(tmp);
+	tag->name = g_strdup(tmp);
+
+	/* whatever follows the name is a list of attributes */
+	while (*tmpp != '\0') {
+		HTMLAttr *attr;
+		gchar *attr_name;
+		gchar *attr_value;
+
+		while (isspace((guchar)*tmpp)) tmpp++;
+		/* only trailing whitespace was left: do not record an
+		 * attribute with an empty name */
+		if (*tmpp == '\0')
+			break;
+		attr_name = tmpp;
+
+		while (*tmpp != '\0' && !isspace((guchar)*tmpp) &&
+		       *tmpp != '=')
+			tmpp++;
+		if (*tmpp != '\0' && *tmpp != '=') {
+			/* terminate the name, then allow "name = value" */
+			*tmpp++ = '\0';
+			while (isspace((guchar)*tmpp)) tmpp++;
+		}
+
+		if (*tmpp == '=') {
+			*tmpp++ = '\0';
+			while (isspace((guchar)*tmpp)) tmpp++;
+
+			if (*tmpp == '"' || *tmpp == '\'') {
+				/* name="value" */
+				gchar quote = *tmpp;
+				gchar *p;
+
+				tmpp++;
+				attr_value = tmpp;
+				if ((p = strchr(attr_value, quote)) == NULL) {
+					g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
+					return tag;
+				}
+				tmpp = p;
+				*tmpp++ = '\0';
+				while (isspace((guchar)*tmpp)) tmpp++;
+			} else {
+				/* name=value */
+				attr_value = tmpp;
+				while (*tmpp != '\0' && !isspace((guchar)*tmpp))
+					tmpp++;
+				if (*tmpp != '\0')
+					*tmpp++ = '\0';
+			}
+		} else
+			attr_value = "";
+
+		g_strchomp(attr_name);
+		g_strdown(attr_name);
+		attr = g_new(HTMLAttr, 1);
+		attr->name = g_strdup(attr_name);
+		attr->value = g_strdup(attr_value);
+		tag->attr = g_list_append(tag->attr, attr);
+	}
+
+	return tag;
+}
+
+/*
+ * Release a tag built by html_get_tag(): its name, every attribute
+ * (name, value and the HTMLAttr itself), the attribute list and finally
+ * the tag structure.  A NULL tag is ignored.
+ */
+static void html_free_tag(HTMLTag *tag)
+{
+	GList *cur;
+
+	if (tag == NULL)
+		return;
+
+	g_free(tag->name);
+
+	/* free what each list node points at, then the nodes in one go */
+	for (cur = tag->attr; cur != NULL; cur = cur->next) {
+		HTMLAttr *attr = (HTMLAttr *)cur->data;
+
+		g_free(attr->name);
+		g_free(attr->value);
+		g_free(attr);
+	}
+	g_list_free(tag->attr);
+
+	g_free(tag);
+}
+
static HTMLState html_parse_tag(HTMLParser *parser)
{
gchar buf[HTMLBUFSIZE];
- gchar *p;
+ HTMLTag *tag;
html_get_parenthesis(parser, buf, sizeof(buf));
- for (p = buf; *p != '\0'; p++) {
- if (isspace(*p)) {
- *p = '\0';
- break;
- }
- }
+ tag = html_get_tag(buf);
parser->state = HTML_UNKNOWN;
- if (buf[0] == '\0') return parser->state;
-
- g_strdown(buf);
+ if (!tag) return HTML_UNKNOWN;
- if (!strcmp(buf, "br")) {
+ if (!strcmp(tag->name, "br")) {
parser->space = FALSE;
html_append_char(parser, '\n');
parser->state = HTML_BR;
- } else if (!strcmp(buf, "p")) {
+ } else if (!strcmp(tag->name, "a")) {
+ if (tag->attr && tag->attr->data &&
+ !strcmp(((HTMLAttr *)tag->attr->data)->name, "href")) {
+ g_free(parser->href);
+ parser->href =
+ g_strdup(((HTMLAttr *)tag->attr->data)->value);
+ parser->state = HTML_HREF_BEG;
+ }
+ } else if (!strcmp(tag->name, "/a")) {
+ parser->state = HTML_HREF;
+ } else if (!strcmp(tag->name, "p")) {
parser->space = FALSE;
if (!parser->empty_line) {
parser->space = FALSE;
html_append_char(parser, '\n');
}
parser->state = HTML_PAR;
- } else if (!strcmp(buf, "pre")) {
+ } else if (!strcmp(tag->name, "pre")) {
parser->pre = TRUE;
parser->state = HTML_PRE;
- } else if (!strcmp(buf, "/pre")) {
+ } else if (!strcmp(tag->name, "/pre")) {
parser->pre = FALSE;
parser->state = HTML_NORMAL;
- } else if (!strcmp(buf, "hr")) {
+ } else if (!strcmp(tag->name, "hr")) {
if (!parser->newline) {
parser->space = FALSE;
html_append_char(parser, '\n');
}
html_append_str(parser, HR_STR "\n", -1);
parser->state = HTML_HR;
- } else if (!strcmp(buf, "div") ||
- !strcmp(buf, "ul") ||
- !strcmp(buf, "li") ||
- !strcmp(buf, "table") ||
- !strcmp(buf, "tr") ||
- (buf[0] == 'h' && isdigit(buf[1]))) {
+ } else if (!strcmp(tag->name, "div") ||
+ !strcmp(tag->name, "ul") ||
+ !strcmp(tag->name, "li") ||
+ !strcmp(tag->name, "table") ||
+ !strcmp(tag->name, "tr") ||
+ (tag->name[0] == 'h' && isdigit((guchar)tag->name[1]))) {
if (!parser->newline) {
parser->space = FALSE;
html_append_char(parser, '\n');
}
parser->state = HTML_NORMAL;
- } else if (!strcmp(buf, "/table") ||
- (buf[0] == '/' && buf[1] == 'h' && isdigit(buf[1]))) {
+ } else if (!strcmp(tag->name, "/table") ||
+ (tag->name[0] == '/' &&
+ tag->name[1] == 'h' &&
+ isdigit((guchar)tag->name[1]))) {
if (!parser->empty_line) {
parser->space = FALSE;
if (!parser->newline) html_append_char(parser, '\n');
html_append_char(parser, '\n');
}
parser->state = HTML_NORMAL;
- } else if (!strcmp(buf, "/div") ||
- !strcmp(buf, "/ul") ||
- !strcmp(buf, "/li")) {
+ } else if (!strcmp(tag->name, "/div") ||
+ !strcmp(tag->name, "/ul") ||
+ !strcmp(tag->name, "/li")) {
if (!parser->newline) {
parser->space = FALSE;
html_append_char(parser, '\n');
}
parser->state = HTML_NORMAL;
- }
+ }
+
+ html_free_tag(tag);
return parser->state;
}
html_append_str(parser, val, -1);
parser->state = HTML_NORMAL;
return;
- } else if (symbol_name[1] == '#' && isdigit(symbol_name[2])) {
+ } else if (symbol_name[1] == '#' && isdigit((guchar)symbol_name[2])) {
gint ch;
ch = atoi(symbol_name + 2);
html_append_char(parser, ch);
parser->state = HTML_NORMAL;
return;
+ } else {
+ char *symb = NULL;
+ switch (ch) {
+ /* http://www.w3schools.com/html/html_entitiesref.asp */
+ case 338: /* capital ligature OE Œ */
+ symb = "OE";
+ break;
+ case 339: /* small ligature OE œ */
+ symb = "oe";
+ break;
+ case 352: /* capital S w/caron Š */
+ case 353: /* small S w/caron š */
+ case 376: /* cap Y w/ diaeres Ÿ */
+ break;
+ case 710: /* circumflex accent ˆ */
+ symb = "^";
+ break;
+ case 732: /* small tilde ˜ */
+ symb = "~";
+ break;
+ case 8194: /* en space   */
+ case 8195: /* em space   */
+ case 8201: /* thin space   */
+ symb = " ";
+ break;
+ case 8204: /* zero width non-joiner ‌ */
+ case 8205: /* zero width joiner ‍ */
+ case 8206: /* l-t-r mark ‎ */
+ case 8207: /* r-t-l mark &rlm */
+ break;
+ case 8211: /* en dash – */
+ symb = "-";
+ break;
+ case 8212: /* em dash — */
+ symb = "--";
+ break;
+ case 8216: /* l single quot mark ‘ */
+ symb = "`";
+ break;
+ case 8217: /* r single quot mark ’ */
+ symb = "'";
+ break;
+ case 8218: /* single low-9 quot ‚ */
+ symb = ",";
+ break;
+ case 8220: /* l double quot mark “ */
+ symb = "``";
+ break;
+ case 8221: /* r double quot mark ” */
+ symb = "''";
+ break;
+ case 8222: /* double low-9 quot „ */
+ symb = ",,";
+ break;
+ case 8224: /* dagger † */
+ case 8225: /* double dagger ‡ */
+ break;
+ case 8230: /* horizontal ellipsis … */
+ symb = "...";
+ break;
+ case 8240: /* per mile ‰ */
+ symb = "\%o";
+ break;
+ case 8249: /* l-pointing angle quot ‹ */
+ symb = "<";
+ break;
+ case 8250: /* r-pointing angle quot › */
+ symb = ">";
+ break;
+ case 8364: /* euro € */
+ symb = "EUR";
+ break;
+ case 8482: /* trademark ™ */
+ symb = "(TM)";
+ break;
+ default:
+ break;
+ }
+ if (symb) {
+ html_append_str(parser, symb, -1);
+ parser->state = HTML_NORMAL;
+ return;
+ }
}
}
buf[0] = '\0';
g_return_if_fail(*parser->bufp == '<');
- /* ignore comments */
+ /* ignore comment / CSS / script stuff */
if (!strncmp(parser->bufp, "<!--", 4)) {
parser->bufp += 4;
while ((p = strstr(parser->bufp, "-->")) == NULL)
parser->bufp = p + 3;
return;
}
+ if (!g_strncasecmp(parser->bufp, "<style", 6)) {
+ parser->bufp += 6;
+ while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
+ if (html_read_line(parser) == HTML_EOF) return;
+ parser->bufp = p + 8;
+ return;
+ }
+ if (!g_strncasecmp(parser->bufp, "<script", 7)) {
+ parser->bufp += 7;
+ while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
+ if (html_read_line(parser) == HTML_EOF) return;
+ parser->bufp = p + 9;
+ return;
+ }
parser->bufp++;
while ((p = strchr(parser->bufp, '>')) == NULL)
if (html_read_line(parser) == HTML_EOF) return;
strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
+ g_strstrip(buf);
parser->bufp = p + 1;
}
-
-/* these hash functions were taken from gstring.c in glib */
-#if 0
-static gint g_str_case_equal(gconstpointer v, gconstpointer v2)
-{
- return strcasecmp((const gchar *)v, (const gchar *)v2) == 0;
-}
-
-static guint g_str_case_hash(gconstpointer key)
-{
- const gchar *p = key;
- guint h = *p;
-
- if (h) {
- h = tolower(h);
- for (p += 1; *p != '\0'; p++)
- h = (h << 5) - h + tolower(*p);
- }
-
- return h;
-}
-#endif