From: Andrej Kacian Date: Thu, 27 Nov 2014 00:00:11 +0000 (+0100) Subject: RSSyl: Wrote more robust replacing of HTML character references. We now handle decima... X-Git-Tag: 3.12.0~211 X-Git-Url: http://git.claws-mail.org/?p=claws.git;a=commitdiff_plain;h=491667f21ad76f34145ac78e7682f115763fe1e0 RSSyl: Wrote more robust replacing of HTML character references. We now handle decimal references (Ӓ), in addition to few selected named references. --- diff --git a/src/plugins/rssyl/strutils.c b/src/plugins/rssyl/strutils.c index 3d9f1c53f..b2704d973 100644 --- a/src/plugins/rssyl/strutils.c +++ b/src/plugins/rssyl/strutils.c @@ -113,22 +113,25 @@ struct _RSSyl_HTMLSymbol gchar *const val; }; +/* TODO: find a way to offload this to a library which knows all the + * defined named entities (over 200). */ static RSSyl_HTMLSymbol symbol_list[] = { - { "<", "<" }, - { ">", ">" }, - { "&", "&" }, - { """, "\"" }, - { "‘", "'" }, - { "’", "'" }, - { "“", "\"" }, - { "”", "\"" }, - { " ", " " }, - { "™", "(TM)" }, - { "™", "(TM)" }, - { "'", "'" }, - { "…", "..." }, - { "…", "..." }, - { "—", "-" }, + { "lt", "<" }, + { "gt", ">" }, + { "amp", "&" }, + { "apos", "'" }, + { "quot", "\"" }, + { "lsquo", "‘" }, + { "rsquo", "’" }, + { "ldquo", "“" }, + { "rdquo", "”" }, + { "nbsp", " " }, + { "trade", "™" }, + { "copy", "©" }, + { "reg", "®" }, + { "hellip", "…" }, + { "mdash", "—" }, + { "euro", "€" }, { NULL, NULL } }; @@ -147,6 +150,71 @@ static RSSyl_HTMLSymbol tag_list[] = { { NULL, NULL } }; +static gchar *rssyl_replace_chrefs(gchar *string) +{ + char *new = g_malloc0(strlen(string)), *ret; + char buf[16], tmp[6]; + int i, ii, j, n, len; + gunichar c; + gboolean valid, replaced; + + /* &xx; */ + ii = 0; + for (i = 0; i < strlen(string); ++i) { + if (string[i] == '&') { + j = i+1; + n = 0; + valid = FALSE; + while (string[j] != '\0' && j < 16) { + if (string[j] != ';') { + buf[n++] = string[j]; + } else { + /* End of entity */ + valid = TRUE; + buf[n] = '\0'; + break; + } + j++; + } + if (strlen(buf) > 0 && valid) { + replaced = FALSE; + + if (buf[0] == '#' && (c = atoi(buf+1)) > 0) { + len = g_unichar_to_utf8(c, tmp); + tmp[len] = '\0'; + g_strlcat(new, tmp, strlen(string)); + ii += len; + replaced = TRUE; + } else { + for (c = 0; symbol_list[c].key != NULL; c++) { + if (!strcmp(buf, symbol_list[c].key)) { + g_strlcat(new, symbol_list[c].val, strlen(string)); + ii += strlen(symbol_list[c].val); + replaced = TRUE; + break; + } + } + } + if (!replaced) { + new[ii++] = '&'; /* & */ + g_strlcat(new, buf, strlen(string)); + ii += strlen(buf); + new[ii++] = ';'; + } + i = j; + } else { + new[ii++] = string[i]; + } + } else { + new[ii++] = string[i]; + } + } + + ret = g_strdup(new); + g_free(new); + return ret; +} + gchar *rssyl_replace_html_stuff(gchar *text, gboolean symbols, gboolean tags) { @@ -155,24 +223,18 @@ gchar *rssyl_replace_html_stuff(gchar *text, g_return_val_if_fail(text != NULL, NULL); - wtext = g_strdup(text); - - /* Ugly, needlessly traverses the string again and again. Probably - * could use a rewrite. */ if( symbols ) { - for( i = 0; symbol_list[i].key != NULL; i++ ) { - if( g_strstr_len(text, strlen(text), symbol_list[i].key) ) { - tmp = rssyl_strreplace(wtext, symbol_list[i].key, symbol_list[i].val); - wtext = g_strdup(tmp); - g_free(tmp); - } - } + wtext = rssyl_replace_chrefs(text); + } else { + wtext = g_strdup(text); } + /* TODO: rewrite this part to work similarly to rssyl_replace_chrefs() */ if( tags ) { for( i = 0; tag_list[i].key != NULL; i++ ) { if( g_strstr_len(text, strlen(text), symbol_list[i].key) ) { tmp = rssyl_strreplace(wtext, tag_list[i].key, tag_list[i].val); + g_free(wtext); wtext = g_strdup(tmp); g_free(tmp); }