RSSyl: Wrote more robust replacing of HTML character references. We now handle decima...
authorAndrej Kacian <ticho@claws-mail.org>
Thu, 27 Nov 2014 00:00:11 +0000 (01:00 +0100)
committerAndrej Kacian <ticho@claws-mail.org>
Thu, 27 Nov 2014 00:05:19 +0000 (01:05 +0100)
src/plugins/rssyl/strutils.c

index 3d9f1c5..b2704d9 100644 (file)
@@ -113,22 +113,25 @@ struct _RSSyl_HTMLSymbol
        gchar *const val;
 };
 
+/* TODO: find a way to offload this to a library which knows all the
+ * defined named entities (over 200). */
 static RSSyl_HTMLSymbol symbol_list[] = {
-       { "&lt;", "<" },
-       { "&gt;", ">" },
-       { "&amp;", "&" },
-       { "&quot;", "\"" },
-       { "&lsquo;",  "'" },
-       { "&rsquo;",  "'" },
-       { "&ldquo;",  "\"" },
-       { "&rdquo;",  "\"" },
-       { "&nbsp;", " " },
-       { "&trade;", "(TM)" },
-       { "&#153;", "(TM)" },
-       { "&#39;", "'" },
-       { "&hellip;", "..." },
-       { "&#8230;", "..." },
-       { "&mdash;", "-" },
+       { "lt", "<" },
+       { "gt", ">" },
+       { "amp", "&" },
+       { "apos", "'" },
+       { "quot", "\"" },
+       { "lsquo",  "‘" },
+       { "rsquo",  "’" },
+       { "ldquo",  "“" },
+       { "rdquo",  "”" },
+       { "nbsp", " " },
+       { "trade", "™" },
+       { "copy", "©" },
+       { "reg", "®" },
+       { "hellip", "…" },
+       { "mdash", "—" },
+       { "euro", "€" },
        { NULL, NULL }
 };
 
@@ -147,6 +150,71 @@ static RSSyl_HTMLSymbol tag_list[] = {
        { NULL, NULL }
 };
 
+static gchar *rssyl_replace_chrefs(gchar *string)
+{
+       char *new = g_malloc0(strlen(string)), *ret;
+       char buf[16], tmp[6];
+       int i, ii, j, n, len;
+       gunichar c;
+       gboolean valid, replaced;
+
+       /* &xx; */
+       ii = 0;
+       for (i = 0; i < strlen(string); ++i) {
+               if (string[i] == '&') {
+                       j = i+1;
+                       n = 0;
+                       valid = FALSE;
+                       while (string[j] != '\0' && j < 16) {
+                               if (string[j] != ';') {
+                                       buf[n++] = string[j];
+                               } else {
+                                       /* End of entity */
+                                       valid = TRUE;
+                                       buf[n] = '\0';
+                                       break;
+                               }
+                               j++;
+                       }
+                       if (strlen(buf) > 0 && valid) {
+                               replaced = FALSE;
+
+                               if (buf[0] == '#' && (c = atoi(buf+1)) > 0) {
+                                       len = g_unichar_to_utf8(c, tmp);
+                                       tmp[len] = '\0';
+                                       g_strlcat(new, tmp, strlen(string));
+                                       ii += len;
+                                       replaced = TRUE;
+                               } else {
+                                       for (c = 0; symbol_list[c].key != NULL; c++) {
+                                               if (!strcmp(buf, symbol_list[c].key)) {
+                                                       g_strlcat(new, symbol_list[c].val, strlen(string));
+                                                       ii += strlen(symbol_list[c].val);
+                                                       replaced = TRUE;
+                                                       break;
+                                               }
+                                       }
+                               }
+                               if (!replaced) {
+                                       new[ii++] = '&'; /* & */
+                                       g_strlcat(new, buf, strlen(string));
+                                       ii += strlen(buf);
+                                       new[ii++] = ';';
+                               }
+                               i = j;
+                       } else {
+                               new[ii++] = string[i];
+                       }
+               } else {
+                       new[ii++] = string[i];
+               }
+       }
+
+       ret = g_strdup(new);
+       g_free(new);
+       return ret;
+}
+
 gchar *rssyl_replace_html_stuff(gchar *text,
                gboolean symbols, gboolean tags)
 {
@@ -155,24 +223,18 @@ gchar *rssyl_replace_html_stuff(gchar *text,
 
        g_return_val_if_fail(text != NULL, NULL);
 
-       wtext = g_strdup(text);
-
-       /* Ugly, needlessly traverses the string again and again. Probably
-        * could use a rewrite. */
        if( symbols ) {
-               for( i = 0; symbol_list[i].key != NULL; i++ ) {
-                       if( g_strstr_len(text, strlen(text), symbol_list[i].key) ) {
-                               tmp = rssyl_strreplace(wtext, symbol_list[i].key, symbol_list[i].val);
-                               wtext = g_strdup(tmp);
-                               g_free(tmp);
-                       }
-               }
+               wtext = rssyl_replace_chrefs(text);
+       } else {
+               wtext = g_strdup(text);
        }
 
+       /* TODO: rewrite this part to work similarly to rssyl_replace_chrefs() */
        if( tags ) {
                for( i = 0; tag_list[i].key != NULL; i++ ) {
                        if( g_strstr_len(text, strlen(text), symbol_list[i].key) ) {
                                tmp = rssyl_strreplace(wtext, tag_list[i].key, tag_list[i].val);
+                               g_free(wtext);
                                wtext = g_strdup(tmp);
                                g_free(tmp);
                        }