Simplify comparison
[claws.git] / src / entity.c
index cc7229168aa7a281f7beef1570493ed1cf8681d8..058c3aa3d5aa588281aeb72dc261c51f1b8bf698 100644 (file)
@@ -20,9 +20,7 @@
 #include "claws-features.h"
 #endif
 
-#include "defs.h"
 #include "utils.h"
-#include "entity.h"
 
-#define ENTITY_MAX_LEN 8
+#define ENTITY_MAX_LEN 9 /* longest name ("thetasym", 8 chars) + NUL; names of ENTITY_MAX_LEN chars are rejected */
 #define DECODED_MAX_LEN 6
@@ -37,137 +35,285 @@ struct _EntitySymbol
        gchar *const value;
 };
 
+/* in alphabetical order with upper-case version first */
 static EntitySymbol symbolic_entities[] = {
-       /* in alphabetical order with upper-case version first */
-       {"Aacute", "\303\201"},
-       {"aacute", "\303\241"},
-       {"Acirc", "\303\202"},
-       {"acirc", "\303\242"},
-       {"acute", "\302\264"},
-       {"AElig", "\303\206"},
-       {"aelig", "\303\246"},
-       {"Agrave", "\303\200"},
-       {"agrave", "\303\240"},
-       {"amp", "&" },
-       {"apos", "'" },
-       {"Aring", "\303\205"},
-       {"aring", "\303\245"},
-       {"Atilde", "\303\203"},
-       {"atilde", "\303\243"},
-       {"Auml", "\303\204"},
-       {"auml", "\303\244"},
-       {"bdquo", "\342\200\236"},
-       {"brvbar", "\302\246"},
-       {"bull", "\342\200\242"},
-       {"Ccedil", "\303\207"},
-       {"ccedil", "\303\247"},
-       {"cedil", "\302\270"},
-       {"cent", "\302\242"},
-       {"circ", "\313\206"},
-       {"copy", "©" },
-       {"curren", "\302\244"},
-       {"dagger", "\342\200\240"},
-       {"Dagger", "\342\200\241"},
-       {"deg", "\302\260"},
-       {"divide", "\303\267"},
-       {"Eacute", "\303\211"},
-       {"eacute", "\303\251"},
-       {"Ecirc", "\303\212"},
-       {"ecirc", "\303\252"},
-       {"Egrave", "\303\210"},
-       {"egrave", "\303\250"},
-       {"emsp", "\342\200\203"},
-       {"ensp", "\342\200\202"},
-       {"ETH", "\303\220"},
-       {"eth", "\303\260"},
-       {"Euml", "\303\213"},
-       {"euml", "\303\253"},
-       {"euro", "€" },
-       {"frac12", "\302\275"},
-       {"frac14", "\302\274"},
-       {"frac34", "\302\276"},
-       {"gt", ">" },
-       {"hellip", "…" },
-       {"Iacute", "\303\215"},
-       {"iacute", "\303\255"},
-       {"Icirc", "\303\216"},
-       {"icirc", "\303\256"},
-       {"iexcl", "\302\241"},
-       {"Igrave", "\303\214"},
-       {"igrave", "\303\254"},
-       {"iquest", "\302\277"},
-       {"Iuml", "\303\217"},
-       {"iuml", "\303\257"},
-       {"laquo", "\302\253"},
-       {"ldquo",  "“" },
-       {"lsaquo", "\342\200\271"},
-       {"lsquo",  "‘" },
-       {"lt", "<" },
-       {"macr", "\302\257"},
-       {"mdash", "—" },
-       {"micro", "\302\265"},
-       {"middot", "\302\267"},
-       {"nbsp", " " },
-       {"ndash", "\342\200\223"},
-       {"not", "\302\254"},
-       {"Ntilde", "\303\221"},
-       {"ntilde", "\303\261"},
-       {"Oacute", "\303\223"},
-       {"oacute", "\303\263"},
-       {"Ocirc", "\303\224"},
-       {"ocirc", "\303\264"},
-       {"OElig", "\305\222"},
-       {"oelig", "\305\223"},
-       {"Ograve", "\303\222"},
-       {"ograve", "\303\262"},
-       {"ordf", "\302\252"},
-       {"ordm", "\302\272"},
-       {"Oslash", "\303\230"},
-       {"oslash", "\303\270"},
-       {"Otilde", "\303\225"},
-       {"otilde", "\303\265"},
-       {"Ouml", "\303\226"},
-       {"ouml", "\303\266"},
-       {"para", "\302\266"},
-       {"permil", "\342\200\260"},
-       {"plusmn", "\302\261"},
-       {"pound", "\302\243"},
-       {"quot", "\"" },
-       {"raquo", "\302\273"},
-       {"rdquo",  "”" },
-       {"reg", "®" },
-       {"rsaquo", "\342\200\272"},
-       {"rsquo",  "’" },
-       {"sbquo", "\342\200\232"},
-       {"Scaron", "\305\240"},
-       {"scaron", "\305\241"},
-       {"sect", "\302\247"},
-       {"shy", "\302\255"},
-       {"squot", "\47"},
-       {"sup1", "\302\271"},
-       {"sup2", "\302\262"},
-       {"sup3", "\302\263"},
-       {"szlig", "\303\237"},
-       {"thinsp", "\342\200\211"},
-       {"THORN", "\303\236"},
-       {"thorn", "\303\276"},
-       {"tilde", "\313\234"},
-       {"times", "\303\227"},
-       {"trade", "™" },
-       {"Uacute", "\303\232"},
-       {"uacute", "\303\272"},
-       {"Ucirc", "\303\233"},
-       {"ucirc", "\303\273"},
-       {"Ugrave", "\303\231"},
-       {"ugrave", "\303\271"},
-       {"uml", "\302\250"},
-       {"Uuml", "\303\234"},
-       {"uuml", "\303\274"},
-       {"Yacute", "\303\235"},
-       {"yacute", "\303\275"},
-       {"yen", "\302\245"},
-       {"yuml", "\303\277"},
-       {"Yuml", "\305\270"},
+       /* A */
+       {"Aacute", "Á"},
+       {"aacute", "á"},
+       {"Acirc", "Â"},
+       {"acirc", "â"},
+       {"acute", "´"},
+       {"AElig", "Æ"},
+       {"aelig", "æ"},
+       {"Agrave", "À"},
+       {"agrave", "à"},
+       {"alefsym", "ℵ"},
+       {"Alpha", "Α"},
+       {"alpha", "α"},
+       {"amp", "&"},
+       {"and", "∧"},
+       {"ang", "∠"},
+       {"apos", "'"},
+       {"Aring", "Å"},
+       {"aring", "å"},
+       {"asymp", "≈"},
+       {"Atilde", "Ã"},
+       {"atilde", "ã"},
+       {"Auml", "Ä"},
+       {"auml", "ä"},
+       /* B */
+       {"bdquo", "„"},
+       {"Beta", "Β"},
+       {"beta", "β"},
+       {"brvbar", "¦"},
+       {"bull", "•"},
+       /* C */
+       {"cap", "∩"},
+       {"Ccedil", "Ç"},
+       {"ccedil", "ç"},
+       {"cedil", "¸"},
+       {"cent", "¢"},
+       {"Chi", "Χ"},
+       {"chi", "χ"},
+       {"circ", "ˆ"},
+       {"clubs", "♣"},
+       {"cong", "≅"},
+       {"copy", "©"},
+       {"crarr", "↵"},
+       {"cup", "∪"},
+       {"curren", "¤"},
+       /* D */
+       {"Dagger", "‡"},
+       {"dagger", "†"},
+       {"dArr", "⇓"},
+       {"darr", "↓"},
+       {"deg", "°"},
+       {"Delta", "Δ"},
+       {"delta", "δ"},
+       {"diams", "♦"},
+       {"divide", "÷"},
+       /* E */
+       {"Eacute", "É"},
+       {"eacute", "é"},
+       {"Ecirc", "Ê"},
+       {"ecirc", "ê"},
+       {"Egrave", "È"},
+       {"egrave", "è"},
+       {"empty", "∅"},
+       {"emsp", "\xE2\x80\x83"},
+       {"ensp", "\xE2\x80\x82"},
+       {"Epsilon", "Ε"},
+       {"epsilon", "ε"},
+       {"equiv", "≡"},
+       {"Eta", "Η"},
+       {"eta", "η"},
+       {"ETH", "Ð"},
+       {"eth", "ð"},
+       {"Euml", "Ë"},
+       {"euml", "ë"},
+       {"euro", "€"},
+       {"exist", "∃"},
+       /* F */
+       {"fnof", "ƒ"},
+       {"forall", "∀"},
+       {"frac12", "½"},
+       {"frac14", "¼"},
+       {"frac34", "¾"},
+       {"frasl", "⁄"},
+       /* G */
+       {"Gamma", "Γ"},
+       {"gamma", "γ"},
+       {"ge", "≥"},
+       {"gt", ">"},
+       /* H */
+       {"hArr", "⇔"},
+       {"harr", "↔"},
+       {"hearts", "♥"},
+       {"hellip", "…"},
+       /* I */
+       {"Iacute", "Í"},
+       {"iacute", "í"},
+       {"Icirc", "Î"},
+       {"icirc", "î"},
+       {"iexcl", "¡"},
+       {"Igrave", "Ì"},
+       {"igrave", "ì"},
+       {"image", "ℑ"},
+       {"infin", "∞"},
+       {"int", "∫"},
+       {"Iota", "Ι"},
+       {"iota", "ι"},
+       {"iquest", "¿"},
+       {"isin", "∈"},
+       {"Iuml", "Ï"},
+       {"iuml", "ï"},
+       /* K */
+       {"Kappa", "Κ"},
+       {"kappa", "κ"},
+       /* L */
+       {"Lambda", "Λ"},
+       {"lambda", "λ"},
+       {"lang", "〈"},
+       {"laquo", "«"},
+       {"lArr", "⇐"}, /* lower-case L: leftwards double arrow */
+       {"larr", "←"},
+       {"lceil", "⌈"},
+       {"ldquo", "“"},
+       {"le", "≤"},
+       {"lfloor", "⌊"},
+       {"lowast", "∗"},
+       {"loz", "◊"},
+       {"lrm", "\xE2\x80\x8E"},
+       {"lsaquo", "‹"},
+       {"lsquo", "‘"},
+       {"lt", "<"},
+       /* M */
+       {"macr", "¯"},
+       {"mdash", "—"},
+       {"micro", "µ"},
+       {"middot", "·"},
+       {"minus", "−"},
+       {"Mu", "Μ"},
+       {"mu", "μ"},
+       /* N */
+       {"nabla", "∇"},
+       {"nbsp", "\xC2\xA0"},
+       {"ndash", "–"},
+       {"ne", "≠"},
+       {"ni", "∋"},
+       {"not", "¬"},
+       {"notin", "∉"},
+       {"nsub", "⊄"},
+       {"Ntilde", "Ñ"},
+       {"ntilde", "ñ"},
+       {"Nu", "Ν"},
+       {"nu", "ν"},
+       /* O */
+       {"Oacute", "Ó"},
+       {"oacute", "ó"},
+       {"Ocirc", "Ô"},
+       {"ocirc", "ô"},
+       {"OElig", "Œ"},
+       {"oelig", "œ"},
+       {"Ograve", "Ò"},
+       {"ograve", "ò"},
+       {"oline", "‾"},
+       {"Omega", "Ω"},
+       {"omega", "ω"},
+       {"Omicron", "Ο"},
+       {"omicron", "ο"},
+       {"oplus", "⊕"},
+       {"or", "∨"},
+       {"ordf", "ª"},
+       {"ordm", "º"},
+       {"Oslash", "Ø"},
+       {"oslash", "ø"},
+       {"Otilde", "Õ"},
+       {"otilde", "õ"},
+       {"otimes", "⊗"},
+       {"Ouml", "Ö"},
+       {"ouml", "ö"},
+       /* P */
+       {"para", "¶"},
+       {"part", "∂"},
+       {"permil", "‰"},
+       {"perp", "⊥"},
+       {"Phi", "Φ"},
+       {"phi", "φ"},
+       {"Pi", "Π"},
+       {"pi", "π"},
+       {"piv", "ϖ"},
+       {"plusmn", "±"},
+       {"pound", "£"},
+       {"Prime", "″"},
+       {"prime", "′"},
+       {"prod", "∏"},
+       {"prop", "∝"},
+       {"Psi", "Ψ"},
+       {"psi", "ψ"},
+       /* Q */
+       {"quot", "\""},
+       /* R */
+       {"radic", "√"},
+       {"rang", "〉"},
+       {"raquo", "»"},
+       {"rArr", "⇒"},
+       {"rarr", "→"},
+       {"rceil", "⌉"},
+       {"rdquo", "”"},
+       {"real", "ℜ"},
+       {"reg", "®"},
+       {"rfloor", "⌋"},
+       {"Rho", "Ρ"},
+       {"rho", "ρ"},
+       {"rlm", "\xE2\x80\x8F"},
+       {"rsaquo", "›"},
+       {"rsquo", "’"},
+       /* S */
+       {"sbquo", "‚"},
+       {"Scaron", "Š"},
+       {"scaron", "š"},
+       {"sdot", "⋅"},
+       {"sect", "§"},
+       {"shy", "\xC2\xAD"},
+       {"Sigma", "Σ"},
+       {"sigma", "σ"},
+       {"sigmaf", "ς"},
+       {"sim", "∼"},
+       {"spades", "♠"},
+       {"sub", "⊂"},
+       {"sube", "⊆"},
+       {"sum", "∑"},
+       {"sup", "⊃"},
+       {"sup1", "¹"},
+       {"sup2", "²"},
+       {"sup3", "³"},
+       {"supe", "⊇"},
+       {"szlig", "ß"},
+       /* T */
+       {"Tau", "Τ"},
+       {"tau", "τ"},
+       {"there4", "∴"},
+       {"Theta", "Θ"},
+       {"theta", "θ"},
+       {"thetasym", "ϑ"},
+       {"thinsp", "\xE2\x80\x89"},
+       {"THORN", "Þ"},
+       {"thorn", "þ"},
+       {"tilde", "˜"},
+       {"times", "×"},
+       {"trade", "™"},
+       /* U */
+       {"Uacute", "Ú"},
+       {"uacute", "ú"},
+       {"uArr", "⇑"},
+       {"uarr", "↑"},
+       {"Ucirc", "Û"},
+       {"ucirc", "û"},
+       {"Ugrave", "Ù"},
+       {"ugrave", "ù"},
+       {"uml", "¨"},
+       {"upsih", "ϒ"},
+       {"Upsilon", "Υ"},
+       {"upsilon", "υ"},
+       {"Uuml", "Ü"},
+       {"uuml", "ü"},
+       /* W */
+       {"weierp", "℘"},
+       /* X */
+       {"Xi", "Ξ"},
+       {"xi", "ξ"},
+       /* Y */
+       {"Yacute", "Ý"},
+       {"yacute", "ý"},
+       {"yen", "¥"},
+       {"Yuml", "Ÿ"},
+       {"yuml", "ÿ"},
+       /* Z */
+       {"Zeta", "Ζ"},
+       {"zeta", "ζ"},
+       {"zwj", "\xE2\x80\x8D"},
+       {"zwnj", "\xE2\x80\x8C"},
        {NULL, NULL}
 };
 
@@ -179,7 +325,7 @@ static gchar* entity_extract_to_buffer(gchar *p, gchar b[])
                b[i] = *p;
                ++i, ++p;
        }
-       if (*p != ';' || i == ENTITY_MAX_LEN)
+       if (*p != ';' || i == 0 || i == ENTITY_MAX_LEN)
                return NULL;
        b[i] = '\0';
 
@@ -191,7 +337,8 @@ static gchar *entity_decode_numeric(gchar *str)
        gchar b[ENTITY_MAX_LEN];
        gchar *p = str, *res;
        gboolean hex = FALSE;
-       gunichar c;
+       gunichar c = 0;
+       gint ret;
 
        ++p;
        if (*p == '\0')
@@ -207,9 +354,26 @@ static gchar *entity_decode_numeric(gchar *str)
        if (entity_extract_to_buffer (p, b) == NULL)
                return NULL;
 
-       c = g_ascii_strtoll (b, NULL, (hex? 16: 10));
+       if (strlen(b) > 0)
+               c = g_ascii_strtoll (b, NULL, (hex ? 16 : 10));
+
+       if (c < 32)
+               /* An unprintable character; return the Unicode replacement symbol */
+               return g_strdup("\xef\xbf\xbd");
+
+       if (!g_unichar_validate(c)) {
+               /* Make sure the character is valid Unicode */
+               debug_print("Numeric reference '&#%s;' is invalid in Unicode codespace\n", b);
+               return NULL;
+       }
+
        res = g_malloc0 (DECODED_MAX_LEN + 1);
-       g_unichar_to_utf8 (c, res);
+       ret = g_unichar_to_utf8 (c, res);
+       if (ret == 0) {
+               debug_print("Failed to convert unicode character %u to UTF-8\n", c);
+               g_free(res);
+               res = NULL;
+       }
 
        return res;
 }