fcf4c2801350845a08625a3193beab95601cd9d0
[claws.git] / src / entity.c
1 /*
2  * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 2017 Ricardo Mones and the Claws Mail team
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #include "claws-features.h"
21 #endif
22
23 #include "utils.h"
24
/*
 * Size, in bytes, of the scratch buffers that receive an entity name (or the
 * digits of a numeric reference), including the terminating NUL.
 * entity_extract_to_buffer() rejects anything that would not leave room for
 * the NUL, so the longest accepted name is ENTITY_MAX_LEN - 1 characters.
 * The longest key in symbolic_entities is "thetasym" (8 characters), hence 9.
 */
#define ENTITY_MAX_LEN 9
/*
 * Size, excluding the terminating NUL, of the buffer handed to
 * g_unichar_to_utf8(), which GLib documents as needing at least 6 bytes.
 */
#define DECODED_MAX_LEN 6
27
/*
 * Lookup table mapping entity names to their replacement strings.
 * Starts out NULL and is populated from symbolic_entities by
 * entity_decode_symbol() the first time a symbolic entity is decoded.
 * Nothing in the code shown here ever destroys it.
 */
static GHashTable *symbol_table = NULL;
29
30 typedef struct _EntitySymbol EntitySymbol;
31
32 struct _EntitySymbol
33 {
34         gchar *const key;
35         gchar *const value;
36 };
37
38 /* in alphabetical order with upper-case version first */
39 static EntitySymbol symbolic_entities[] = {
40         /* A */
41         {"Aacute", "Á"},
42         {"aacute", "á"},
43         {"Acirc", "Â"},
44         {"acirc", "â"},
45         {"acute", "´"},
46         {"AElig", "Æ"},
47         {"aelig", "æ"},
48         {"Agrave", "À"},
49         {"agrave", "à"},
50         {"alefsym", "ℵ"},
51         {"Alpha", "Α"},
52         {"alpha", "α"},
53         {"amp", "&"},
54         {"and", "∧"},
55         {"ang", "∠"},
56         {"apos", "'"},
57         {"Aring", "Å"},
58         {"aring", "å"},
59         {"asymp", "≈"},
60         {"Atilde", "Ã"},
61         {"atilde", "ã"},
62         {"Auml", "Ä"},
63         {"auml", "ä"},
64         /* B */
65         {"bdquo", "„"},
66         {"Beta", "Β"},
67         {"beta", "β"},
68         {"brvbar", "¦"},
69         {"bull", "•"},
70         /* C */
71         {"cap", "∩"},
72         {"Ccedil", "Ç"},
73         {"ccedil", "ç"},
74         {"cedil", "¸"},
75         {"cent", "¢"},
76         {"Chi", "Χ"},
77         {"chi", "χ"},
78         {"circ", "ˆ"},
79         {"clubs", "♣"},
80         {"cong", "≅"},
81         {"copy", "©"},
82         {"crarr", "↵"},
83         {"cup", "∪"},
84         {"curren", "¤"},
85         /* D */
86         {"dagger", "†"},
87         {"Dagger", "‡"},
88         {"dArr", "⇓"},
89         {"darr", "↓"},
90         {"deg", "°"},
91         {"Delta", "Δ"},
92         {"delta", "δ"},
93         {"diams", "♦"},
94         {"divide", "÷"},
95         /* E */
96         {"Eacute", "É"},
97         {"eacute", "é"},
98         {"Ecirc", "Ê"},
99         {"ecirc", "ê"},
100         {"Egrave", "È"},
101         {"egrave", "è"},
102         {"empty", "∅"},
103         {"emsp", "\xE2\x80\x83"},
104         {"ensp", "\xE2\x80\x82"},
105         {"Epsilon", "Ε"},
106         {"epsilon", "ε"},
107         {"equiv", "≡"},
108         {"Eta", "Η"},
109         {"eta", "η"},
110         {"ETH", "Ð"},
111         {"eth", "ð"},
112         {"Euml", "Ë"},
113         {"euml", "ë"},
114         {"euro", "€"},
115         {"exist", "∃"},
116         /* F */
117         {"fnof", "ƒ"},
118         {"forall", "∀"},
119         {"frac12", "½"},
120         {"frac14", "¼"},
121         {"frac34", "¾"},
122         {"frasl", "⁄"},
123         /* G */
124         {"Gamma", "Γ"},
125         {"gamma", "γ"},
126         {"ge", "≥"},
127         {"gt", ">"},
128         /* H */
129         {"hArr", "⇔"},
130         {"harr", "↔"},
131         {"hearts", "♥"},
132         {"hellip", "…"},
133         /* I */
134         {"Iacute", "Í"},
135         {"iacute", "í"},
136         {"IArr", "⇐"},
137         {"Icirc", "Î"},
138         {"icirc", "î"},
139         {"iexcl", "¡"},
140         {"Igrave", "Ì"},
141         {"igrave", "ì"},
142         {"image", "ℑ"},
143         {"infin", "∞"},
144         {"int", "∫"},
145         {"Iota", "Ι"},
146         {"iota", "ι"},
147         {"iquest", "¿"},
148         {"isin", "∈"},
149         {"Iuml", "Ï"},
150         {"iuml", "ï"},
151         /* K */
152         {"Kappa", "Κ"},
153         {"kappa", "κ"},
154         /* L */
155         {"Lambda", "Λ"},
156         {"lambda", "λ"},
157         {"lang", "〈"},
158         {"laquo", "«"},
159         {"larr", "←"},
160         {"lceil", "⌈"},
161         {"ldquo", "“"},
162         {"le", "≤"},
163         {"lfloor", "⌊"},
164         {"lowast", "∗"},
165         {"loz", "◊"},
166         {"lrm", "\xE2\x80\x8E"},
167         {"lsaquo", "‹"},
168         {"lsquo", "‘"},
169         {"lt", "<"},
170         /* M */
171         {"macr", "¯"},
172         {"mdash", "—"},
173         {"micro", "µ"},
174         {"middot", "·"},
175         {"minus", "−"},
176         {"Mu", "Μ"},
177         {"mu", "μ"},
178         /* N */
179         {"nabla", "∇"},
180         {"nbsp", "\xC2\xA0"},
181         {"ndash", "–"},
182         {"ne", "≠"},
183         {"ni", "∋"},
184         {"not", "¬"},
185         {"notin", "∉"},
186         {"nsub", "⊄"},
187         {"Ntilde", "Ñ"},
188         {"ntilde", "ñ"},
189         {"Nu", "Ν"},
190         {"nu", "ν"},
191         /* O */
192         {"Oacute", "Ó"},
193         {"oacute", "ó"},
194         {"Ocirc", "Ô"},
195         {"ocirc", "ô"},
196         {"OElig", "Œ"},
197         {"oelig", "œ"},
198         {"Ograve", "Ò"},
199         {"ograve", "ò"},
200         {"oline", "‾"},
201         {"Omega", "Ω"},
202         {"omega", "ω"},
203         {"Omicron", "Ο"},
204         {"omicron", "ο"},
205         {"oplus", "⊕"},
206         {"or", "∨"},
207         {"ordf", "ª"},
208         {"ordm", "º"},
209         {"Oslash", "Ø"},
210         {"oslash", "ø"},
211         {"Otilde", "Õ"},
212         {"otilde", "õ"},
213         {"otimes", "⊗"},
214         {"Ouml", "Ö"},
215         {"ouml", "ö"},
216         /* P */
217         {"para", "¶"},
218         {"part", "∂"},
219         {"permil", "‰"},
220         {"perp", "⊥"},
221         {"Phi", "Φ"},
222         {"phi", "φ"},
223         {"Pi", "Π"},
224         {"pi", "π"},
225         {"piv", "ϖ"},
226         {"plusmn", "±"},
227         {"pound", "£"},
228         {"Prime", "″"},
229         {"prime", "′"},
230         {"prod", "∏"},
231         {"prop", "∝"},
232         {"Psi", "Ψ"},
233         {"psi", "ψ"},
234         /* Q */
235         {"quot", "\""},
236         /* R */
237         {"radic", "√"},
238         {"rang", "〉"},
239         {"raquo", "»"},
240         {"rArr", "⇒"},
241         {"rarr", "→"},
242         {"rceil", "⌉"},
243         {"rdquo", "”"},
244         {"real", "ℜ"},
245         {"reg", "®"},
246         {"rfloor", "⌋"},
247         {"Rho", "Ρ"},
248         {"rho", "ρ"},
249         {"rlm", "\xE2\x80\x8F"},
250         {"rsaquo", "›"},
251         {"rsquo", "’"},
252         /* S */
253         {"sbquo", "‚"},
254         {"Scaron", "Š"},
255         {"scaron", "š"},
256         {"sdot", "⋅"},
257         {"sect", "§"},
258         {"shy", "\xC2\xAD"},
259         {"Sigma", "Σ"},
260         {"sigma", "σ"},
261         {"sigmaf", "ς"},
262         {"sim", "∼"},
263         {"spades", "♠"},
264         {"sub", "⊂"},
265         {"sube", "⊆"},
266         {"sum", "∑"},
267         {"sup", "⊃"},
268         {"sup1", "¹"},
269         {"sup2", "²"},
270         {"sup3", "³"},
271         {"supe", "⊇"},
272         {"szlig", "ß"},
273         /* T */
274         {"Tau", "Τ"},
275         {"tau", "τ"},
276         {"there4", "∴"},
277         {"Theta", "Θ"},
278         {"theta", "θ"},
279         {"thetasym", "ϑ"},
280         {"thinsp", "\xE2\x80\x89"},
281         {"THORN", "Þ"},
282         {"thorn", "þ"},
283         {"tilde", "˜"},
284         {"times", "×"},
285         {"trade", "™"},
286         /* U */
287         {"Uacute", "Ú"},
288         {"uacute", "ú"},
289         {"uArr", "⇑"},
290         {"uarr", "↑"},
291         {"Ucirc", "Û"},
292         {"ucirc", "û"},
293         {"Ugrave", "Ù"},
294         {"ugrave", "ù"},
295         {"uml", "¨"},
296         {"upsih", "ϒ"},
297         {"Upsilon", "Υ"},
298         {"upsilon", "υ"},
299         {"Uuml", "Ü"},
300         {"uuml", "ü"},
301         /* W */
302         {"weierp", "℘"},
303         /* X */
304         {"Xi", "Ξ"},
305         {"xi", "ξ"},
306         /* Y */
307         {"Yacute", "Ý"},
308         {"yacute", "ý"},
309         {"yen", "¥"},
310         {"Yuml", "Ÿ"},
311         {"yuml", "ÿ"},
312         /* Z */
313         {"Zeta", "Ζ"},
314         {"zeta", "ζ"},
315         {"zwj", "\xE2\x80\x8D"},
316         {"zwnj", "\xE2\x80\x8C"},
317         {NULL, NULL}
318 };
319
320 static gchar* entity_extract_to_buffer(gchar *p, gchar b[])
321 {
322         gint i = 0;
323
324         while (*p != '\0' && *p != ';' && i < ENTITY_MAX_LEN) {
325                 b[i] = *p;
326                 ++i, ++p;
327         }
328         if (*p != ';' || i == 0 || i == ENTITY_MAX_LEN)
329                 return NULL;
330         b[i] = '\0';
331
332         return b;
333 }
334
335 static gchar *entity_decode_numeric(gchar *str)
336 {
337         gchar b[ENTITY_MAX_LEN];
338         gchar *p = str, *res;
339         gboolean hex = FALSE;
340         gunichar c = 0;
341         gint ret;
342
343         ++p;
344         if (*p == '\0')
345                 return NULL;
346
347         if (*p == 'x') {
348                 hex = TRUE;
349                 ++p;
350                 if (*p == '\0')
351                         return NULL;
352         }
353
354         if (entity_extract_to_buffer (p, b) == NULL)
355                 return NULL;
356
357         if (strlen(b) > 0)
358                 c = g_ascii_strtoll (b, NULL, (hex ? 16 : 10));
359
360         if (c >= 0 && c <= 31)
361                 /* An unprintable character; return the Unicode replacement symbol */
362                 return g_strdup("\xef\xbf\xbd");
363
364         if (!g_unichar_validate(c)) {
365                 /* Make sure the character is valid Unicode */
366                 debug_print("Numeric reference '&#%s;' is invalid in Unicode codespace\n", b);
367                 return NULL;
368         }
369
370         res = g_malloc0 (DECODED_MAX_LEN + 1);
371         ret = g_unichar_to_utf8 (c, res);
372         if (ret == 0) {
373                 debug_print("Failed to convert unicode character %u to UTF-8\n", c);
374                 g_free(res);
375                 res = NULL;
376         }
377
378         return res;
379 }
380
381 static gchar *entity_decode_symbol(gchar *str)
382 {
383         gchar b[ENTITY_MAX_LEN];
384         gchar *decoded;
385
386         if (entity_extract_to_buffer (str, b) == NULL)
387                 return NULL;
388
389         if (symbol_table == NULL) {
390                 gint i;
391
392                 symbol_table = g_hash_table_new (g_str_hash, g_str_equal);
393                 for (i = 0; symbolic_entities[i].key != NULL; ++i) {
394                         g_hash_table_insert (symbol_table,
395                                 symbolic_entities[i].key, symbolic_entities[i].value);
396                 }
397                 debug_print("initialized entities table with %d symbols\n", i);
398         }
399
400         decoded = g_hash_table_lookup (symbol_table, b);
401         if (decoded != NULL)
402                 return g_strdup (decoded);
403
404         return NULL;
405 }
406
407 gchar *entity_decode(gchar *str)
408 {
409         gchar *p = str;
410         if (p == NULL || *p != '&')
411                 return NULL;
412         ++p;
413         if (*p == '\0')
414                 return NULL;
415         if (*p == '#')
416                 return entity_decode_numeric(p);
417         else
418                 return entity_decode_symbol(p);
419 }