RSSyl: Fix handling of feeds with encodings unknown to expat. Turns out the only...
[claws.git] / src / plugins / rssyl / libfeed / parser.c
1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23
24 #include <glib.h>
25 #include <curl/curl.h>
26 #include <expat.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <errno.h>
30
31 #include <codeconv.h>
32
33 #include "feed.h"
34
35 #include "parser.h"
36
37 static void _handler_set(XML_Parser parser, guint type)
38 {
39         if( parser == NULL )
40                 return;
41
42         switch(type) {
43                 case FEED_TYPE_RSS_20:
44                         XML_SetElementHandler(parser,
45                                         feed_parser_rss20_start,
46                                         feed_parser_rss20_end);
47                         break;
48
49                 case FEED_TYPE_RDF:
50                         XML_SetElementHandler(parser,
51                                         feed_parser_rdf_start,
52                                         feed_parser_rdf_end);
53                         break;
54
55                 case FEED_TYPE_ATOM_10:
56                         XML_SetElementHandler(parser,
57                                         feed_parser_atom10_start,
58                                         feed_parser_atom10_end);
59                         break;
60         }
61 }
62
63 static void _elparse_start_chooser(void *data,
64                 const gchar *el, const gchar **attr)
65 {
66         FeedParserCtx *ctx = (FeedParserCtx *)data;
67         guint feedtype = FEED_TYPE_NONE;
68         gchar *version;
69
70         if( ctx->depth == 0 ) {
71
72                 /* RSS 2.0 detected */
73                 if( !strcmp(el, "rss") ) {
74                         feedtype = FEED_TYPE_RSS_20;
75                 } else if( !strcmp(el, "rdf:RDF") ) {
76                         feedtype = FEED_TYPE_RDF;
77                 } else if( !strcmp(el, "feed") ) {
78
79                         /* ATOM feed detected, let's check version */
80                         version = feed_parser_get_attribute_value(attr, "xmlns");
81                         if( !strcmp(version, "http://www.w3.org/2005/Atom") ||
82                                         !strcmp(version, "https://www.w3.org/2005/Atom") )
83                                 feedtype = FEED_TYPE_ATOM_10;
84                         else
85                                 feedtype = FEED_TYPE_ATOM_03;
86                 }
87         }
88
89         _handler_set(ctx->parser, feedtype);
90
91         ctx->depth++;
92 }
93
94 static void _elparse_end_dummy(void *data, const gchar *el)
95 {
96         FeedParserCtx *ctx = (FeedParserCtx *)data;
97
98         if( ctx->str != NULL ) {
99                 g_string_free(ctx->str, TRUE);
100                 ctx->str = NULL;
101         }
102
103         ctx->depth--;
104 }
105
106 void libfeed_expat_chparse(void *data, const gchar *s, gint len)
107 {
108         FeedParserCtx *ctx = (FeedParserCtx *)data;
109         gchar *buf = NULL;
110         gint i, xblank = 1;
111
112         buf = malloc(len+1);
113         strncpy(buf, s, len);
114         buf[len] = '\0';
115
116         /* check if the string is blank, ... */
117         for( i = 0; i < strlen(buf); i++ )
118                 if( !isspace(buf[i]) )
119                         xblank = 0;
120
121         /* ...because we do not want the blanks if we're just starting new GString */
122         if( xblank > 0 && ctx->str == NULL ) {
123                 g_free(buf);
124                 return;
125         }
126
127         if( ctx->str == NULL ) {
128                 ctx->str = g_string_sized_new(len + 1);
129         }
130
131         g_string_append(ctx->str, buf);
132         g_free(buf);
133 }
134
135
136 void feed_parser_set_expat_handlers(FeedParserCtx *ctx)
137 {
138         XML_SetUserData(ctx->parser, (void *)ctx);
139
140         XML_SetElementHandler(ctx->parser,
141                         _elparse_start_chooser,
142                         _elparse_end_dummy);
143
144         XML_SetCharacterDataHandler(ctx->parser,
145                 libfeed_expat_chparse);
146
147         XML_SetUnknownEncodingHandler(ctx->parser, feed_parser_unknown_encoding_handler,
148                         NULL);
149 }
150
151 size_t feed_writefunc(void *ptr, size_t size, size_t nmemb, void *data)
152 {
153         gint len = size * nmemb;
154         FeedParserCtx *ctx = (FeedParserCtx *)data;
155         gint status, err;
156
157         status = XML_Parse(ctx->parser, ptr, len, FALSE);
158
159         if( status == XML_STATUS_ERROR ) {
160                 err = XML_GetErrorCode(ctx->parser);
161                 printf("\nExpat: --- %s\n\n", XML_ErrorString(err));
162         }
163
164         return len;
165 }
166
167 gchar *feed_parser_get_attribute_value(const gchar **attr, const gchar *name)
168 {
169         guint i;
170
171         if( attr == NULL || name == NULL )
172                 return NULL;
173
174         for( i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2 ) {
175                 if( !strcmp( attr[i], name) )
176                         return (gchar *)attr[i+1];
177         }
178
179         /* We haven't found anything. */
180         return NULL;
181 }
182
/* Number of bytes in one UTF-32 encoded character. */
#define CHARSIZEUTF32	4

/* Result codes returned by giconv_utf32_char(). */
enum {
	LEP_ICONV_OK      = 0,	/* exactly one character was converted */
	LEP_ICONV_FAILED  = 1,	/* input left over, output not filled, or no iconv */
	LEP_ICONV_ILSEQ   = 2,	/* conversion failed with EILSEQ */
	LEP_ICONV_INVAL   = 3,	/* conversion failed with EINVAL */
	LEP_ICONV_UNKNOWN = 4	/* conversion failed with any other errno */
};
192
193 static gint giconv_utf32_char(GIConv cd, const gchar *inbuf, size_t insize,
194                 guint32 *p_value)
195 {
196 #ifdef HAVE_ICONV
197         size_t outsize;
198         guchar outbuf[CHARSIZEUTF32];
199         gchar *outbufp;
200         gint r, errno;
201
202         outsize = sizeof(outbuf);
203         outbufp = (gchar *)outbuf;
204 #ifdef HAVE_ICONV_PROTO_CONST
205         r = g_iconv(cd, (const gchar **)&inbuf, &insize,
206                         &outbufp, &outsize);
207 #else
208         r = g_iconv(cd, (gchar **)&inbuf, &insize,
209                         &outbufp, &outsize);
210 #endif
211         if( r == -1 ) {
212                 g_iconv(cd, 0, 0, 0, 0);
213                 switch(errno) {
214                 case EILSEQ:
215                         return LEP_ICONV_ILSEQ;
216                 case EINVAL:
217                         return LEP_ICONV_INVAL;
218                 default:
219                         return LEP_ICONV_UNKNOWN;
220                 }
221         } else {
222                 guint32 value;
223                 guint i;
224
225                 if( (insize > 0) || (outsize > 0) )
226                         return LEP_ICONV_FAILED;
227
228                 value = 0;
229                 for( i = 0; i < sizeof(outbuf); i++ ) {
230                         value = (value << 8) + outbuf[i];
231                 }
232                 *p_value = value;
233                 return LEP_ICONV_OK;
234         }
235 #else
236         return LEP_ICONV_FAILED;
237 #endif
238 }
239
240 static gint feed_parser_setup_unknown_encoding(const gchar *charset,
241                 XML_Encoding *info)
242 {
243         GIConv cd;
244         gint flag, r;
245         gchar buf[4];
246         guint i, j, k;
247         guint32 value;
248
249         cd = g_iconv_open("UTF-32BE", charset);
250         if( cd == (GIConv) -1 )
251                 return -1;
252
253         flag = 0;
254         for( i = 0; i < 256; i++ ) {
255                 /* first char */
256                 buf[0] = i;
257                 info->map[i] = 0;
258                 r = giconv_utf32_char(cd, buf, 1, &value);
259                 if( r == LEP_ICONV_OK) {
260                         info->map[i] = value;
261                 } else if( r != LEP_ICONV_INVAL ) {
262                 } else {
263                         for( j = 0; j < 256; j++ ) {
264                                 /* second char */
265                                 buf[1] = j;
266                                 r = giconv_utf32_char(cd, buf, 2, &value);
267                                 if( r == LEP_ICONV_OK ) {
268                                         flag = 1;
269                                         info->map[i] = -2;
270                                 } else if( r != LEP_ICONV_INVAL ) {
271                                 } else {
272                                         for( k = 0; k < 256; k++ ) {
273                                                 /* third char */
274                                                 buf[2] = k;
275                                                 r = giconv_utf32_char(cd, buf, 3, &value);
276                                                 if( r == LEP_ICONV_OK) {
277                                                         info->map[i] = -3;
278                                                 }
279                                         }
280                                 }
281                         }
282                 }
283         }
284
285         g_iconv_close(cd);
286
287         return flag;
288 }
289
290 struct FeedParserUnknownEncoding {
291         gchar *charset;
292         GIConv cd;
293 };
294
295 static gint feed_parser_unknown_encoding_convert(void *data, const gchar *s)
296 {
297         gint r;
298         struct FeedParserUnknownEncoding *enc_data;
299         size_t insize;
300         guint32 value;
301
302         enc_data = data;
303         insize = 4;
304
305         if( s == NULL )
306                 return -1;
307
308         r = giconv_utf32_char(enc_data->cd, s, insize, &value);
309         if( r != LEP_ICONV_OK )
310                 return -1;
311
312         return 0;
313 }
314
315 static void feed_parser_unknown_encoding_data_free(void *data)
316 {
317         struct FeedParserUnknownEncoding *enc_data;
318
319         enc_data = data;
320         free(enc_data->charset);
321         g_iconv_close(enc_data->cd);
322         free(enc_data);
323 }
324
325 int feed_parser_unknown_encoding_handler(void *encdata, const XML_Char *name,
326                 XML_Encoding *info)
327 {
328         GIConv cd;
329         struct FeedParserUnknownEncoding *data;
330         int result;
331
332         result = feed_parser_setup_unknown_encoding(name, info);
333         if( result == 0 ) {
334                 info->data = NULL;
335                 info->convert = NULL;
336                 info->release = NULL;
337                 return XML_STATUS_OK;
338         }
339
340         cd = g_iconv_open("UTF-32BE", name);
341         if( cd == (GIConv)-1 )
342                 return XML_STATUS_ERROR;
343
344         data = malloc( sizeof(*data) );
345         if( data == NULL ) {
346                 g_iconv_close(cd);
347                 return XML_STATUS_ERROR;
348         }
349
350         data->charset = strdup(name);
351         if( data->charset == NULL ) {
352                 free(data);
353                 g_iconv_close(cd);
354                 return XML_STATUS_ERROR;
355         }
356
357         data->cd = cd;
358         info->data = data;
359         info->convert = feed_parser_unknown_encoding_convert;
360         info->release = feed_parser_unknown_encoding_data_free;
361
362         return XML_STATUS_OK;
363 }