RSSyl: Stop earlier when an invalid feed is encountered.
[claws.git] / src / plugins / rssyl / libfeed / parser.c
1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23
24 #include <glib.h>
25 #include <curl/curl.h>
26 #include <expat.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <errno.h>
30
31 #include <codeconv.h>
32
33 #include "feed.h"
34
35 #include "parser.h"
36
/*
 * Feed flavours distinguished by the root-element chooser below.
 *
 * NOTE(review): there is no tag or typedef here, so the trailing
 * `FeedTypes` declares a file-scope variable of this anonymous enum
 * type rather than naming the type. It is kept as-is so the set of
 * symbols defined by this file does not change.
 */
enum {
        FEED_TYPE_NONE    = 0,  /* nothing recognised */
        FEED_TYPE_RDF     = 1,  /* root element "rdf:RDF" */
        FEED_TYPE_RSS_20  = 2,  /* root element "rss" */
        FEED_TYPE_ATOM_03 = 3,  /* root element "feed" without the 2005 Atom namespace */
        FEED_TYPE_ATOM_10 = 4,  /* root element "feed" with the 2005 Atom namespace */
        FEED_TYPE_OPML    = 5   /* not referenced in this file */
} FeedTypes;
45
46 static void _handler_set(XML_Parser parser, guint type)
47 {
48         if( parser == NULL )
49                 return;
50
51         switch(type) {
52                 case FEED_TYPE_RSS_20:
53                         XML_SetElementHandler(parser,
54                                         feed_parser_rss20_start,
55                                         feed_parser_rss20_end);
56                         break;
57
58                 case FEED_TYPE_RDF:
59                         XML_SetElementHandler(parser,
60                                         feed_parser_rdf_start,
61                                         feed_parser_rdf_end);
62                         break;
63
64                 case FEED_TYPE_ATOM_10:
65                         XML_SetElementHandler(parser,
66                                         feed_parser_atom10_start,
67                                         feed_parser_atom10_end);
68                         break;
69         }
70 }
71
72 static void _elparse_start_chooser(void *data,
73                 const gchar *el, const gchar **attr)
74 {
75         FeedParserCtx *ctx = (FeedParserCtx *)data;
76         guint feedtype = FEED_TYPE_NONE;
77         gchar *version;
78
79         if( ctx->depth == 0 ) {
80
81                 /* RSS 2.0 detected */
82                 if( !strcmp(el, "rss") ) {
83                         feedtype = FEED_TYPE_RSS_20;
84                 } else if( !strcmp(el, "rdf:RDF") ) {
85                         feedtype = FEED_TYPE_RDF;
86                 } else if( !strcmp(el, "feed") ) {
87
88                         /* ATOM feed detected, let's check version */
89                         version = feed_parser_get_attribute_value(attr, "xmlns");
90                         if( version != NULL &&
91                                         (!strcmp(version, "http://www.w3.org/2005/Atom") ||
92                                          !strcmp(version, "https://www.w3.org/2005/Atom")) )
93                                 feedtype = FEED_TYPE_ATOM_10;
94                         else
95                                 feedtype = FEED_TYPE_ATOM_03;
96                 } else {
97                         /* Not a known feed type */
98                         ctx->feed->is_valid = FALSE;
99                 }
100         }
101
102         _handler_set(ctx->parser, feedtype);
103
104         ctx->depth++;
105 }
106
107 static void _elparse_end_dummy(void *data, const gchar *el)
108 {
109         FeedParserCtx *ctx = (FeedParserCtx *)data;
110
111         if( ctx->str != NULL ) {
112                 g_string_free(ctx->str, TRUE);
113                 ctx->str = NULL;
114         }
115
116         ctx->depth--;
117 }
118
119 void libfeed_expat_chparse(void *data, const gchar *s, gint len)
120 {
121         FeedParserCtx *ctx = (FeedParserCtx *)data;
122         gchar *buf = NULL;
123         gint i, xblank = 1;
124
125         buf = malloc(len+1);
126         strncpy(buf, s, len);
127         buf[len] = '\0';
128
129         /* check if the string is blank, ... */
130         for( i = 0; i < strlen(buf); i++ )
131                 if( !isspace(buf[i]) )
132                         xblank = 0;
133
134         /* ...because we do not want the blanks if we're just starting new GString */
135         if( xblank > 0 && ctx->str == NULL ) {
136                 g_free(buf);
137                 return;
138         }
139
140         if( ctx->str == NULL ) {
141                 ctx->str = g_string_sized_new(len + 1);
142         }
143
144         g_string_append(ctx->str, buf);
145         g_free(buf);
146 }
147
148
149 void feed_parser_set_expat_handlers(FeedParserCtx *ctx)
150 {
151         XML_SetUserData(ctx->parser, (void *)ctx);
152
153         XML_SetElementHandler(ctx->parser,
154                         _elparse_start_chooser,
155                         _elparse_end_dummy);
156
157         XML_SetCharacterDataHandler(ctx->parser,
158                 libfeed_expat_chparse);
159
160         XML_SetUnknownEncodingHandler(ctx->parser, feed_parser_unknown_encoding_handler,
161                         NULL);
162 }
163
164 size_t feed_writefunc(void *ptr, size_t size, size_t nmemb, void *data)
165 {
166         gint len = size * nmemb;
167         FeedParserCtx *ctx = (FeedParserCtx *)data;
168         gint status, err;
169
170         if (!ctx->feed->is_valid) {
171                 /* We already know that the feed is not valid, so we won't
172                  * try parsing it. Just return correct number so libcurl is
173                  * happy. */
174                 return len;
175         }
176
177         status = XML_Parse(ctx->parser, ptr, len, FALSE);
178
179         if( status == XML_STATUS_ERROR ) {
180                 err = XML_GetErrorCode(ctx->parser);
181                 printf("\nExpat: --- %s\n\n", XML_ErrorString(err));
182                 ctx->feed->is_valid = FALSE;
183         }
184
185         return len;
186 }
187
/*
 * Look up an attribute value in an Expat-style attribute array.
 *
 * attr: array of alternating name/value strings terminated by NULL,
 *       or NULL.
 * name: attribute name to search for (exact, case-sensitive match),
 *       or NULL.
 *
 * Returns the value string of the first matching attribute, or NULL when
 * either argument is NULL or no complete name/value pair matches. The
 * returned pointer refers to the string stored in attr; it is not a copy.
 */
gchar *feed_parser_get_attribute_value(const gchar **attr, const gchar *name)
{
        const gchar **pair;

        if( attr == NULL || name == NULL )
                return NULL;

        /* Walk pair by pair; stop at the terminator or a dangling name. */
        for( pair = attr; pair[0] != NULL && pair[1] != NULL; pair += 2 ) {
                if( strcmp(pair[0], name) == 0 )
                        return (gchar *)pair[1];
        }

        /* We haven't found anything. */
        return NULL;
}
203
/* Size in bytes of the output buffer holding one UTF-32 character. */
#define CHARSIZEUTF32   4

/* Result codes of giconv_utf32_char(). */
enum {
        LEP_ICONV_OK      = 0,  /* exactly one character was converted */
        LEP_ICONV_FAILED  = 1,  /* input not fully used / output not filled,
                                 * or built without iconv support */
        LEP_ICONV_ILSEQ   = 2,  /* conversion failed with EILSEQ */
        LEP_ICONV_INVAL   = 3,  /* conversion failed with EINVAL */
        LEP_ICONV_UNKNOWN = 4   /* conversion failed with any other errno */
};
213
214 static gint giconv_utf32_char(GIConv cd, const gchar *inbuf, size_t insize,
215                 guint32 *p_value)
216 {
217 #ifdef HAVE_ICONV
218         size_t outsize;
219         guchar outbuf[CHARSIZEUTF32];
220         gchar *outbufp;
221         gint r;
222
223         outsize = sizeof(outbuf);
224         outbufp = (gchar *)outbuf;
225 #ifdef HAVE_ICONV_PROTO_CONST
226         r = g_iconv(cd, (const gchar **)&inbuf, &insize,
227                         &outbufp, &outsize);
228 #else
229         r = g_iconv(cd, (gchar **)&inbuf, &insize,
230                         &outbufp, &outsize);
231 #endif
232         if( r == -1 ) {
233                 g_iconv(cd, 0, 0, 0, 0);
234                 switch(errno) {
235                 case EILSEQ:
236                         return LEP_ICONV_ILSEQ;
237                 case EINVAL:
238                         return LEP_ICONV_INVAL;
239                 default:
240                         return LEP_ICONV_UNKNOWN;
241                 }
242         } else {
243                 guint32 value;
244                 guint i;
245
246                 if( (insize > 0) || (outsize > 0) )
247                         return LEP_ICONV_FAILED;
248
249                 value = 0;
250                 for( i = 0; i < sizeof(outbuf); i++ ) {
251                         value = (value << 8) + outbuf[i];
252                 }
253                 *p_value = value;
254                 return LEP_ICONV_OK;
255         }
256 #else
257         return LEP_ICONV_FAILED;
258 #endif
259 }
260
261 static gint feed_parser_setup_unknown_encoding(const gchar *charset,
262                 XML_Encoding *info)
263 {
264         GIConv cd;
265         gint flag, r;
266         gchar buf[4];
267         guint i, j, k;
268         guint32 value;
269
270         cd = g_iconv_open("UTF-32BE", charset);
271         if( cd == (GIConv) -1 )
272                 return -1;
273
274         flag = 0;
275         for( i = 0; i < 256; i++ ) {
276                 /* first char */
277                 buf[0] = i;
278                 info->map[i] = 0;
279                 r = giconv_utf32_char(cd, buf, 1, &value);
280                 if( r == LEP_ICONV_OK) {
281                         info->map[i] = value;
282                 } else if( r != LEP_ICONV_INVAL ) {
283                 } else {
284                         for( j = 0; j < 256; j++ ) {
285                                 /* second char */
286                                 buf[1] = j;
287                                 r = giconv_utf32_char(cd, buf, 2, &value);
288                                 if( r == LEP_ICONV_OK ) {
289                                         flag = 1;
290                                         info->map[i] = -2;
291                                 } else if( r != LEP_ICONV_INVAL ) {
292                                 } else {
293                                         for( k = 0; k < 256; k++ ) {
294                                                 /* third char */
295                                                 buf[2] = k;
296                                                 r = giconv_utf32_char(cd, buf, 3, &value);
297                                                 if( r == LEP_ICONV_OK) {
298                                                         info->map[i] = -3;
299                                                 }
300                                         }
301                                 }
302                         }
303                 }
304         }
305
306         g_iconv_close(cd);
307
308         return flag;
309 }
310
311 struct FeedParserUnknownEncoding {
312         gchar *charset;
313         GIConv cd;
314 };
315
316 static gint feed_parser_unknown_encoding_convert(void *data, const gchar *s)
317 {
318         gint r;
319         struct FeedParserUnknownEncoding *enc_data;
320         size_t insize;
321         guint32 value;
322
323         enc_data = data;
324         insize = 4;
325
326         if( s == NULL )
327                 return -1;
328
329         r = giconv_utf32_char(enc_data->cd, s, insize, &value);
330         if( r != LEP_ICONV_OK )
331                 return -1;
332
333         return 0;
334 }
335
336 static void feed_parser_unknown_encoding_data_free(void *data)
337 {
338         struct FeedParserUnknownEncoding *enc_data;
339
340         enc_data = data;
341         free(enc_data->charset);
342         g_iconv_close(enc_data->cd);
343         free(enc_data);
344 }
345
346 int feed_parser_unknown_encoding_handler(void *encdata, const XML_Char *name,
347                 XML_Encoding *info)
348 {
349         GIConv cd;
350         struct FeedParserUnknownEncoding *data;
351         int result;
352
353         result = feed_parser_setup_unknown_encoding(name, info);
354         if( result == 0 ) {
355                 info->data = NULL;
356                 info->convert = NULL;
357                 info->release = NULL;
358                 return XML_STATUS_OK;
359         }
360
361         cd = g_iconv_open("UTF-32BE", name);
362         if( cd == (GIConv)-1 )
363                 return XML_STATUS_ERROR;
364
365         data = malloc( sizeof(*data) );
366         if( data == NULL ) {
367                 g_iconv_close(cd);
368                 return XML_STATUS_ERROR;
369         }
370
371         data->charset = strdup(name);
372         if( data->charset == NULL ) {
373                 free(data);
374                 g_iconv_close(cd);
375                 return XML_STATUS_ERROR;
376         }
377
378         data->cd = cd;
379         info->data = data;
380         info->convert = feed_parser_unknown_encoding_convert;
381         info->release = feed_parser_unknown_encoding_data_free;
382
383         return XML_STATUS_OK;
384 }