bb0bb0d2e965ee7a87ead79adaeddfa3934dc4da
[claws.git] / src / plugins / rssyl / libfeed / parser.c
1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19
20 #include <glib.h>
21 #include <curl/curl.h>
22 #include <expat.h>
23 #include <string.h>
24 #include <ctype.h>
25 #include <errno.h>
26
27 #include <codeconv.h>
28
29 #include "feed.h"
30
31 #include "parser.h"
32
33 static void _handler_set(XML_Parser parser, guint type)
34 {
35         if( parser == NULL )
36                 return;
37
38         switch(type) {
39                 case FEED_TYPE_RSS_20:
40                         XML_SetElementHandler(parser,
41                                         feed_parser_rss20_start,
42                                         feed_parser_rss20_end);
43                         break;
44
45                 case FEED_TYPE_RDF:
46                         XML_SetElementHandler(parser,
47                                         feed_parser_rdf_start,
48                                         feed_parser_rdf_end);
49                         break;
50
51                 case FEED_TYPE_ATOM_10:
52                         XML_SetElementHandler(parser,
53                                         feed_parser_atom10_start,
54                                         feed_parser_atom10_end);
55                         break;
56         }
57 }
58
59 static void _elparse_start_chooser(void *data,
60                 const gchar *el, const gchar **attr)
61 {
62         FeedParserCtx *ctx = (FeedParserCtx *)data;
63         guint feedtype = FEED_TYPE_NONE;
64         gchar *version;
65
66         if( ctx->depth == 0 ) {
67
68                 /* RSS 2.0 detected */
69                 if( !strcmp(el, "rss") ) {
70                         feedtype = FEED_TYPE_RSS_20;
71                 } else if( !strcmp(el, "rdf:RDF") ) {
72                         feedtype = FEED_TYPE_RDF;
73                 } else if( !strcmp(el, "feed") ) {
74
75                         /* ATOM feed detected, let's check version */
76                         version = feed_parser_get_attribute_value(attr, "xmlns");
77                         if( !strcmp(version, "http://www.w3.org/2005/Atom") ||
78                                         !strcmp(version, "https://www.w3.org/2005/Atom") )
79                                 feedtype = FEED_TYPE_ATOM_10;
80                         else
81                                 feedtype = FEED_TYPE_ATOM_03;
82                 }
83         }
84
85         _handler_set(ctx->parser, feedtype);
86
87         ctx->depth++;
88 }
89
90 static void _elparse_end_dummy(void *data, const gchar *el)
91 {
92         FeedParserCtx *ctx = (FeedParserCtx *)data;
93
94         if( ctx->str != NULL ) {
95                 g_string_free(ctx->str, TRUE);
96                 ctx->str = NULL;
97         }
98
99         ctx->depth--;
100 }
101
102 void libfeed_expat_chparse(void *data, const gchar *s, gint len)
103 {
104         FeedParserCtx *ctx = (FeedParserCtx *)data;
105         gchar *buf = NULL;
106         gint i, xblank = 1;
107
108         buf = malloc(len+1);
109         strncpy(buf, s, len);
110         buf[len] = '\0';
111
112         /* check if the string is blank, ... */
113         for( i = 0; i < strlen(buf); i++ )
114                 if( !isspace(buf[i]) )
115                         xblank = 0;
116
117         /* ...because we do not want the blanks if we're just starting new GString */
118         if( xblank > 0 && ctx->str == NULL ) {
119                 g_free(buf);
120                 return;
121         }
122
123         if( ctx->str == NULL ) {
124                 ctx->str = g_string_sized_new(len + 1);
125         }
126
127         g_string_append(ctx->str, buf);
128         g_free(buf);
129 }
130
131
132 void feed_parser_set_expat_handlers(FeedParserCtx *ctx)
133 {
134         XML_SetUserData(ctx->parser, (void *)ctx);
135
136         XML_SetElementHandler(ctx->parser,
137                         _elparse_start_chooser,
138                         _elparse_end_dummy);
139
140         XML_SetCharacterDataHandler(ctx->parser,
141                 libfeed_expat_chparse);
142
143         XML_SetUnknownEncodingHandler(ctx->parser, feed_parser_unknown_encoding_handler,
144                         NULL);
145 }
146
147 size_t feed_writefunc(void *ptr, size_t size, size_t nmemb, void *data)
148 {
149         gint len = size * nmemb;
150         FeedParserCtx *ctx = (FeedParserCtx *)data;
151         gint status, err;
152
153         status = XML_Parse(ctx->parser, ptr, len, FALSE);
154
155         if( status == XML_STATUS_ERROR ) {
156                 err = XML_GetErrorCode(ctx->parser);
157                 printf("\nExpat: --- %s\n\n", XML_ErrorString(err));
158         }
159
160         return len;
161 }
162
163 gchar *feed_parser_get_attribute_value(const gchar **attr, const gchar *name)
164 {
165         guint i;
166
167         if( attr == NULL || name == NULL )
168                 return NULL;
169
170         for( i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2 ) {
171                 if( !strcmp( attr[i], name) )
172                         return (gchar *)attr[i+1];
173         }
174
175         /* We haven't found anything. */
176         return NULL;
177 }
178
/* Size in bytes of the output buffer that receives one UTF-32 character. */
#define CHARSIZEUTF32   4

/* Result codes of giconv_utf32_char(). */
enum {
        LEP_ICONV_OK      = 0,  /* exactly one character was converted */
        LEP_ICONV_FAILED  = 1,  /* input or output not fully used, or no iconv */
        LEP_ICONV_ILSEQ   = 2,  /* conversion failed with EILSEQ */
        LEP_ICONV_INVAL   = 3,  /* conversion failed with EINVAL */
        LEP_ICONV_UNKNOWN = 4   /* conversion failed with any other errno */
};
188
189 static gint giconv_utf32_char(GIConv cd, const gchar *inbuf, size_t insize,
190                 guint32 *p_value)
191 {
192 #ifdef HAVE_ICONV
193         size_t outsize;
194         guchar outbuf[CHARSIZEUTF32];
195         gchar *outbufp;
196         gint r, errno;
197
198         outsize = sizeof(outbuf);
199         outbufp = (gchar *)outbuf;
200 #ifdef HAVE_ICONV_PROTO_CONST
201         r = g_iconv(cd, (const gchar **)&inbuf, &insize,
202                         &outbufp, &outsize);
203 #else
204         r = g_iconv(cd, (gchar **)&inbuf, &insize,
205                         &outbufp, &outsize);
206 #endif
207         if( r == -1 ) {
208                 g_iconv(cd, 0, 0, 0, 0);
209                 switch(errno) {
210                 case EILSEQ:
211                         return LEP_ICONV_ILSEQ;
212                 case EINVAL:
213                         return LEP_ICONV_INVAL;
214                 default:
215                         return LEP_ICONV_UNKNOWN;
216                 }
217         } else {
218                 guint32 value;
219                 guint i;
220
221                 if( (insize > 0) || (outsize > 0) )
222                         return LEP_ICONV_FAILED;
223
224                 value = 0;
225                 for( i = 0; i < sizeof(outbuf); i++ ) {
226                         value = (value << 8) + outbuf[i];
227                 }
228                 *p_value = value;
229                 return LEP_ICONV_OK;
230         }
231 #else
232         return LEP_ICONV_FAILED;
233 #endif
234 }
235
236 static gint feed_parser_setup_unknown_encoding(const gchar *charset,
237                 XML_Encoding *info)
238 {
239         GIConv cd;
240         gint flag, r;
241         gchar buf[4];
242         guint i, j, k;
243         guint32 value;
244
245         cd = g_iconv_open("UTF-32BE", charset);
246         if( cd == (GIConv) -1 )
247                 return -1;
248
249         flag = 0;
250         for( i = 0; i < 256; i++ ) {
251                 /* first char */
252                 buf[0] = i;
253                 info->map[i] = 0;
254                 r = giconv_utf32_char(cd, buf, 1, &value);
255                 if( r == LEP_ICONV_OK) {
256                         info->map[i] = value;
257                 } else if( r != LEP_ICONV_INVAL ) {
258                 } else {
259                         for( j = 0; j < 256; j++ ) {
260                                 /* second char */
261                                 buf[1] = j;
262                                 r = giconv_utf32_char(cd, buf, 2, &value);
263                                 if( r == LEP_ICONV_OK ) {
264                                         flag = 1;
265                                         info->map[i] = -2;
266                                 } else if( r != LEP_ICONV_INVAL ) {
267                                 } else {
268                                         for( k = 0; k < 256; k++ ) {
269                                                 /* third char */
270                                                 buf[2] = k;
271                                                 r = giconv_utf32_char(cd, buf, 3, &value);
272                                                 if( r == LEP_ICONV_OK) {
273                                                         info->map[i] = -3;
274                                                 }
275                                         }
276                                 }
277                         }
278                 }
279         }
280
281         g_iconv_close(cd);
282
283         return flag;
284 }
285
/*
 * Private data attached to XML_Encoding.data by
 * feed_parser_unknown_encoding_handler() and released again by
 * feed_parser_unknown_encoding_data_free().
 */
struct FeedParserUnknownEncoding {
        gchar *charset; /* strdup()'d copy of the encoding name */
        GIConv cd;      /* converter from that encoding to UTF-32BE */
};
290
291 static gint feed_parser_unknown_encoding_convert(void *data, const gchar *s)
292 {
293         gint r;
294         struct FeedParserUnknownEncoding *enc_data;
295         size_t insize;
296         guint32 value;
297
298         enc_data = data;
299         insize = 4;
300
301         if( s == NULL )
302                 return -1;
303
304         r = giconv_utf32_char(enc_data->cd, s, insize, &value);
305         if( r != LEP_ICONV_OK )
306                 return -1;
307
308         return 0;
309 }
310
311 static void feed_parser_unknown_encoding_data_free(void *data)
312 {
313         struct FeedParserUnknownEncoding *enc_data;
314
315         enc_data = data;
316         free(enc_data->charset);
317         g_iconv_close(enc_data->cd);
318         free(enc_data);
319 }
320
321 int feed_parser_unknown_encoding_handler(void *encdata, const XML_Char *name,
322                 XML_Encoding *info)
323 {
324         GIConv cd;
325         struct FeedParserUnknownEncoding *data;
326         int result;
327
328         result = feed_parser_setup_unknown_encoding(name, info);
329         if( result == 0 ) {
330                 info->data = NULL;
331                 info->convert = NULL;
332                 info->release = NULL;
333                 return XML_STATUS_OK;
334         }
335
336         cd = g_iconv_open("UTF-32BE", name);
337         if( cd == (GIConv)-1 )
338                 return XML_STATUS_ERROR;
339
340         data = malloc( sizeof(*data) );
341         if( data == NULL ) {
342                 g_iconv_close(cd);
343                 return XML_STATUS_ERROR;
344         }
345
346         data->charset = strdup(name);
347         if( data->charset == NULL ) {
348                 free(data);
349                 g_iconv_close(cd);
350                 return XML_STATUS_ERROR;
351         }
352
353         data->cd = cd;
354         info->data = data;
355         info->convert = feed_parser_unknown_encoding_convert;
356         info->release = feed_parser_unknown_encoding_data_free;
357
358         return XML_STATUS_OK;
359 }