33b50ec23da5e4ab5f7388387d97de144453ca79
[claws.git] / src / plugins / rssyl / parsers.c
1 /*
2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 1999-2004 Hiroyuki Yamamoto
4  * This file (C) 2005 Andrej Kacian <andrej@kacian.sk>
5  *
6  * - various feed parsing functions
7  * - this file could use some sorting and/or splitting
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22  */
23
24 #ifdef HAVE_CONFIG_H
25 #  include "config.h"
26 #endif
27
28 #include <glib.h>
29 #include <libxml/parser.h>
30 #include <libxml/xpath.h>
31 #include <libxml/HTMLtree.h>
32
33 #include "date.h"
34 #include "feed.h"
35 #include "strreplace.h"
36 #include "utils.h"
37 #include "procheader.h"
38
39 gint rssyl_parse_rdf(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
40 {
41         xmlNodePtr rnode, node, n;
42         RSSylFeedItem *fitem = NULL;
43         gint count = 0;
44         gchar *content = NULL;
45         g_return_val_if_fail(doc != NULL, 0);
46         g_return_val_if_fail(ritem != NULL, 0);
47 #ifdef RSSYL_DEBUG
48         gchar *fetched = NULL;
49 #endif  /* RSSYL_DEBUG */
50
51         if( ritem->contents == NULL )
52                 rssyl_read_existing(ritem);
53
54         rnode = xmlDocGetRootElement(doc);
55
56         for( node = rnode->children; node; node = node->next ) {
57                 if( !xmlStrcmp(node->name, "item") ) {
58                         /* We've found an "item" tag, let's poke through its contents */
59                         fitem = g_new0(RSSylFeedItem, 1);
60                         fitem->date = 0;
61 #ifdef RSSYL_DEBUG
62                         fetched = xmlGetProp(rnode, "fetched");
63                         fitem->debug_fetched = atoll(fetched);
64                         xmlFree(fetched);
65 #endif  /* RSSYL_DEBUG */
66
67                         for( n = node->children; n; n = n->next ) {
68                                 /* Title */
69                                 if( !xmlStrcmp(n->name, "title") ) {
70                                         content = xmlNodeGetContent(n);
71                                         fitem->title = rssyl_format_string(content, TRUE, TRUE);
72                                         xmlFree(content);
73                                         debug_print("RSSyl: XML - RDF title is '%s'\n", fitem->title);
74                                 }
75
76                                 /* Text */
77                                 if( !xmlStrcmp(n->name, "description") ) {
78                                         content = xmlNodeGetContent(n);
79                                         fitem->text = rssyl_format_string(content, FALSE, FALSE);
80                                         xmlFree(content);
81                                         debug_print("RSSyl: XML - got RDF text\n");
82                                 }
83
84                                 /* URL */
85                                 if( !xmlStrcmp(n->name, "link") ) {
86                                         content = xmlNodeGetContent(n);
87                                         fitem->link = rssyl_format_string(content, FALSE, TRUE);
88                                         xmlFree(content);
89                                         debug_print("RSSyl: XML - RDF link is '%s'\n", fitem->link);
90                                 }
91
92                                 /* Date - rfc822 format */
93                                 if( !xmlStrcmp(n->name, "pubDate") ) {
94                                         content = xmlNodeGetContent(n);
95                                         fitem->date = procheader_date_parse(NULL, content, 0);
96                                         xmlFree(content);
97                                         if( fitem->date > 0 ) {
98                                                 debug_print("RSSyl: XML - RDF pubDate found\n" );
99                                         } else
100                                                 fitem->date = 0;
101                                 }
102                                 /* Date - ISO8701 format */
103                                 if( !xmlStrcmp(n->name, "date") &&
104                                                 (!xmlStrcmp(n->ns->prefix, "ns")
105                                                  || !xmlStrcmp(n->ns->prefix, "dc")) ) {
106                                         content = xmlNodeGetContent(n);
107                                         fitem->date = parseISO8601Date(content);
108                                         xmlFree(content);
109                                         debug_print("RSSyl: XML - RDF date found\n" );
110                                 }
111
112                                 /* Author */
113                                 if( !xmlStrcmp(n->name, "creator") ) {
114                                         content = xmlNodeGetContent(n);
115                                         fitem->author = rssyl_format_string(content, TRUE, TRUE);
116                                         xmlFree(content);
117                                         debug_print("RSSyl: XML - RDF author is '%s'\n", fitem->author);
118                                 }
119                         }
120                 }
121
122                 if( fitem && fitem->link && fitem->title ) {
123                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
124                                 rssyl_free_feeditem(fitem);
125                                 fitem = NULL;
126                         }
127                         fitem = NULL;
128                         count++;
129                 }
130         }
131
132         return count;
133 }
134
135
136 /* rssyl_parse_rss()
137  *
138  * This is where we parse the fetched rss document and create a
139  * RSSylFolderItem from it. Returns number of parsed items
140  */
141 gint rssyl_parse_rss(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
142 {
143         xmlXPathContextPtr context;
144         xmlXPathObjectPtr result;
145         xmlNodePtr node, n, rnode;
146         gint i, count = 0;
147         RSSylFeedItem *fitem = NULL;
148         gchar *xpath;
149         gboolean got_encoded, got_author;
150         gchar *rootnode = NULL;
151         RSSylFeedItemMedia *media;
152         gchar *media_url, *media_type;
153         gulong media_size = 0;
154 #ifdef RSSYL_DEBUG
155         gchar *fetched = NULL;
156 #endif  /* RSSYL_DEBUG */
157
158         g_return_val_if_fail(doc != NULL, 0);
159         g_return_val_if_fail(ritem != NULL, 0);
160
161         if( ritem->contents == NULL )
162                 rssyl_read_existing(ritem);
163
164         rnode = xmlDocGetRootElement(doc);
165
166         rootnode = g_ascii_strdown(rnode->name, -1);
167         xpath = g_strconcat("/", rootnode,
168                                 "/channel/item",        NULL);
169         g_free(rootnode);
170         context = xmlXPathNewContext(doc);
171         if( !(result = xmlXPathEvalExpression(xpath, context)) ){
172                 debug_print("RSSyl: XML - no result found for '%s'\n", xpath);
173                 xmlXPathFreeContext(context);
174                 g_free(xpath);
175                 return 0;
176         }
177
178         g_free(xpath);
179
180         for( i = 0; i < result->nodesetval->nodeNr; i++ ) {
181                 node = result->nodesetval->nodeTab[i];
182                 
183                 if ((n = node->children) == NULL)
184                         continue;
185
186                 fitem = g_new0(RSSylFeedItem, 1);
187                 fitem->media = NULL;
188                 fitem->date = 0;
189 #ifdef RSSYL_DEBUG
190                 fetched = xmlGetProp(rnode, "fetched");
191                 fitem->debug_fetched = atoll(fetched);
192                 xmlFree(fetched);
193 #endif  /* RSSYL_DEBUG */
194                 fitem->text = NULL;
195                 
196                 if (parent)
197                         fitem->parent_link = g_strdup(parent);
198
199                 got_encoded = FALSE;
200                 got_author = FALSE;
201                 do {
202                         gchar *content = NULL;
203
204                         /* Title */
205                         if( !xmlStrcmp(n->name, "title") ) {
206                                 content = xmlNodeGetContent(n);
207                                 fitem->title = rssyl_format_string(content, TRUE, TRUE);
208                                 xmlFree(content);
209                                 debug_print("RSSyl: XML - item title: '%s'\n", fitem->title);
210                         }
211
212                         /* Text */
213                         if( !xmlStrcmp(n->name, "description") ) {
214                                 if( (fitem->text == NULL) && (got_encoded == FALSE) ) {
215                                         content = xmlNodeGetContent(n);
216                                         debug_print("RSSyl: XML - item text (description) caught\n");
217                                         fitem->text = rssyl_format_string(content, FALSE, FALSE);
218                                         xmlFree(content);
219                                 }
220                         }
221                         if( !xmlStrcmp(n->name, "encoded")
222                                         && !xmlStrcmp(n->ns->prefix, "content") ) {
223                                 debug_print("RSSyl: XML - item text (content) caught\n");
224
225                                 if (fitem->text != NULL)
226                                         g_free(fitem->text); /* free "description" */
227                                         
228                                 content = xmlNodeGetContent(n);
229                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
230                                 xmlFree(content);
231                                 got_encoded = TRUE;
232                         }
233
234                         /* URL link to the original post */
235                         if( !xmlStrcmp(n->name, "link") ) {
236                                 content = xmlNodeGetContent(n);
237                                 fitem->link = rssyl_format_string(content, FALSE, TRUE);
238                                 xmlFree(content);
239                                 debug_print("RSSyl: XML - item link: '%s'\n", fitem->link);
240                         }
241
242                         /* GUID - sometimes used as link */
243                         if( !xmlStrcmp(n->name, "guid") ) {
244                                 gchar *tmp = xmlGetProp(n, "isPermaLink");
245                                 content = xmlNodeGetContent(n);
246                                 fitem->id_is_permalink = FALSE;
247                                 if( !tmp || xmlStrcmp(tmp, "false") )   /* permalink? */
248                                         fitem->id_is_permalink = TRUE;
249                                 fitem->id = rssyl_format_string(content, FALSE, TRUE);
250                                 xmlFree(content);
251                                 debug_print("RSSyl: XML - item guid: '%s'\n", fitem->id);
252                                 xmlFree(tmp);
253                         }
254
255                         /* Date - rfc822 format */
256                         if( !xmlStrcmp(n->name, "pubDate") ) {
257                                 content = xmlNodeGetContent(n);
258                                 fitem->date = procheader_date_parse(NULL, content, 0);
259                                 xmlFree(content);
260                                 if( fitem->date > 0 ) {
261                                         debug_print("RSSyl: XML - item date found: %d\n", (gint)fitem->date);
262                                 } else
263                                         fitem->date = 0;
264                         }
265                         /* Date - ISO8701 format */
266                         if( !xmlStrcmp(n->name, "date") && !xmlStrcmp(n->ns->prefix, "dc") ) {
267                                 content = xmlNodeGetContent(n);
268                                 fitem->date = parseISO8601Date(content);
269                                 xmlFree(content);
270                                 debug_print("RSSyl: XML - item date found\n" );
271                         }
272
273                         /* Author */
274                         if( !xmlStrcmp(n->name, "author") ) {
275                                 content = xmlNodeGetContent(n);
276                                 fitem->author = rssyl_format_string(content, TRUE, TRUE);
277                                 xmlFree(content);
278                                 debug_print("RSSyl: XML - item author: '%s'\n", fitem->author);
279                                 got_author = TRUE;
280                         }
281
282                         if( !xmlStrcmp(n->name, "creator")
283                                         && !xmlStrcmp(n->ns->prefix, "dc") && !got_author) {
284                                 content = xmlNodeGetContent(n);
285                                 fitem->author = rssyl_format_string(content, TRUE, TRUE);
286                                 xmlFree(content);
287                                 debug_print("RSSyl: XML - item author (creator): '%s'\n", fitem->author);
288                         }
289
290                         /* Media enclosure */
291                         if( !xmlStrcmp(n->name, "enclosure") ) {
292                                 gchar *tmp = xmlGetProp(n, "length");
293                                 media_url = xmlGetProp(n, "url");
294                                 media_type = xmlGetProp(n, "type");
295                                 media_size = (tmp ? atoi(tmp) : 0);
296                                 xmlFree(tmp);
297
298                                 if( media_url != NULL &&
299                                                 media_type != NULL &&
300                                                 media_size != 0 ) {
301                                         debug_print("RSSyl: XML - enclosure: '%s' [%s] (%ld)\n",
302                                                         media_url, media_type, media_size);
303                                         media = g_new(RSSylFeedItemMedia, 1);
304                                         media->url = media_url;
305                                         media->type = media_type;
306                                         media->size = media_size;
307                                         fitem->media = media;
308                                 } else {
309                                         debug_print("RSSyl: XML - enclosure found, but some data is missing\n");
310                                         g_free(media_url);
311                                         g_free(media_type);
312                                 }
313                         }
314
315                         /* Comments */
316                         if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss") ) {
317                                 content = xmlNodeGetContent(n);
318                                 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
319                                 xmlFree(content);
320                                 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
321                         }
322                 } while( (n = n->next) != NULL);
323
324                 if( (fitem->link || fitem->id) && fitem->title ) {
325                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
326                                 rssyl_free_feeditem(fitem);
327                                 fitem = NULL;
328                         }
329                         count++;
330                 }
331         }
332
333         xmlXPathFreeObject(result);
334         xmlXPathFreeContext(context);
335
336         return count;
337 }
338
339 /* rssyl_parse_atom()
340  *
341  * This is where we parse the fetched atom document and create a
342  * RSSylFolderItem from it. Returns number of parsed items
343  */
344 gint rssyl_parse_atom(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
345 {
346         xmlNodePtr node, n, h;
347         xmlBufferPtr buf = NULL;
348         gint count = 0;
349         RSSylFeedItem *fitem = NULL;
350         RSSylFeedItemMedia *media = NULL;
351         gchar *link_type, *link_href, *link_rel, *tmp, *content = NULL;
352         gulong link_size;
353
354         g_return_val_if_fail(doc != NULL, 0);
355         g_return_val_if_fail(ritem != NULL, 0);
356
357         if( ritem->contents == NULL )
358                 rssyl_read_existing(ritem);
359
360         node = xmlDocGetRootElement(doc);
361
362         if (node == NULL)
363                 return 0;
364
365         node = node->children;
366
367         for (; node; node = node->next) {
368                 gboolean got_content = FALSE;
369                 if (xmlStrcmp(node->name, "entry")) {
370                         continue;
371                 }
372         
373                 n = node->children;
374                 fitem = g_new0(RSSylFeedItem, 1);
375                 fitem->date = 0;
376                 fitem->date_published = 0;
377                 fitem->text = NULL;
378                 
379                 if (parent)
380                         fitem->parent_link = g_strdup(parent);
381
382                 do {
383                         /* Title */
384                         if( !xmlStrcmp(n->name, "title") ) {
385                                 content = xmlNodeGetContent(n);
386                                 fitem->title = rssyl_format_string(content, TRUE, TRUE);
387                                 xmlFree(content);
388                                 debug_print("RSSyl: XML - Atom item title: '%s'\n", fitem->title);
389                         }
390
391                         /* ID */
392                         if( !xmlStrcmp(n->name, "id") ) {
393                                 content = xmlNodeGetContent(n);
394                                 fitem->id = g_strdup_printf("%s%s", (parent?"comment-":""), content);
395                                 xmlFree(content);
396                                 debug_print("RSSyl: XML - Atom id: '%s'\n", fitem->id);
397                         }
398
399                         /* Text */
400                         if( !xmlStrcmp(n->name, "summary") && !got_content ) {
401                                 content = xmlNodeGetContent(n);
402                                 debug_print("RSSyl: XML - Atom item text (summary) caught\n");
403                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
404                                 xmlFree(content);
405                         }
406
407                         if( !xmlStrcmp(n->name, "content") ) {
408                                 gchar *tmp = xmlGetProp(n, "type");
409                                 debug_print("RSSyl: XML - Atom item text (content) caught\n");
410                                 if (fitem->text)
411                                         g_free(fitem->text);
412                                 if( !xmlStrcmp(tmp, "xhtml")) {
413                                         for( h = n->children; h; h = h->next ) {
414                                                 if( !xmlStrcmp(h->name, "div") ) {
415                                                         buf = xmlBufferCreate();
416                                                         htmlNodeDump(buf, doc, h);
417                                                         content = g_strdup((gchar *)xmlBufferContent(buf));
418                                                         xmlBufferFree(buf);
419                                                 }
420                                         }
421                                 } else
422                                         content = xmlNodeGetContent(n);
423                                 xmlFree(tmp);
424                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
425                                 xmlFree(content);
426                                 got_content = TRUE;
427                         }
428
429                         /* link */
430                         if( !xmlStrcmp(n->name, "link") ) {
431                                 link_type = xmlGetProp(n, "type");
432                                 link_rel = xmlGetProp(n, "rel");
433                                 link_href = xmlGetProp(n, "href");
434                                 tmp = xmlGetProp(n, "length");
435                                 link_size = (tmp ? atoi(tmp) : 0);
436                                 g_free(tmp);
437
438                                 if( !link_rel || (link_rel && !xmlStrcmp(link_rel, "alternate")) ) {
439                                         fitem->link = link_href;
440                                         debug_print("RSSyl: XML - Atom item link: '%s'\n", fitem->link);
441                                         xmlFree(link_type);
442                                         xmlFree(link_rel);
443                                 } else if( link_rel && !xmlStrcmp(link_rel, "enclosure") ) {
444                                         debug_print("RSSyl: XML - Atom item enclosure: '%s' (%ld) [%s]\n",
445                                                         link_href, link_size, link_type);
446                                         media = g_new(RSSylFeedItemMedia, 1);
447                                         media->url = link_href;
448                                         media->type = link_type;
449                                         media->size = link_size;
450                                         fitem->media = media;
451                                         xmlFree(link_rel);
452                                 } else {
453                                         xmlFree(link_type);
454                                         xmlFree(link_rel);
455                                         xmlFree(link_href);
456                                 }
457                         }
458
459                         /* Date published - ISO8701 format */
460                         if( !xmlStrcmp(n->name, "published") ) {
461                                 content = xmlNodeGetContent(n);
462                                 fitem->date_published = parseISO8601Date(content);
463                                 xmlFree(content);
464                                 debug_print("RSSyl: XML - Atom item 'issued' date found\n" );
465                         }
466
467                         /* Date modified - ISO8701 format */
468                         if( !xmlStrcmp(n->name, "updated") ) {
469                                 content = xmlNodeGetContent(n);
470                                 fitem->date = parseISO8601Date(content);
471                                 xmlFree(content);
472                                 debug_print("RSSyl: XML - Atom item 'updated' date found\n" );
473                         }
474
475                         /* Author */
476                         if( !xmlStrcmp(n->name, "author") ) {
477                                 xmlNodePtr subnode;
478                                 gchar *name = NULL, *mail = NULL;
479                                 gchar *tmp;
480                                 for (subnode = n->children; subnode; subnode = subnode->next) {
481                                         content = xmlNodeGetContent(subnode);
482                                         if (!xmlStrcmp(subnode->name, "name") && !name)
483                                                 name = g_strdup(content);
484                                         if (!xmlStrcmp(subnode->name, "email") && !mail)
485                                                 mail = g_strdup(content);
486                                         xmlFree(content);
487                                 }
488                                 tmp = g_strdup_printf("%s%s%s%s%s",
489                                                         name ? name:"",
490                                                         name && mail ? " <":(mail?"<":""),
491                                                         mail ? mail:"",
492                                                         mail ? ">":"",
493                                                         !name && !mail ? "N/A":"");
494                                 fitem->author = rssyl_format_string(tmp, TRUE, TRUE);
495                                 g_free(tmp);
496                                 g_free(name);
497                                 g_free(mail);
498                                 debug_print("RSSyl: XML - Atom item author: '%s'\n", fitem->author);
499                         }
500
501                         /* Comments */
502                         if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss")) {
503                                 content = xmlNodeGetContent(n);
504                                 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
505                                 xmlFree(content);
506                                 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
507                         }
508                 } while( (n = n->next) != NULL);
509
510                 if( fitem->id && fitem->title && fitem->date ) {
511
512                         /* If no link is available, and we can safely guess ID
513                          * might be a (perma)link, mark it so. */
514                         if (!fitem->link && fitem->id   /* no url, but we have id */
515                                         && (!strncmp(fitem->id, "http:", 5) /* id looks like an url */
516                                                 || !strncmp(fitem->id, "https:", 6))) {
517                                 if (!ritem->url || strcmp(ritem->url, fitem->id)) {
518                                         /* id is different from feed url (good chance it is a permalink) */
519                                         debug_print("RSSyl: Marking ID as permalink\n");
520                                         fitem->id_is_permalink = TRUE;
521                                 }
522                         }
523
524                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
525                                 rssyl_free_feeditem(fitem);
526                                 fitem = NULL;
527                         }
528                         count++;
529                 } else
530                         debug_print("RSSyl: Incomplete Atom entry, need at least 'id', 'title' and 'updated' tags\n");
531         }
532
533         return count;
534 }