When parsing RSS 2.0, ignore <link> tags with a namespace prefix.
[claws.git] / src / plugins / rssyl / parsers.c
1 /*
2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 1999-2004 Hiroyuki Yamamoto
4  * This file (C) 2005 Andrej Kacian <andrej@kacian.sk>
5  *
6  * - various feed parsing functions
7  * - this file could use some sorting and/or splitting
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22  */
23
24 #ifdef HAVE_CONFIG_H
25 #  include "config.h"
26 #endif
27
28 #include <glib.h>
29 #include <libxml/parser.h>
30 #include <libxml/xpath.h>
31 #include <libxml/HTMLtree.h>
32
33 #include "date.h"
34 #include "feed.h"
35 #include "strreplace.h"
36 #include "utils.h"
37 #include "procheader.h"
38
39 gint rssyl_parse_rdf(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
40 {
41         xmlNodePtr rnode, node, n;
42         RSSylFeedItem *fitem = NULL;
43         gint count = 0;
44         gchar *content = NULL;
45         g_return_val_if_fail(doc != NULL, 0);
46         g_return_val_if_fail(ritem != NULL, 0);
47 #ifdef RSSYL_DEBUG
48         gchar *fetched = NULL;
49 #endif  /* RSSYL_DEBUG */
50
51         if( ritem->contents == NULL )
52                 rssyl_read_existing(ritem);
53
54         rnode = xmlDocGetRootElement(doc);
55
56         for( node = rnode->children; node; node = node->next ) {
57                 if( !xmlStrcmp(node->name, "item") ) {
58                         /* We've found an "item" tag, let's poke through its contents */
59                         fitem = g_new0(RSSylFeedItem, 1);
60                         fitem->date = 0;
61 #ifdef RSSYL_DEBUG
62                         fetched = xmlGetProp(rnode, "fetched");
63                         fitem->debug_fetched = atoll(fetched);
64                         xmlFree(fetched);
65 #endif  /* RSSYL_DEBUG */
66
67                         for( n = node->children; n; n = n->next ) {
68                                 /* Title */
69                                 if( !xmlStrcmp(n->name, "title") ) {
70                                         content = xmlNodeGetContent(n);
71                                         fitem->title = rssyl_format_string(content, TRUE, TRUE);
72                                         xmlFree(content);
73                                         debug_print("RSSyl: XML - RDF title is '%s'\n", fitem->title);
74                                 }
75
76                                 /* Text */
77                                 if( !xmlStrcmp(n->name, "description") ) {
78                                         content = xmlNodeGetContent(n);
79                                         fitem->text = rssyl_format_string(content, FALSE, FALSE);
80                                         xmlFree(content);
81                                         debug_print("RSSyl: XML - got RDF text\n");
82                                 }
83
84                                 /* URL */
85                                 if( !xmlStrcmp(n->name, "link") ) {
86                                         content = xmlNodeGetContent(n);
87                                         fitem->link = rssyl_format_string(content, FALSE, TRUE);
88                                         xmlFree(content);
89                                         debug_print("RSSyl: XML - RDF link is '%s'\n", fitem->link);
90                                 }
91
92                                 /* Date - rfc822 format */
93                                 if( !xmlStrcmp(n->name, "pubDate") ) {
94                                         content = xmlNodeGetContent(n);
95                                         fitem->date = procheader_date_parse(NULL, content, 0);
96                                         xmlFree(content);
97                                         if( fitem->date > 0 ) {
98                                                 debug_print("RSSyl: XML - RDF pubDate found\n" );
99                                         } else
100                                                 fitem->date = 0;
101                                 }
102                                 /* Date - ISO8701 format */
103                                 if( !xmlStrcmp(n->name, "date") &&
104                                                 (!xmlStrcmp(n->ns->prefix, "ns")
105                                                  || !xmlStrcmp(n->ns->prefix, "dc")) ) {
106                                         content = xmlNodeGetContent(n);
107                                         fitem->date = parseISO8601Date(content);
108                                         xmlFree(content);
109                                         debug_print("RSSyl: XML - RDF date found\n" );
110                                 }
111
112                                 /* Author */
113                                 if( !xmlStrcmp(n->name, "creator") ) {
114                                         content = xmlNodeGetContent(n);
115                                         fitem->author = rssyl_format_string(content, TRUE, TRUE);
116                                         xmlFree(content);
117                                         debug_print("RSSyl: XML - RDF author is '%s'\n", fitem->author);
118                                 }
119                         }
120                 }
121
122                 if( fitem && fitem->link && fitem->title ) {
123                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
124                                 rssyl_free_feeditem(fitem);
125                                 fitem = NULL;
126                         }
127                         fitem = NULL;
128                         count++;
129                 }
130         }
131
132         return count;
133 }
134
135
136 /* rssyl_parse_rss()
137  *
138  * This is where we parse the fetched rss document and create a
139  * RSSylFolderItem from it. Returns number of parsed items
140  */
141 gint rssyl_parse_rss(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
142 {
143         xmlXPathContextPtr context;
144         xmlXPathObjectPtr result;
145         xmlNodePtr node, n, rnode;
146         gint i, count = 0;
147         RSSylFeedItem *fitem = NULL;
148         gchar *xpath;
149         gboolean got_encoded, got_author;
150         gchar *rootnode = NULL;
151         RSSylFeedItemMedia *media;
152         gchar *media_url, *media_type;
153         gulong media_size = 0;
154 #ifdef RSSYL_DEBUG
155         gchar *fetched = NULL;
156 #endif  /* RSSYL_DEBUG */
157
158         g_return_val_if_fail(doc != NULL, 0);
159         g_return_val_if_fail(ritem != NULL, 0);
160
161         if( ritem->contents == NULL )
162                 rssyl_read_existing(ritem);
163
164         rnode = xmlDocGetRootElement(doc);
165
166         rootnode = g_ascii_strdown(rnode->name, -1);
167         xpath = g_strconcat("/", rootnode,
168                                 "/channel/item",        NULL);
169         g_free(rootnode);
170         context = xmlXPathNewContext(doc);
171         if( !(result = xmlXPathEvalExpression(xpath, context)) ){
172                 debug_print("RSSyl: XML - no result found for '%s'\n", xpath);
173                 xmlXPathFreeContext(context);
174                 g_free(xpath);
175                 return 0;
176         }
177
178         g_free(xpath);
179
180         for( i = 0; i < result->nodesetval->nodeNr; i++ ) {
181                 node = result->nodesetval->nodeTab[i];
182                 
183                 if ((n = node->children) == NULL)
184                         continue;
185
186                 fitem = g_new0(RSSylFeedItem, 1);
187                 fitem->media = NULL;
188                 fitem->date = 0;
189 #ifdef RSSYL_DEBUG
190                 fetched = xmlGetProp(rnode, "fetched");
191                 fitem->debug_fetched = atoll(fetched);
192                 xmlFree(fetched);
193 #endif  /* RSSYL_DEBUG */
194                 fitem->text = NULL;
195                 
196                 if (parent)
197                         fitem->parent_link = g_strdup(parent);
198
199                 got_encoded = FALSE;
200                 got_author = FALSE;
201                 do {
202                         gchar *content = NULL;
203
204                         /* Title */
205                         if( !xmlStrcmp(n->name, "title") ) {
206                                 content = xmlNodeGetContent(n);
207                                 fitem->title = rssyl_format_string(content, TRUE, TRUE);
208                                 xmlFree(content);
209                                 debug_print("RSSyl: XML - item title: '%s'\n", fitem->title);
210                         }
211
212                         /* Text */
213                         if( !xmlStrcmp(n->name, "description") ) {
214                                 if( (fitem->text == NULL) && (got_encoded == FALSE) ) {
215                                         content = xmlNodeGetContent(n);
216                                         debug_print("RSSyl: XML - item text (description) caught\n");
217                                         fitem->text = rssyl_format_string(content, FALSE, FALSE);
218                                         xmlFree(content);
219                                 }
220                         }
221                         if( !xmlStrcmp(n->name, "encoded")
222                                         && !xmlStrcmp(n->ns->prefix, "content") ) {
223                                 debug_print("RSSyl: XML - item text (content) caught\n");
224
225                                 if (fitem->text != NULL)
226                                         g_free(fitem->text); /* free "description" */
227                                         
228                                 content = xmlNodeGetContent(n);
229                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
230                                 xmlFree(content);
231                                 got_encoded = TRUE;
232                         }
233
234                         /* URL link to the original post */
235                         if( !xmlStrcmp(n->name, "link") &&
236                                         (!n->ns || !n->ns->prefix || !strlen(n->ns->prefix)) ) {
237                                 content = xmlNodeGetContent(n);
238                                 fitem->link = rssyl_format_string(content, FALSE, TRUE);
239                                 xmlFree(content);
240                                 debug_print("RSSyl: XML - item link: '%s'\n", fitem->link);
241                         }
242
243                         /* GUID - sometimes used as link */
244                         if( !xmlStrcmp(n->name, "guid") ) {
245                                 gchar *tmp = xmlGetProp(n, "isPermaLink");
246                                 content = xmlNodeGetContent(n);
247                                 fitem->id_is_permalink = FALSE;
248                                 if( !tmp || xmlStrcmp(tmp, "false") )   /* permalink? */
249                                         fitem->id_is_permalink = TRUE;
250                                 fitem->id = rssyl_format_string(content, FALSE, TRUE);
251                                 xmlFree(content);
252                                 debug_print("RSSyl: XML - item guid: '%s'\n", fitem->id);
253                                 xmlFree(tmp);
254                         }
255
256                         /* Date - rfc822 format */
257                         if( !xmlStrcmp(n->name, "pubDate") ) {
258                                 content = xmlNodeGetContent(n);
259                                 fitem->date = procheader_date_parse(NULL, content, 0);
260                                 xmlFree(content);
261                                 if( fitem->date > 0 ) {
262                                         debug_print("RSSyl: XML - item date found: %d\n", (gint)fitem->date);
263                                 } else
264                                         fitem->date = 0;
265                         }
266                         /* Date - ISO8701 format */
267                         if( !xmlStrcmp(n->name, "date") && !xmlStrcmp(n->ns->prefix, "dc") ) {
268                                 content = xmlNodeGetContent(n);
269                                 fitem->date = parseISO8601Date(content);
270                                 xmlFree(content);
271                                 debug_print("RSSyl: XML - item date found\n" );
272                         }
273
274                         /* Author */
275                         if( !xmlStrcmp(n->name, "author") ) {
276                                 content = xmlNodeGetContent(n);
277                                 fitem->author = rssyl_format_string(content, TRUE, TRUE);
278                                 xmlFree(content);
279                                 debug_print("RSSyl: XML - item author: '%s'\n", fitem->author);
280                                 got_author = TRUE;
281                         }
282
283                         if( !xmlStrcmp(n->name, "creator")
284                                         && !xmlStrcmp(n->ns->prefix, "dc") && !got_author) {
285                                 content = xmlNodeGetContent(n);
286                                 fitem->author = rssyl_format_string(content, TRUE, TRUE);
287                                 xmlFree(content);
288                                 debug_print("RSSyl: XML - item author (creator): '%s'\n", fitem->author);
289                         }
290
291                         /* Media enclosure */
292                         if( !xmlStrcmp(n->name, "enclosure") ) {
293                                 gchar *tmp = xmlGetProp(n, "length");
294                                 media_url = xmlGetProp(n, "url");
295                                 media_type = xmlGetProp(n, "type");
296                                 media_size = (tmp ? atoi(tmp) : 0);
297                                 xmlFree(tmp);
298
299                                 if( media_url != NULL &&
300                                                 media_type != NULL &&
301                                                 media_size != 0 ) {
302                                         debug_print("RSSyl: XML - enclosure: '%s' [%s] (%ld)\n",
303                                                         media_url, media_type, media_size);
304                                         media = g_new(RSSylFeedItemMedia, 1);
305                                         media->url = media_url;
306                                         media->type = media_type;
307                                         media->size = media_size;
308                                         fitem->media = media;
309                                 } else {
310                                         debug_print("RSSyl: XML - enclosure found, but some data is missing\n");
311                                         g_free(media_url);
312                                         g_free(media_type);
313                                 }
314                         }
315
316                         /* Comments */
317                         if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss") ) {
318                                 content = xmlNodeGetContent(n);
319                                 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
320                                 xmlFree(content);
321                                 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
322                         }
323                 } while( (n = n->next) != NULL);
324
325                 if( (fitem->link || fitem->id) && fitem->title ) {
326                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
327                                 rssyl_free_feeditem(fitem);
328                                 fitem = NULL;
329                         }
330                         count++;
331                 }
332         }
333
334         xmlXPathFreeObject(result);
335         xmlXPathFreeContext(context);
336
337         return count;
338 }
339
340 /* rssyl_parse_atom()
341  *
342  * This is where we parse the fetched atom document and create a
343  * RSSylFolderItem from it. Returns number of parsed items
344  */
345 gint rssyl_parse_atom(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
346 {
347         xmlNodePtr node, n, h;
348         xmlBufferPtr buf = NULL;
349         gint count = 0;
350         RSSylFeedItem *fitem = NULL;
351         RSSylFeedItemMedia *media = NULL;
352         gchar *link_type, *link_href, *link_rel, *tmp, *content = NULL;
353         gulong link_size;
354
355         g_return_val_if_fail(doc != NULL, 0);
356         g_return_val_if_fail(ritem != NULL, 0);
357
358         if( ritem->contents == NULL )
359                 rssyl_read_existing(ritem);
360
361         node = xmlDocGetRootElement(doc);
362
363         if (node == NULL)
364                 return 0;
365
366         node = node->children;
367
368         for (; node; node = node->next) {
369                 gboolean got_content = FALSE;
370                 if (xmlStrcmp(node->name, "entry")) {
371                         continue;
372                 }
373         
374                 n = node->children;
375                 fitem = g_new0(RSSylFeedItem, 1);
376                 fitem->date = 0;
377                 fitem->date_published = 0;
378                 fitem->text = NULL;
379                 
380                 if (parent)
381                         fitem->parent_link = g_strdup(parent);
382
383                 do {
384                         /* Title */
385                         if( !xmlStrcmp(n->name, "title") ) {
386                                 content = xmlNodeGetContent(n);
387                                 fitem->title = rssyl_format_string(content, TRUE, TRUE);
388                                 xmlFree(content);
389                                 debug_print("RSSyl: XML - Atom item title: '%s'\n", fitem->title);
390                         }
391
392                         /* ID */
393                         if( !xmlStrcmp(n->name, "id") ) {
394                                 content = xmlNodeGetContent(n);
395                                 fitem->id = g_strdup_printf("%s%s", (parent?"comment-":""), content);
396                                 xmlFree(content);
397                                 debug_print("RSSyl: XML - Atom id: '%s'\n", fitem->id);
398                         }
399
400                         /* Text */
401                         if( !xmlStrcmp(n->name, "summary") && !got_content ) {
402                                 content = xmlNodeGetContent(n);
403                                 debug_print("RSSyl: XML - Atom item text (summary) caught\n");
404                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
405                                 xmlFree(content);
406                         }
407
408                         if( !xmlStrcmp(n->name, "content") ) {
409                                 gchar *tmp = xmlGetProp(n, "type");
410                                 debug_print("RSSyl: XML - Atom item text (content) caught\n");
411                                 if (fitem->text)
412                                         g_free(fitem->text);
413                                 if( !xmlStrcmp(tmp, "xhtml")) {
414                                         for( h = n->children; h; h = h->next ) {
415                                                 if( !xmlStrcmp(h->name, "div") ) {
416                                                         buf = xmlBufferCreate();
417                                                         htmlNodeDump(buf, doc, h);
418                                                         content = g_strdup((gchar *)xmlBufferContent(buf));
419                                                         xmlBufferFree(buf);
420                                                 }
421                                         }
422                                 } else
423                                         content = xmlNodeGetContent(n);
424                                 xmlFree(tmp);
425                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
426                                 xmlFree(content);
427                                 got_content = TRUE;
428                         }
429
430                         /* link */
431                         if( !xmlStrcmp(n->name, "link") ) {
432                                 link_type = xmlGetProp(n, "type");
433                                 link_rel = xmlGetProp(n, "rel");
434                                 link_href = xmlGetProp(n, "href");
435                                 tmp = xmlGetProp(n, "length");
436                                 link_size = (tmp ? atoi(tmp) : 0);
437                                 g_free(tmp);
438
439                                 if( !link_rel || (link_rel && !xmlStrcmp(link_rel, "alternate")) ) {
440                                         fitem->link = link_href;
441                                         debug_print("RSSyl: XML - Atom item link: '%s'\n", fitem->link);
442                                         xmlFree(link_type);
443                                         xmlFree(link_rel);
444                                 } else if( link_rel && !xmlStrcmp(link_rel, "enclosure") ) {
445                                         debug_print("RSSyl: XML - Atom item enclosure: '%s' (%ld) [%s]\n",
446                                                         link_href, link_size, link_type);
447                                         media = g_new(RSSylFeedItemMedia, 1);
448                                         media->url = link_href;
449                                         media->type = link_type;
450                                         media->size = link_size;
451                                         fitem->media = media;
452                                         xmlFree(link_rel);
453                                 } else {
454                                         xmlFree(link_type);
455                                         xmlFree(link_rel);
456                                         xmlFree(link_href);
457                                 }
458                         }
459
460                         /* Date published - ISO8701 format */
461                         if( !xmlStrcmp(n->name, "published") ) {
462                                 content = xmlNodeGetContent(n);
463                                 fitem->date_published = parseISO8601Date(content);
464                                 xmlFree(content);
465                                 debug_print("RSSyl: XML - Atom item 'issued' date found\n" );
466                         }
467
468                         /* Date modified - ISO8701 format */
469                         if( !xmlStrcmp(n->name, "updated") ) {
470                                 content = xmlNodeGetContent(n);
471                                 fitem->date = parseISO8601Date(content);
472                                 xmlFree(content);
473                                 debug_print("RSSyl: XML - Atom item 'updated' date found\n" );
474                         }
475
476                         /* Author */
477                         if( !xmlStrcmp(n->name, "author") ) {
478                                 xmlNodePtr subnode;
479                                 gchar *name = NULL, *mail = NULL;
480                                 gchar *tmp;
481                                 for (subnode = n->children; subnode; subnode = subnode->next) {
482                                         content = xmlNodeGetContent(subnode);
483                                         if (!xmlStrcmp(subnode->name, "name") && !name)
484                                                 name = g_strdup(content);
485                                         if (!xmlStrcmp(subnode->name, "email") && !mail)
486                                                 mail = g_strdup(content);
487                                         xmlFree(content);
488                                 }
489                                 tmp = g_strdup_printf("%s%s%s%s%s",
490                                                         name ? name:"",
491                                                         name && mail ? " <":(mail?"<":""),
492                                                         mail ? mail:"",
493                                                         mail ? ">":"",
494                                                         !name && !mail ? "N/A":"");
495                                 fitem->author = rssyl_format_string(tmp, TRUE, TRUE);
496                                 g_free(tmp);
497                                 g_free(name);
498                                 g_free(mail);
499                                 debug_print("RSSyl: XML - Atom item author: '%s'\n", fitem->author);
500                         }
501
502                         /* Comments */
503                         if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss")) {
504                                 content = xmlNodeGetContent(n);
505                                 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
506                                 xmlFree(content);
507                                 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
508                         }
509                 } while( (n = n->next) != NULL);
510
511                 if( fitem->id && fitem->title && fitem->date ) {
512
513                         /* If no link is available, and we can safely guess ID
514                          * might be a (perma)link, mark it so. */
515                         if (!fitem->link && fitem->id   /* no url, but we have id */
516                                         && (!strncmp(fitem->id, "http:", 5) /* id looks like an url */
517                                                 || !strncmp(fitem->id, "https:", 6))) {
518                                 if (!ritem->url || strcmp(ritem->url, fitem->id)) {
519                                         /* id is different from feed url (good chance it is a permalink) */
520                                         debug_print("RSSyl: Marking ID as permalink\n");
521                                         fitem->id_is_permalink = TRUE;
522                                 }
523                         }
524
525                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
526                                 rssyl_free_feeditem(fitem);
527                                 fitem = NULL;
528                         }
529                         count++;
530                 } else
531                         debug_print("RSSyl: Incomplete Atom entry, need at least 'id', 'title' and 'updated' tags\n");
532         }
533
534         return count;
535 }