2 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3 * Copyright (C) 1999-2004 Hiroyuki Yamamoto
4 * This file (C) 2005 Andrej Kacian <andrej@kacian.sk>
6 * - various feed parsing functions
7 * - this file could use some sorting and/or splitting
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
29 #include <libxml/parser.h>
30 #include <libxml/xpath.h>
31 #include <libxml/HTMLtree.h>
35 #include "strreplace.h"
37 #include "procheader.h"
39 gint rssyl_parse_rdf(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
41 xmlNodePtr rnode, node, n;
42 RSSylFeedItem *fitem = NULL;
44 gchar *content = NULL;
45 g_return_val_if_fail(doc != NULL, 0);
46 g_return_val_if_fail(ritem != NULL, 0);
48 gchar *fetched = NULL;
49 #endif /* RSSYL_DEBUG */
51 if( ritem->contents == NULL )
52 rssyl_read_existing(ritem);
54 rnode = xmlDocGetRootElement(doc);
56 for( node = rnode->children; node; node = node->next ) {
57 if( !xmlStrcmp(node->name, "item") ) {
58 /* We've found an "item" tag, let's poke through its contents */
59 fitem = g_new0(RSSylFeedItem, 1);
62 fetched = xmlGetProp(rnode, "fetched");
63 fitem->debug_fetched = atoll(fetched);
65 #endif /* RSSYL_DEBUG */
67 for( n = node->children; n; n = n->next ) {
69 if( !xmlStrcmp(n->name, "title") ) {
70 content = xmlNodeGetContent(n);
71 fitem->title = rssyl_format_string(content, TRUE, TRUE);
73 debug_print("RSSyl: XML - RDF title is '%s'\n", fitem->title);
77 if( !xmlStrcmp(n->name, "description") ) {
78 content = xmlNodeGetContent(n);
79 fitem->text = rssyl_format_string(content, FALSE, FALSE);
81 debug_print("RSSyl: XML - got RDF text\n");
85 if( !xmlStrcmp(n->name, "link") ) {
86 content = xmlNodeGetContent(n);
87 fitem->link = rssyl_format_string(content, FALSE, TRUE);
89 debug_print("RSSyl: XML - RDF link is '%s'\n", fitem->link);
92 /* Date - rfc822 format */
93 if( !xmlStrcmp(n->name, "pubDate") ) {
94 content = xmlNodeGetContent(n);
95 fitem->date = procheader_date_parse(NULL, content, 0);
97 if( fitem->date > 0 ) {
98 debug_print("RSSyl: XML - RDF pubDate found\n" );
102 /* Date - ISO8701 format */
103 if( !xmlStrcmp(n->name, "date") &&
104 (!xmlStrcmp(n->ns->prefix, "ns")
105 || !xmlStrcmp(n->ns->prefix, "dc")) ) {
106 content = xmlNodeGetContent(n);
107 fitem->date = parseISO8601Date(content);
109 debug_print("RSSyl: XML - RDF date found\n" );
113 if( !xmlStrcmp(n->name, "creator") ) {
114 content = xmlNodeGetContent(n);
115 fitem->author = rssyl_format_string(content, TRUE, TRUE);
117 debug_print("RSSyl: XML - RDF author is '%s'\n", fitem->author);
122 if( fitem && fitem->link && fitem->title ) {
123 if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
124 rssyl_free_feeditem(fitem);
138 * This is where we parse the fetched rss document and create a
139 * RSSylFolderItem from it. Returns number of parsed items
141 gint rssyl_parse_rss(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
143 xmlXPathContextPtr context;
144 xmlXPathObjectPtr result;
145 xmlNodePtr node, n, rnode;
147 RSSylFeedItem *fitem = NULL;
149 gboolean got_encoded, got_author;
150 gchar *rootnode = NULL;
151 RSSylFeedItemMedia *media;
152 gchar *media_url, *media_type;
153 gulong media_size = 0;
155 gchar *fetched = NULL;
156 #endif /* RSSYL_DEBUG */
158 g_return_val_if_fail(doc != NULL, 0);
159 g_return_val_if_fail(ritem != NULL, 0);
161 if( ritem->contents == NULL )
162 rssyl_read_existing(ritem);
164 rnode = xmlDocGetRootElement(doc);
166 rootnode = g_ascii_strdown(rnode->name, -1);
167 xpath = g_strconcat("/", rootnode,
168 "/channel/item", NULL);
170 context = xmlXPathNewContext(doc);
171 if( !(result = xmlXPathEvalExpression(xpath, context)) ){
172 debug_print("RSSyl: XML - no result found for '%s'\n", xpath);
173 xmlXPathFreeContext(context);
180 for( i = 0; i < result->nodesetval->nodeNr; i++ ) {
181 node = result->nodesetval->nodeTab[i];
183 if ((n = node->children) == NULL)
186 fitem = g_new0(RSSylFeedItem, 1);
190 fetched = xmlGetProp(rnode, "fetched");
191 fitem->debug_fetched = atoll(fetched);
193 #endif /* RSSYL_DEBUG */
197 fitem->parent_link = g_strdup(parent);
202 gchar *content = NULL;
205 if( !xmlStrcmp(n->name, "title") ) {
206 content = xmlNodeGetContent(n);
207 fitem->title = rssyl_format_string(content, TRUE, TRUE);
209 debug_print("RSSyl: XML - item title: '%s'\n", fitem->title);
213 if( !xmlStrcmp(n->name, "description") ) {
214 if( (fitem->text == NULL) && (got_encoded == FALSE) ) {
215 content = xmlNodeGetContent(n);
216 debug_print("RSSyl: XML - item text (description) caught\n");
217 fitem->text = rssyl_format_string(content, FALSE, FALSE);
221 if( !xmlStrcmp(n->name, "encoded")
222 && !xmlStrcmp(n->ns->prefix, "content") ) {
223 debug_print("RSSyl: XML - item text (content) caught\n");
225 if (fitem->text != NULL)
226 g_free(fitem->text); /* free "description" */
228 content = xmlNodeGetContent(n);
229 fitem->text = rssyl_format_string(content, FALSE, FALSE);
234 /* URL link to the original post */
235 if( !xmlStrcmp(n->name, "link") &&
236 (!n->ns || !n->ns->prefix || !strlen(n->ns->prefix)) ) {
237 content = xmlNodeGetContent(n);
238 fitem->link = rssyl_format_string(content, FALSE, TRUE);
240 debug_print("RSSyl: XML - item link: '%s'\n", fitem->link);
243 /* GUID - sometimes used as link */
244 if( !xmlStrcmp(n->name, "guid") ) {
245 gchar *tmp = xmlGetProp(n, "isPermaLink");
246 content = xmlNodeGetContent(n);
247 fitem->id_is_permalink = FALSE;
248 if( !tmp || xmlStrcmp(tmp, "false") ) /* permalink? */
249 fitem->id_is_permalink = TRUE;
250 fitem->id = rssyl_format_string(content, FALSE, TRUE);
252 debug_print("RSSyl: XML - item guid: '%s'\n", fitem->id);
256 /* Date - rfc822 format */
257 if( !xmlStrcmp(n->name, "pubDate") ) {
258 content = xmlNodeGetContent(n);
259 fitem->date = procheader_date_parse(NULL, content, 0);
261 if( fitem->date > 0 ) {
262 debug_print("RSSyl: XML - item date found: %d\n", (gint)fitem->date);
266 /* Date - ISO8701 format */
267 if( !xmlStrcmp(n->name, "date") && !xmlStrcmp(n->ns->prefix, "dc") ) {
268 content = xmlNodeGetContent(n);
269 fitem->date = parseISO8601Date(content);
271 debug_print("RSSyl: XML - item date found\n" );
275 if( !xmlStrcmp(n->name, "author") ) {
276 content = xmlNodeGetContent(n);
277 fitem->author = rssyl_format_string(content, TRUE, TRUE);
279 debug_print("RSSyl: XML - item author: '%s'\n", fitem->author);
283 if( !xmlStrcmp(n->name, "creator")
284 && !xmlStrcmp(n->ns->prefix, "dc") && !got_author) {
285 content = xmlNodeGetContent(n);
286 fitem->author = rssyl_format_string(content, TRUE, TRUE);
288 debug_print("RSSyl: XML - item author (creator): '%s'\n", fitem->author);
291 /* Media enclosure */
292 if( !xmlStrcmp(n->name, "enclosure") ) {
293 gchar *tmp = xmlGetProp(n, "length");
294 media_url = xmlGetProp(n, "url");
295 media_type = xmlGetProp(n, "type");
296 media_size = (tmp ? atoi(tmp) : 0);
299 if( media_url != NULL &&
300 media_type != NULL &&
302 debug_print("RSSyl: XML - enclosure: '%s' [%s] (%ld)\n",
303 media_url, media_type, media_size);
304 media = g_new(RSSylFeedItemMedia, 1);
305 media->url = media_url;
306 media->type = media_type;
307 media->size = media_size;
308 fitem->media = media;
310 debug_print("RSSyl: XML - enclosure found, but some data is missing\n");
317 if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss") ) {
318 content = xmlNodeGetContent(n);
319 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
321 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
323 } while( (n = n->next) != NULL);
325 if( (fitem->link || fitem->id) && fitem->title ) {
326 if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
327 rssyl_free_feeditem(fitem);
334 xmlXPathFreeObject(result);
335 xmlXPathFreeContext(context);
340 /* rssyl_parse_atom()
342 * This is where we parse the fetched atom document and create a
343 * RSSylFolderItem from it. Returns number of parsed items
345 gint rssyl_parse_atom(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
347 xmlNodePtr node, n, h;
348 xmlBufferPtr buf = NULL;
350 RSSylFeedItem *fitem = NULL;
351 RSSylFeedItemMedia *media = NULL;
352 gchar *link_type, *link_href, *link_rel, *tmp, *content = NULL;
355 g_return_val_if_fail(doc != NULL, 0);
356 g_return_val_if_fail(ritem != NULL, 0);
358 if( ritem->contents == NULL )
359 rssyl_read_existing(ritem);
361 node = xmlDocGetRootElement(doc);
366 node = node->children;
368 for (; node; node = node->next) {
369 gboolean got_content = FALSE;
370 if (xmlStrcmp(node->name, "entry")) {
375 fitem = g_new0(RSSylFeedItem, 1);
377 fitem->date_published = 0;
381 fitem->parent_link = g_strdup(parent);
385 if( !xmlStrcmp(n->name, "title") ) {
386 content = xmlNodeGetContent(n);
387 fitem->title = rssyl_format_string(content, TRUE, TRUE);
389 debug_print("RSSyl: XML - Atom item title: '%s'\n", fitem->title);
393 if( !xmlStrcmp(n->name, "id") ) {
394 content = xmlNodeGetContent(n);
395 fitem->id = g_strdup_printf("%s%s", (parent?"comment-":""), content);
397 debug_print("RSSyl: XML - Atom id: '%s'\n", fitem->id);
401 if( !xmlStrcmp(n->name, "summary") && !got_content ) {
402 content = xmlNodeGetContent(n);
403 debug_print("RSSyl: XML - Atom item text (summary) caught\n");
404 fitem->text = rssyl_format_string(content, FALSE, FALSE);
408 if( !xmlStrcmp(n->name, "content") ) {
409 gchar *tmp = xmlGetProp(n, "type");
410 debug_print("RSSyl: XML - Atom item text (content) caught\n");
413 if( !xmlStrcmp(tmp, "xhtml")) {
414 for( h = n->children; h; h = h->next ) {
415 if( !xmlStrcmp(h->name, "div") ) {
416 buf = xmlBufferCreate();
417 htmlNodeDump(buf, doc, h);
418 content = g_strdup((gchar *)xmlBufferContent(buf));
423 content = xmlNodeGetContent(n);
425 fitem->text = rssyl_format_string(content, FALSE, FALSE);
431 if( !xmlStrcmp(n->name, "link") ) {
432 link_type = xmlGetProp(n, "type");
433 link_rel = xmlGetProp(n, "rel");
434 link_href = xmlGetProp(n, "href");
435 tmp = xmlGetProp(n, "length");
436 link_size = (tmp ? atoi(tmp) : 0);
439 if( !link_rel || (link_rel && !xmlStrcmp(link_rel, "alternate")) ) {
440 fitem->link = link_href;
441 debug_print("RSSyl: XML - Atom item link: '%s'\n", fitem->link);
444 } else if( link_rel && !xmlStrcmp(link_rel, "enclosure") ) {
445 debug_print("RSSyl: XML - Atom item enclosure: '%s' (%ld) [%s]\n",
446 link_href, link_size, link_type);
447 media = g_new(RSSylFeedItemMedia, 1);
448 media->url = link_href;
449 media->type = link_type;
450 media->size = link_size;
451 fitem->media = media;
460 /* Date published - ISO8701 format */
461 if( !xmlStrcmp(n->name, "published") ) {
462 content = xmlNodeGetContent(n);
463 fitem->date_published = parseISO8601Date(content);
465 debug_print("RSSyl: XML - Atom item 'issued' date found\n" );
468 /* Date modified - ISO8701 format */
469 if( !xmlStrcmp(n->name, "updated") ) {
470 content = xmlNodeGetContent(n);
471 fitem->date = parseISO8601Date(content);
473 debug_print("RSSyl: XML - Atom item 'updated' date found\n" );
477 if( !xmlStrcmp(n->name, "author") ) {
479 gchar *name = NULL, *mail = NULL;
481 for (subnode = n->children; subnode; subnode = subnode->next) {
482 content = xmlNodeGetContent(subnode);
483 if (!xmlStrcmp(subnode->name, "name") && !name)
484 name = g_strdup(content);
485 if (!xmlStrcmp(subnode->name, "email") && !mail)
486 mail = g_strdup(content);
489 tmp = g_strdup_printf("%s%s%s%s%s",
491 name && mail ? " <":(mail?"<":""),
494 !name && !mail ? "N/A":"");
495 fitem->author = rssyl_format_string(tmp, TRUE, TRUE);
499 debug_print("RSSyl: XML - Atom item author: '%s'\n", fitem->author);
503 if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss")) {
504 content = xmlNodeGetContent(n);
505 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
507 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
509 } while( (n = n->next) != NULL);
511 if( fitem->id && fitem->title && fitem->date ) {
513 /* If no link is available, and we can safely guess ID
514 * might be a (perma)link, mark it so. */
515 if (!fitem->link && fitem->id /* no url, but we have id */
516 && (!strncmp(fitem->id, "http:", 5) /* id looks like an url */
517 || !strncmp(fitem->id, "https:", 6))) {
518 if (!ritem->url || strcmp(ritem->url, fitem->id)) {
519 /* id is different from feed url (good chance it is a permalink) */
520 debug_print("RSSyl: Marking ID as permalink\n");
521 fitem->id_is_permalink = TRUE;
525 if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
526 rssyl_free_feeditem(fitem);
531 debug_print("RSSyl: Incomplete Atom entry, need at least 'id', 'title' and 'updated' tags\n");