src/plugins/rssyl/parsers.c

   1 /*
   2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
   3  * Copyright (C) 1999-2004 Hiroyuki Yamamoto
   4  * This file (C) 2005 Andrej Kacian <andrej@kacian.sk>
   5  *
   6  * - various feed parsing functions
   7  * - this file could use some sorting and/or splitting
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #  include "config.h"
  26 #endif
  27
  28 #include <glib.h>
  29 #include <libxml/parser.h>
  30 #include <libxml/xpath.h>
  31 #include <libxml/HTMLtree.h>
  32
  33 #include "date.h"
  34 #include "feed.h"
  35 #include "strreplace.h"
  36 #include "utils.h"
  37 #include "procheader.h"
  38
  39 gint rssyl_parse_rdf(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
  40 {
  41         xmlNodePtr rnode, node, n;
  42         RSSylFeedItem *fitem = NULL;
  43         gint count = 0;
  44         gchar *content = NULL;
  45         g_return_val_if_fail(doc != NULL, 0);
  46         g_return_val_if_fail(ritem != NULL, 0);
  47 #ifdef RSSYL_DEBUG
  48         gchar *fetched = NULL;
  49 #endif  /* RSSYL_DEBUG */
  50
  51         if( ritem->contents == NULL )
  52                 rssyl_read_existing(ritem);
  53
  54         rnode = xmlDocGetRootElement(doc);
  55
  56         for( node = rnode->children; node; node = node->next ) {
  57                 if( !xmlStrcmp(node->name, "item") ) {
  58                         /* We've found an "item" tag, let's poke through its contents */
  59                         fitem = g_new0(RSSylFeedItem, 1);
  60                         fitem->date = 0;
  61 #ifdef RSSYL_DEBUG
  62                         fetched = xmlGetProp(rnode, "fetched");
  63                         fitem->debug_fetched = atoll(fetched);
  64                         xmlFree(fetched);
  65 #endif  /* RSSYL_DEBUG */
  66
  67                         for( n = node->children; n; n = n->next ) {
  68                                 /* Title */
  69                                 if( !xmlStrcmp(n->name, "title") ) {
  70                                         content = xmlNodeGetContent(n);
  71                                         fitem->title = rssyl_format_string(content, TRUE, TRUE);
  72                                         xmlFree(content);
  73                                         debug_print("RSSyl: XML - RDF title is '%s'\n", fitem->title);
  74                                 }
  75
  76                                 /* Text */
  77                                 if( !xmlStrcmp(n->name, "description") ) {
  78                                         content = xmlNodeGetContent(n);
  79                                         fitem->text = rssyl_format_string(content, FALSE, FALSE);
  80                                         xmlFree(content);
  81                                         debug_print("RSSyl: XML - got RDF text\n");
  82                                 }
  83
  84                                 /* URL */
  85                                 if( !xmlStrcmp(n->name, "link") ) {
  86                                         content = xmlNodeGetContent(n);
  87                                         fitem->link = rssyl_format_string(content, FALSE, TRUE);
  88                                         xmlFree(content);
  89                                         debug_print("RSSyl: XML - RDF link is '%s'\n", fitem->link);
  90                                 }
  91
  92                                 /* Date - rfc822 format */
  93                                 if( !xmlStrcmp(n->name, "pubDate") ) {
  94                                         content = xmlNodeGetContent(n);
  95                                         fitem->date = procheader_date_parse(NULL, content, 0);
  96                                         xmlFree(content);
  97                                         if( fitem->date > 0 ) {
  98                                                 debug_print("RSSyl: XML - RDF pubDate found\n" );
  99                                         } else
 100                                                 fitem->date = 0;
 101                                 }
 102                                 /* Date - ISO8701 format */
 103                                 if( !xmlStrcmp(n->name, "date") &&
 104                                                 (!xmlStrcmp(n->ns->prefix, "ns")
 105                                                  || !xmlStrcmp(n->ns->prefix, "dc")) ) {
 106                                         content = xmlNodeGetContent(n);
 107                                         fitem->date = parseISO8601Date(content);
 108                                         xmlFree(content);
 109                                         debug_print("RSSyl: XML - RDF date found\n" );
 110                                 }
 111
 112                                 /* Author */
 113                                 if( !xmlStrcmp(n->name, "creator") ) {
 114                                         content = xmlNodeGetContent(n);
 115                                         fitem->author = rssyl_format_string(content, TRUE, TRUE);
 116                                         xmlFree(content);
 117                                         debug_print("RSSyl: XML - RDF author is '%s'\n", fitem->author);
 118                                 }
 119                         }
 120                 }
 121
 122                 if( fitem && fitem->link && fitem->title ) {
 123                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
 124                                 rssyl_free_feeditem(fitem);
 125                                 fitem = NULL;
 126                         }
 127                         fitem = NULL;
 128                         count++;
 129                 }
 130         }
 131
 132         return count;
 133 }
 134
 135
 136 /* rssyl_parse_rss()
 137  *
 138  * This is where we parse the fetched rss document and create a
 139  * RSSylFolderItem from it. Returns number of parsed items
 140  */
 141 gint rssyl_parse_rss(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
 142 {
 143         xmlXPathContextPtr context;
 144         xmlXPathObjectPtr result;
 145         xmlNodePtr node, n, rnode;
 146         gint i, count = 0;
 147         RSSylFeedItem *fitem = NULL;
 148         gchar *xpath;
 149         gboolean got_encoded, got_author;
 150         gchar *rootnode = NULL;
 151         RSSylFeedItemMedia *media;
 152         gchar *media_url, *media_type;
 153         gulong media_size = 0;
 154 #ifdef RSSYL_DEBUG
 155         gchar *fetched = NULL;
 156 #endif  /* RSSYL_DEBUG */
 157
 158         g_return_val_if_fail(doc != NULL, 0);
 159         g_return_val_if_fail(ritem != NULL, 0);
 160
 161         if( ritem->contents == NULL )
 162                 rssyl_read_existing(ritem);
 163
 164         rnode = xmlDocGetRootElement(doc);
 165
 166         rootnode = g_ascii_strdown(rnode->name, -1);
 167         xpath = g_strconcat("/", rootnode,
 168                                 "/channel/item",        NULL);
 169         g_free(rootnode);
 170         context = xmlXPathNewContext(doc);
 171         if( !(result = xmlXPathEvalExpression(xpath, context)) ){
 172                 debug_print("RSSyl: XML - no result found for '%s'\n", xpath);
 173                 xmlXPathFreeContext(context);
 174                 g_free(xpath);
 175                 return 0;
 176         }
 177
 178         g_free(xpath);
 179
 180         for( i = 0; i < result->nodesetval->nodeNr; i++ ) {
 181                 node = result->nodesetval->nodeTab[i];
 182
 183                 if ((n = node->children) == NULL)
 184                         continue;
 185
 186                 fitem = g_new0(RSSylFeedItem, 1);
 187                 fitem->media = NULL;
 188                 fitem->date = 0;
 189 #ifdef RSSYL_DEBUG
 190                 fetched = xmlGetProp(rnode, "fetched");
 191                 fitem->debug_fetched = atoll(fetched);
 192                 xmlFree(fetched);
 193 #endif  /* RSSYL_DEBUG */
 194                 fitem->text = NULL;
 195
 196                 if (parent)
 197                         fitem->parent_link = g_strdup(parent);
 198
 199                 got_encoded = FALSE;
 200                 got_author = FALSE;
 201                 do {
 202                         gchar *content = NULL;
 203
 204                         /* Title */
 205                         if( !xmlStrcmp(n->name, "title") ) {
 206                                 content = xmlNodeGetContent(n);
 207                                 fitem->title = rssyl_format_string(content, TRUE, TRUE);
 208                                 xmlFree(content);
 209                                 debug_print("RSSyl: XML - item title: '%s'\n", fitem->title);
 210                         }
 211
 212                         /* Text */
 213                         if( !xmlStrcmp(n->name, "description") ) {
 214                                 if( (fitem->text == NULL) && (got_encoded == FALSE) ) {
 215                                         content = xmlNodeGetContent(n);
 216                                         debug_print("RSSyl: XML - item text (description) caught\n");
 217                                         fitem->text = rssyl_format_string(content, FALSE, FALSE);
 218                                         xmlFree(content);
 219                                 }
 220                         }
 221                         if( !xmlStrcmp(n->name, "encoded")
 222                                         && !xmlStrcmp(n->ns->prefix, "content") ) {
 223                                 debug_print("RSSyl: XML - item text (content) caught\n");
 224
 225                                 if (fitem->text != NULL)
 226                                         g_free(fitem->text); /* free "description" */
 227
 228                                 content = xmlNodeGetContent(n);
 229                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
 230                                 xmlFree(content);
 231                                 got_encoded = TRUE;
 232                         }
 233
 234                         /* URL link to the original post */
 235                         if( !xmlStrcmp(n->name, "link") &&
 236                                         (!n->ns || !n->ns->prefix || !strlen(n->ns->prefix)) ) {
 237                                 content = xmlNodeGetContent(n);
 238                                 fitem->link = rssyl_format_string(content, FALSE, TRUE);
 239                                 xmlFree(content);
 240                                 debug_print("RSSyl: XML - item link: '%s'\n", fitem->link);
 241                         }
 242
 243                         /* GUID - sometimes used as link */
 244                         if( !xmlStrcmp(n->name, "guid") ) {
 245                                 gchar *tmp = xmlGetProp(n, "isPermaLink");
 246                                 content = xmlNodeGetContent(n);
 247                                 fitem->id_is_permalink = FALSE;
 248                                 if( !tmp || xmlStrcmp(tmp, "false") )   /* permalink? */
 249                                         fitem->id_is_permalink = TRUE;
 250                                 fitem->id = rssyl_format_string(content, FALSE, TRUE);
 251                                 xmlFree(content);
 252                                 debug_print("RSSyl: XML - item guid: '%s'\n", fitem->id);
 253                                 xmlFree(tmp);
 254                         }
 255
 256                         /* Date - rfc822 format */
 257                         if( !xmlStrcmp(n->name, "pubDate") ) {
 258                                 content = xmlNodeGetContent(n);
 259                                 fitem->date = procheader_date_parse(NULL, content, 0);
 260                                 xmlFree(content);
 261                                 if( fitem->date > 0 ) {
 262                                         debug_print("RSSyl: XML - item date found: %d\n", (gint)fitem->date);
 263                                 } else
 264                                         fitem->date = 0;
 265                         }
 266                         /* Date - ISO8701 format */
 267                         if( !xmlStrcmp(n->name, "date") && !xmlStrcmp(n->ns->prefix, "dc") ) {
 268                                 content = xmlNodeGetContent(n);
 269                                 fitem->date = parseISO8601Date(content);
 270                                 xmlFree(content);
 271                                 debug_print("RSSyl: XML - item date found\n" );
 272                         }
 273
 274                         /* Author */
 275                         if( !xmlStrcmp(n->name, "author") ) {
 276                                 content = xmlNodeGetContent(n);
 277                                 fitem->author = rssyl_format_string(content, TRUE, TRUE);
 278                                 xmlFree(content);
 279                                 debug_print("RSSyl: XML - item author: '%s'\n", fitem->author);
 280                                 got_author = TRUE;
 281                         }
 282
 283                         if( !xmlStrcmp(n->name, "creator")
 284                                         && !xmlStrcmp(n->ns->prefix, "dc") && !got_author) {
 285                                 content = xmlNodeGetContent(n);
 286                                 fitem->author = rssyl_format_string(content, TRUE, TRUE);
 287                                 xmlFree(content);
 288                                 debug_print("RSSyl: XML - item author (creator): '%s'\n", fitem->author);
 289                         }
 290
 291                         /* Media enclosure */
 292                         if( !xmlStrcmp(n->name, "enclosure") ) {
 293                                 gchar *tmp = xmlGetProp(n, "length");
 294                                 media_url = xmlGetProp(n, "url");
 295                                 media_type = xmlGetProp(n, "type");
 296                                 media_size = (tmp ? atoi(tmp) : 0);
 297                                 xmlFree(tmp);
 298
 299                                 if( media_url != NULL &&
 300                                                 media_type != NULL &&
 301                                                 media_size != 0 ) {
 302                                         debug_print("RSSyl: XML - enclosure: '%s' [%s] (%ld)\n",
 303                                                         media_url, media_type, media_size);
 304                                         media = g_new(RSSylFeedItemMedia, 1);
 305                                         media->url = media_url;
 306                                         media->type = media_type;
 307                                         media->size = media_size;
 308                                         fitem->media = media;
 309                                 } else {
 310                                         debug_print("RSSyl: XML - enclosure found, but some data is missing\n");
 311                                         g_free(media_url);
 312                                         g_free(media_type);
 313                                 }
 314                         }
 315
 316                         /* Comments */
 317                         if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss") ) {
 318                                 content = xmlNodeGetContent(n);
 319                                 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
 320                                 xmlFree(content);
 321                                 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
 322                         }
 323                 } while( (n = n->next) != NULL);
 324
 325                 if( (fitem->link || fitem->id) && fitem->title ) {
 326                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
 327                                 rssyl_free_feeditem(fitem);
 328                                 fitem = NULL;
 329                         }
 330                         count++;
 331                 }
 332         }
 333
 334         xmlXPathFreeObject(result);
 335         xmlXPathFreeContext(context);
 336
 337         return count;
 338 }
 339
 340 /* rssyl_parse_atom()
 341  *
 342  * This is where we parse the fetched atom document and create a
 343  * RSSylFolderItem from it. Returns number of parsed items
 344  */
 345 gint rssyl_parse_atom(xmlDocPtr doc, RSSylFolderItem *ritem, gchar *parent)
 346 {
 347         xmlNodePtr node, n, h;
 348         xmlBufferPtr buf = NULL;
 349         gint count = 0;
 350         RSSylFeedItem *fitem = NULL;
 351         RSSylFeedItemMedia *media = NULL;
 352         gchar *link_type, *link_href, *link_rel, *tmp, *content = NULL;
 353         gulong link_size;
 354
 355         g_return_val_if_fail(doc != NULL, 0);
 356         g_return_val_if_fail(ritem != NULL, 0);
 357
 358         if( ritem->contents == NULL )
 359                 rssyl_read_existing(ritem);
 360
 361         node = xmlDocGetRootElement(doc);
 362
 363         if (node == NULL)
 364                 return 0;
 365
 366         node = node->children;
 367
 368         for (; node; node = node->next) {
 369                 gboolean got_content = FALSE;
 370                 if (xmlStrcmp(node->name, "entry")) {
 371                         continue;
 372                 }
 373
 374                 n = node->children;
 375                 fitem = g_new0(RSSylFeedItem, 1);
 376                 fitem->date = 0;
 377                 fitem->date_published = 0;
 378                 fitem->text = NULL;
 379
 380                 if (parent)
 381                         fitem->parent_link = g_strdup(parent);
 382
 383                 do {
 384                         /* Title */
 385                         if( !xmlStrcmp(n->name, "title") ) {
 386                                 content = xmlNodeGetContent(n);
 387                                 fitem->title = rssyl_format_string(content, TRUE, TRUE);
 388                                 xmlFree(content);
 389                                 debug_print("RSSyl: XML - Atom item title: '%s'\n", fitem->title);
 390                         }
 391
 392                         /* ID */
 393                         if( !xmlStrcmp(n->name, "id") ) {
 394                                 content = xmlNodeGetContent(n);
 395                                 fitem->id = g_strdup_printf("%s%s", (parent?"comment-":""), content);
 396                                 xmlFree(content);
 397                                 debug_print("RSSyl: XML - Atom id: '%s'\n", fitem->id);
 398                         }
 399
 400                         /* Text */
 401                         if( !xmlStrcmp(n->name, "summary") && !got_content ) {
 402                                 content = xmlNodeGetContent(n);
 403                                 debug_print("RSSyl: XML - Atom item text (summary) caught\n");
 404                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
 405                                 xmlFree(content);
 406                         }
 407
 408                         if( !xmlStrcmp(n->name, "content") ) {
 409                                 gchar *tmp = xmlGetProp(n, "type");
 410                                 debug_print("RSSyl: XML - Atom item text (content) caught\n");
 411                                 if (fitem->text)
 412                                         g_free(fitem->text);
 413                                 if( !xmlStrcmp(tmp, "xhtml")) {
 414                                         for( h = n->children; h; h = h->next ) {
 415                                                 if( !xmlStrcmp(h->name, "div") ) {
 416                                                         buf = xmlBufferCreate();
 417                                                         htmlNodeDump(buf, doc, h);
 418                                                         content = g_strdup((gchar *)xmlBufferContent(buf));
 419                                                         xmlBufferFree(buf);
 420                                                 }
 421                                         }
 422                                 } else
 423                                         content = xmlNodeGetContent(n);
 424                                 xmlFree(tmp);
 425                                 fitem->text = rssyl_format_string(content, FALSE, FALSE);
 426                                 xmlFree(content);
 427                                 got_content = TRUE;
 428                         }
 429
 430                         /* link */
 431                         if( !xmlStrcmp(n->name, "link") ) {
 432                                 link_type = xmlGetProp(n, "type");
 433                                 link_rel = xmlGetProp(n, "rel");
 434                                 link_href = xmlGetProp(n, "href");
 435                                 tmp = xmlGetProp(n, "length");
 436                                 link_size = (tmp ? atoi(tmp) : 0);
 437                                 g_free(tmp);
 438
 439                                 if( !link_rel || (link_rel && !xmlStrcmp(link_rel, "alternate")) ) {
 440                                         fitem->link = link_href;
 441                                         debug_print("RSSyl: XML - Atom item link: '%s'\n", fitem->link);
 442                                         xmlFree(link_type);
 443                                         xmlFree(link_rel);
 444                                 } else if( link_rel && !xmlStrcmp(link_rel, "enclosure") ) {
 445                                         debug_print("RSSyl: XML - Atom item enclosure: '%s' (%ld) [%s]\n",
 446                                                         link_href, link_size, link_type);
 447                                         media = g_new(RSSylFeedItemMedia, 1);
 448                                         media->url = link_href;
 449                                         media->type = link_type;
 450                                         media->size = link_size;
 451                                         fitem->media = media;
 452                                         xmlFree(link_rel);
 453                                 } else {
 454                                         xmlFree(link_type);
 455                                         xmlFree(link_rel);
 456                                         xmlFree(link_href);
 457                                 }
 458                         }
 459
 460                         /* Date published - ISO8701 format */
 461                         if( !xmlStrcmp(n->name, "published") ) {
 462                                 content = xmlNodeGetContent(n);
 463                                 fitem->date_published = parseISO8601Date(content);
 464                                 xmlFree(content);
 465                                 debug_print("RSSyl: XML - Atom item 'issued' date found\n" );
 466                         }
 467
 468                         /* Date modified - ISO8701 format */
 469                         if( !xmlStrcmp(n->name, "updated") ) {
 470                                 content = xmlNodeGetContent(n);
 471                                 fitem->date = parseISO8601Date(content);
 472                                 xmlFree(content);
 473                                 debug_print("RSSyl: XML - Atom item 'updated' date found\n" );
 474                         }
 475
 476                         /* Author */
 477                         if( !xmlStrcmp(n->name, "author") ) {
 478                                 xmlNodePtr subnode;
 479                                 gchar *name = NULL, *mail = NULL;
 480                                 gchar *tmp;
 481                                 for (subnode = n->children; subnode; subnode = subnode->next) {
 482                                         content = xmlNodeGetContent(subnode);
 483                                         if (!xmlStrcmp(subnode->name, "name") && !name)
 484                                                 name = g_strdup(content);
 485                                         if (!xmlStrcmp(subnode->name, "email") && !mail)
 486                                                 mail = g_strdup(content);
 487                                         xmlFree(content);
 488                                 }
 489                                 tmp = g_strdup_printf("%s%s%s%s%s",
 490                                                         name ? name:"",
 491                                                         name && mail ? " <":(mail?"<":""),
 492                                                         mail ? mail:"",
 493                                                         mail ? ">":"",
 494                                                         !name && !mail ? "N/A":"");
 495                                 fitem->author = rssyl_format_string(tmp, TRUE, TRUE);
 496                                 g_free(tmp);
 497                                 g_free(name);
 498                                 g_free(mail);
 499                                 debug_print("RSSyl: XML - Atom item author: '%s'\n", fitem->author);
 500                         }
 501
 502                         /* Comments */
 503                         if( !xmlStrcmp(n->name, "commentRSS") || !xmlStrcmp(n->name, "commentRss")) {
 504                                 content = xmlNodeGetContent(n);
 505                                 fitem->comments_link = rssyl_format_string(content, FALSE, TRUE);
 506                                 xmlFree(content);
 507                                 debug_print("RSSyl: XML - comments_link: '%s'\n", fitem->comments_link);
 508                         }
 509                 } while( (n = n->next) != NULL);
 510
 511                 if( fitem->id && fitem->title && fitem->date ) {
 512
 513                         /* If no link is available, and we can safely guess ID
 514                          * might be a (perma)link, mark it so. */
 515                         if (!fitem->link && fitem->id   /* no url, but we have id */
 516                                         && (!strncmp(fitem->id, "http:", 5) /* id looks like an url */
 517                                                 || !strncmp(fitem->id, "https:", 6))) {
 518                                 if (!ritem->url || strcmp(ritem->url, fitem->id)) {
 519                                         /* id is different from feed url (good chance it is a permalink) */
 520                                         debug_print("RSSyl: Marking ID as permalink\n");
 521                                         fitem->id_is_permalink = TRUE;
 522                                 }
 523                         }
 524
 525                         if (rssyl_add_feed_item(ritem, fitem) == FALSE) {
 526                                 rssyl_free_feeditem(fitem);
 527                                 fitem = NULL;
 528                         }
 529                         count++;
 530                 } else
 531                         debug_print("RSSyl: Incomplete Atom entry, need at least 'id', 'title' and 'updated' tags\n");
 532         }
 533
 534         return count;
 535 }