RSSyl: Handle XHTML content correctly for Atom feeds.
[claws.git] / src / plugins / rssyl / libfeed / parser_atom10.c
index 9cf4ffa8ce5a5a46ef26506e80c4e1d6e58ed53b..0ede945a19a7b368df89ae1e38ff613d100f4115 100644 (file)
 #include <string.h>
 #include <stdio.h>
 
+#include <procheader.h>
+
 #include "feed.h"
 #include "feeditem.h"
 #include "date.h"
 #include "parser.h"
 #include "parser_atom10.h"
 
+enum { /* Values assigned to ctx->location by the Atom 1.0 element handlers.
+        * NOTE(review): without `typedef` this declares a variable named
+        * FeedAtom10Locations of an anonymous enum type, not a type name;
+        * confirm whether a typedef was intended. */
+       FEED_LOC_ATOM10_NONE,
+       FEED_LOC_ATOM10_ENTRY,   /* fallback set by the start handler; also restored when <content> closes */
+       FEED_LOC_ATOM10_AUTHOR,
+       FEED_LOC_ATOM10_SOURCE,  /* set when a <source> element starts */
+       FEED_LOC_ATOM10_CONTENT  /* set when a <content> element starts while an item is being parsed */
+} FeedAtom10Locations;
+
 void feed_parser_atom10_start(void *data, const gchar *el, const gchar **attr)
 {
        FeedParserCtx *ctx = (FeedParserCtx *)data;
@@ -73,12 +83,12 @@ void feed_parser_atom10_start(void *data, const gchar *el, const gchar **attr)
                } else if( !strcmp(el, "link") ) {
                        /* Capture item URL, from the "href" XML attribute. */
                        if (ctx->curitem && ctx->location == FEED_LOC_ATOM10_ENTRY)
-                 ctx->curitem->url = g_strdup(feed_parser_get_attribute_value(attr, "href"));
+                               ctx->curitem->url = g_strdup(feed_parser_get_attribute_value(attr, "href"));
                } else if( !strcmp(el, "source") ) {
                        ctx->location = FEED_LOC_ATOM10_SOURCE;
                } else ctx->location = FEED_LOC_ATOM10_ENTRY;
 
-               if( !strcmp(el, "title") ) {
+               if( !strcmp(el, "title") && ctx->curitem != NULL) {
                        a = feed_parser_get_attribute_value(attr, "type");
                        if( !a || !strcmp(a, "text") )
                                ctx->curitem->title_format = FEED_ITEM_TITLE_TEXT;
@@ -88,15 +98,30 @@ void feed_parser_atom10_start(void *data, const gchar *el, const gchar **attr)
                                ctx->curitem->title_format = FEED_ITEM_TITLE_XHTML;
                        else
                                ctx->curitem->title_format = FEED_ITEM_TITLE_UNKNOWN;
-               } else if (!strcmp(el, "content") ) {
+               } else if (!strcmp(el, "content") && ctx->curitem != NULL) {
+                       ctx->location = FEED_LOC_ATOM10_CONTENT;
                        a = feed_parser_get_attribute_value(attr, "type");
                        if (a && !strcmp(a, "xhtml")) {
                                ctx->curitem->xhtml_content = TRUE;
-                               ctx->location = FEED_LOC_ATOM10_CONTENT;
+                               ctx->xhtml_str = g_string_new(NULL);
                        }
                }
+       } else if (ctx->depth >= 3) {
+               if (ctx->curitem->xhtml_content
+                               && ctx->location == FEED_LOC_ATOM10_CONTENT) {
+                       guint i;
+                       GString *txt = ctx->xhtml_str;
+                       g_string_append_c(txt, '<');
+                       g_string_append(txt, el);
+
+                       for (i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2) {
+                               g_string_append_printf(txt, " %s='%s'", attr[i], attr[i+1]);
+                       }
+                       g_string_append_c(txt, '>');
+               }
        }
 
+
        ctx->depth++;
 }
 
@@ -159,7 +184,7 @@ void feed_parser_atom10_end(void *data, const gchar *el)
                        } else if( !strcmp(el, "summary" ) ) {
                                FILL(feed->description)
                        } else if( !strcmp(el, "updated" ) ) {
-                               feed->date = parseISO8601Date(text);
+                               feed->date = procheader_date_parse(NULL, text, 0);
                        }
                        /* FIXME: add more later */
 
@@ -178,16 +203,13 @@ void feed_parser_atom10_end(void *data, const gchar *el)
                                                FILL(ctx->curitem->title)
                                        } else if( !strcmp(el, "summary") ) {
                                                FILL(ctx->curitem->summary)
-                                       } else if( !strcmp(el, "content") ) {
-                                               if (!ctx->curitem->xhtml_content)
-                                                       FILL(ctx->curitem->text)
                                        } else if( !strcmp(el, "id") ) {
                                                FILL(ctx->curitem->id)
                                                feed_item_set_id_permalink(ctx->curitem, TRUE);
                                        } else if( !strcmp(el, "published") ) {
-                                               ctx->curitem->date_published = parseISO8601Date(text);
+                                               ctx->curitem->date_published = procheader_date_parse(NULL, text, 0);
                                        } else if( !strcmp(el, "updated") ) {
-                                               ctx->curitem->date_modified = parseISO8601Date(text);
+                                               ctx->curitem->date_modified = procheader_date_parse(NULL, text, 0);
                                        }
 
                                        break;
@@ -208,8 +230,24 @@ void feed_parser_atom10_end(void *data, const gchar *el)
                                        }
 
                                        break;
-                       }
 
+                               case FEED_LOC_ATOM10_CONTENT:
+                                       if( !strcmp(el, "content") ) {
+                                               if (ctx->curitem->xhtml_content) {
+                                                       /* Just in case the <content> tag itself also has some
+                                                        * content of its own, not just the <div> it should,
+                                                        * let's append it to the end. */
+                                                       g_string_append(ctx->xhtml_str, text);
+                                                       ctx->curitem->text = g_string_free(ctx->xhtml_str, FALSE);
+                                                       ctx->xhtml_str = NULL;
+                                               } else {
+                                                       FILL(ctx->curitem->text)
+                                               }
+                                               ctx->location = FEED_LOC_ATOM10_ENTRY;
+                                       }
+
+                                       break;
+                       }
                        break;
 
                case 4:
@@ -236,20 +274,30 @@ void feed_parser_atom10_end(void *data, const gchar *el)
                                        } else if( !strcmp(el, "id" ) ) {
                                                FILL(ctx->curitem->sourceid)
                                        } else if( !strcmp(el, "updated" ) ) {
-                                               ctx->curitem->sourcedate = parseISO8601Date(text);
+                                               ctx->curitem->sourcedate = procheader_date_parse(NULL, text, 0);
                                        }
 
                                        break;
 
                                case FEED_LOC_ATOM10_CONTENT:
-                                       if (!strcmp(el, "div") && ctx->curitem->xhtml_content)
-                                               FILL(ctx->curitem->text)
+                                       if (ctx->curitem->xhtml_content) {
+                                               g_string_append(ctx->xhtml_str, text);
+                                               g_string_append_printf(ctx->xhtml_str, "</%s>", el);
+                                       }
                                        break;
 
                                }
 
 
                        break;
+
+               default:
+                       if (ctx->location == FEED_LOC_ATOM10_CONTENT
+                                       && ctx->curitem->xhtml_content) {
+                               g_string_append(ctx->xhtml_str, text);
+                               g_string_append_printf(ctx->xhtml_str, "</%s>", el);
+                       }
+                       break;
        }
 
        if( ctx->str != NULL ) {