update copyright year
[claws.git] / src / plugins / rssyl / libfeed / parser_atom10.c
1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19
20 #include "config.h"
21
22 #include <glib.h>
23 #include <expat.h>
24 #include <string.h>
25 #include <stdio.h>
26
27 #include <procheader.h>
28
29 #include "feed.h"
30 #include "feeditem.h"
31 #include "date.h"
32 #include "parser.h"
33 #include "parser_atom10.h"
34
/*
 * Positions inside the Atom document that the element handlers keep track
 * of in the parser context's "location" field.
 *
 * Declared as a typedef: without it, the trailing identifier would define
 * a global *variable* of an anonymous enum type instead of naming the type.
 */
typedef enum {
	FEED_LOC_ATOM10_NONE,		/* not inside any element we care about */
	FEED_LOC_ATOM10_ENTRY,		/* inside <entry> */
	FEED_LOC_ATOM10_AUTHOR,		/* inside <author> (feed- or entry-level) */
	FEED_LOC_ATOM10_SOURCE,		/* inside <entry>/<source> */
	FEED_LOC_ATOM10_CONTENT		/* inside <entry>/<content> */
} FeedAtom10Locations;
42
43 void feed_parser_atom10_start(void *data, const gchar *el, const gchar **attr)
44 {
45         FeedParserCtx *ctx = (FeedParserCtx *)data;
46         gchar *a = NULL;
47
48         if( ctx->depth == 1 ) {
49
50                 if( !strcmp(el, "entry") ) {
51                         /* Start of new feed item found.
52                          * Create a new FeedItem, freeing the one we already have, if any. */
53                         if( ctx->curitem != NULL )
54                                 feed_item_free(ctx->curitem);
55                         ctx->curitem = feed_item_new(ctx->feed);
56                         ctx->location = FEED_LOC_ATOM10_ENTRY;
57                 } else if( !strcmp(el, "author") ) {
58                         /* Start of author info for the feed found.
59                          * Set correct location. */
60                         ctx->location = FEED_LOC_ATOM10_AUTHOR;
61                 } else if( !strcmp(el, "link") ) {
62                         if (!feed_parser_get_attribute_value(attr, "rel")) {
63                                 /* Link tag for the feed */
64                                 g_free(ctx->feed->link);
65                                 ctx->feed->link =
66                                         g_strdup(feed_parser_get_attribute_value(attr, "href"));
67                         }
68                 } else ctx->location = FEED_LOC_ATOM10_NONE;
69
70         } else if( ctx->depth == 2 ) {
71
72                 /* Make sure we are in one of known locations within the XML structure.
73                  * This condition should never be true on a valid Atom feed. */
74                 if (ctx->location != FEED_LOC_ATOM10_AUTHOR &&
75                                 ctx->location != FEED_LOC_ATOM10_ENTRY) {
76                         ctx->depth++;
77                         return;
78                 }
79
80                 if( !strcmp(el, "author") ) {
81                         /* Start of author info for current feed item.
82                          * Set correct location. */
83                         ctx->location = FEED_LOC_ATOM10_AUTHOR;
84                 } else if( !strcmp(el, "link") ) {
85                         /* Capture item URL, from the "url" XML attribute. */
86                         if (ctx->curitem && ctx->location == FEED_LOC_ATOM10_ENTRY)
87                                 ctx->curitem->url = g_strdup(feed_parser_get_attribute_value(attr, "href"));
88                 } else if( !strcmp(el, "source") ) {
89                         ctx->location = FEED_LOC_ATOM10_SOURCE;
90                 } else ctx->location = FEED_LOC_ATOM10_ENTRY;
91
92                 if( !strcmp(el, "title") && ctx->curitem != NULL) {
93                         a = feed_parser_get_attribute_value(attr, "type");
94                         if( !a || !strcmp(a, "text") )
95                                 ctx->curitem->title_format = FEED_ITEM_TITLE_TEXT;
96                         else if( !strcmp(a, "html") )
97                                 ctx->curitem->title_format = FEED_ITEM_TITLE_HTML;
98                         else if( !strcmp(a, "xhtml") )
99                                 ctx->curitem->title_format = FEED_ITEM_TITLE_XHTML;
100                         else
101                                 ctx->curitem->title_format = FEED_ITEM_TITLE_UNKNOWN;
102                 } else if (!strcmp(el, "content") && ctx->curitem != NULL) {
103                         ctx->location = FEED_LOC_ATOM10_CONTENT;
104                         a = feed_parser_get_attribute_value(attr, "type");
105                         if (a && !strcmp(a, "xhtml")) {
106                                 ctx->curitem->xhtml_content = TRUE;
107                                 ctx->xhtml_str = g_string_new(NULL);
108                         }
109                 }
110         } else if (ctx->depth >= 3) {
111                 if (ctx->location == FEED_LOC_ATOM10_CONTENT
112                                 && ctx->curitem != NULL
113                                 && ctx->curitem->xhtml_content) {
114                         guint i;
115                         GString *txt = ctx->xhtml_str;
116                         g_string_append_c(txt, '<');
117                         g_string_append(txt, el);
118
119                         for (i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2) {
120                                 g_string_append_printf(txt, " %s='%s'", attr[i], attr[i+1]);
121                         }
122                         g_string_append_c(txt, '>');
123                 }
124         }
125
126
127         ctx->depth++;
128 }
129
130 void feed_parser_atom10_end(void *data, const gchar *el)
131 {
132         FeedParserCtx *ctx = (FeedParserCtx *)data;
133         Feed *feed = ctx->feed;
134         gchar *text = NULL, *tmp;
135
136         if( ctx->str != NULL )
137                 text = g_strstrip(g_strdup(ctx->str->str));
138         else
139                 text = "";
140
141         switch( ctx->depth ) {
142
143                 case 0:
144                         /* Just in case. */
145                         break;
146
147                 case 1:
148
149                         if( !strcmp(el, "feed") ) {
150                                 /* We have finished parsing the feed, reverse the list
151                                  * so it's not upside down. */
152                                 feed->items = g_slist_reverse(ctx->feed->items);
153                         }
154
155                         break;
156
157                 case 2:
158
159                         /* decide if we just received </entry>, so we can
160                          * add a complete item to feed */
161                         if( !strcmp(el, "entry") ) {
162
163                                 /* Fix up URL, if it is relative */
164                                 if (ctx->curitem->url != NULL &&
165                                                 !strstr(ctx->curitem->url, "://") &&
166                                                 ctx->feed->link != NULL) {
167                                         tmp = g_strconcat(ctx->feed->link,
168                                                         (ctx->curitem->url[0] == '/' ? "" : "/"),
169                                                         ctx->curitem->url, NULL);
170                                         feed_item_set_url(ctx->curitem, tmp);
171                                         g_free(tmp);
172                                 }
173
174                                 /* append the complete feed item */
175                                 if( ctx->curitem->id && ctx->curitem->title
176                                                 && ctx->curitem->date_modified ) {
177                                         feed->items = 
178                                                 g_slist_prepend(feed->items, (gpointer)ctx->curitem);
179                                 }
180                                 
181                                 /* since it's in the linked list, lose this pointer */
182                                 ctx->curitem = NULL;
183
184                         } else if( !strcmp(el, "title") ) {     /* so it wasn't end of item */
185                                 FILL(feed->title)
186                         } else if( !strcmp(el, "summary" ) ) {
187                                 FILL(feed->description)
188                         } else if( !strcmp(el, "updated" ) ) {
189                                 feed->date = procheader_date_parse(NULL, text, 0);
190                         }
191                         /* FIXME: add more later */
192
193                         break;
194
195                 case 3:
196
197                         if( ctx->curitem == NULL )
198                                 break;
199
200                         switch(ctx->location) {
201
202                                 /* We're in feed/entry */
203                                 case FEED_LOC_ATOM10_ENTRY:
204                                         if( !strcmp(el, "title") ) {
205                                                 FILL(ctx->curitem->title)
206                                         } else if( !strcmp(el, "summary") ) {
207                                                 FILL(ctx->curitem->summary)
208                                         } else if( !strcmp(el, "id") ) {
209                                                 FILL(ctx->curitem->id)
210                                                 feed_item_set_id_permalink(ctx->curitem, TRUE);
211                                         } else if( !strcmp(el, "published") ) {
212                                                 ctx->curitem->date_published = procheader_date_parse(NULL, text, 0);
213                                         } else if( !strcmp(el, "updated") ) {
214                                                 ctx->curitem->date_modified = procheader_date_parse(NULL, text, 0);
215                                         }
216
217                                         break;
218
219                                 /* We're in feed/author or about to leave feed/entry/author */
220                                 case FEED_LOC_ATOM10_AUTHOR:
221                                         if( !strcmp(el, "author" ) ) {
222                                                 /* We just finished parsing <author> */
223                                                 ctx->curitem->author = g_strdup_printf("%s%s%s%s%s",
224                                                                 ctx->name ? ctx->name : "",
225                                                                 ctx->name && ctx->mail ? " <" : ctx->mail ? "<" : "",
226                                                                 ctx->mail ? ctx->mail : "",
227                                                                 ctx->mail ? ">" : "",
228                                                                 !ctx->name && !ctx->mail ? "N/A" : "");
229                                                 ctx->location = FEED_LOC_ATOM10_ENTRY;
230                                         } else if( !strcmp(el, "name") ) {
231                                                 FILL(feed->author)
232                                         }
233
234                                         break;
235
236                                 case FEED_LOC_ATOM10_CONTENT:
237                                         if( !strcmp(el, "content") ) {
238                                                 if (ctx->curitem->xhtml_content) {
239                                                         /* Just in case the <content> tag itself also has some
240                                                          * content of its own, not just the <div> it should,
241                                                          * let's append it to the end. */
242                                                         g_string_append(ctx->xhtml_str, text);
243                                                         ctx->curitem->text = g_string_free(ctx->xhtml_str, FALSE);
244                                                         ctx->xhtml_str = NULL;
245                                                 } else {
246                                                         FILL(ctx->curitem->text)
247                                                 }
248                                                 ctx->location = FEED_LOC_ATOM10_ENTRY;
249                                         }
250
251                                         break;
252                         }
253                         break;
254
255                 case 4:
256
257                         if( ctx->curitem == NULL )
258                                 break;
259
260                         switch(ctx->location) {
261
262                                 /* We're in feed/entry/author */
263                                 case FEED_LOC_ATOM10_AUTHOR:
264                                         if( !strcmp(el, "name") ) {
265                                                 FILL(ctx->name)
266                                         } else if( !strcmp(el, "email") ) {
267                                                 FILL(ctx->mail)
268                                         }
269
270                                         break;
271
272                                 /* We're in feed/entry/source */
273                                 case FEED_LOC_ATOM10_SOURCE:
274                                         if( !strcmp(el, "title" ) ) {
275                                                 FILL(ctx->curitem->sourcetitle)
276                                         } else if( !strcmp(el, "id" ) ) {
277                                                 FILL(ctx->curitem->sourceid)
278                                         } else if( !strcmp(el, "updated" ) ) {
279                                                 ctx->curitem->sourcedate = procheader_date_parse(NULL, text, 0);
280                                         }
281
282                                         break;
283
284                                 case FEED_LOC_ATOM10_CONTENT:
285                                         if (ctx->curitem->xhtml_content) {
286                                                 g_string_append(ctx->xhtml_str, text);
287                                                 g_string_append_printf(ctx->xhtml_str, "</%s>", el);
288                                         }
289                                         break;
290
291                                 }
292
293
294                         break;
295
296                 default:
297                         if (ctx->location == FEED_LOC_ATOM10_CONTENT
298                                         && ctx->curitem->xhtml_content) {
299                                 g_string_append(ctx->xhtml_str, text);
300                                 g_string_append_printf(ctx->xhtml_str, "</%s>", el);
301                         }
302                         break;
303         }
304
305         if( ctx->str != NULL ) {
306                 g_free(text);
307                 g_string_free(ctx->str, TRUE);
308                 ctx->str = NULL;
309         }
310         ctx->str = NULL;
311
312         ctx->depth--;
313 }