RSSyl: Do not run html entity replacement on URL and item id strings when adding...
[claws.git] / src / plugins / rssyl / libfeed / parser_atom10.c
1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19 #define __USE_GNU
20
21 #include <glib.h>
22 #include <expat.h>
23 #include <string.h>
24 #include <stdio.h>
25
26 #include "feed.h"
27 #include "feeditem.h"
28 #include "date.h"
29 #include "parser.h"
30 #include "parser_atom10.h"
31
32 void feed_parser_atom10_start(void *data, const gchar *el, const gchar **attr)
33 {
34         FeedParserCtx *ctx = (FeedParserCtx *)data;
35         gchar *a = NULL;
36
37         if( ctx->depth == 1 ) {
38
39                 if( !strcmp(el, "entry") ) {
40                         /* Start of new feed item found.
41                          * Create a new FeedItem, freeing the one we already have, if any. */
42                         if( ctx->curitem != NULL )
43                                 feed_item_free(ctx->curitem);
44                         ctx->curitem = feed_item_new(ctx->feed);
45                         ctx->location = FEED_LOC_ATOM10_ENTRY;
46                 } else if( !strcmp(el, "author") ) {
47                         /* Start of author info for the feed found.
48                          * Set correct location. */
49                         ctx->location = FEED_LOC_ATOM10_AUTHOR;
50                 } else if( !strcmp(el, "link") ) {
51                         /* Link tag for the feed */
52                         g_free(ctx->feed->link);
53                         ctx->feed->link = g_strdup(feed_parser_get_attribute_value(attr, "href"));
54                 } else ctx->location = FEED_LOC_ATOM10_NONE;
55
56         } else if( ctx->depth == 2 ) {
57
58                 /* Make sure we are in one of known locations within the XML structure.
59                  * This condition should never be true on a valid Atom feed. */
60                 if (ctx->location != FEED_LOC_ATOM10_AUTHOR &&
61                                 ctx->location != FEED_LOC_ATOM10_ENTRY) {
62                         ctx->depth++;
63                         return;
64                 }
65
66                 if( !strcmp(el, "author") ) {
67                         /* Start of author info for current feed item.
68                          * Set correct location. */
69                         ctx->location = FEED_LOC_ATOM10_AUTHOR;
70                 } else if( !strcmp(el, "link") ) {
71                         /* Capture item URL, from the "url" XML attribute. */
72                         if (ctx->curitem && ctx->location == FEED_LOC_ATOM10_ENTRY)
73                   ctx->curitem->url = g_strdup(feed_parser_get_attribute_value(attr, "href"));
74                 } else if( !strcmp(el, "source") ) {
75                         ctx->location = FEED_LOC_ATOM10_SOURCE;
76                 } else ctx->location = FEED_LOC_ATOM10_ENTRY;
77
78                 if( !strcmp(el, "title") ) {
79                         a = feed_parser_get_attribute_value(attr, "type");
80                         if( !a || !strcmp(a, "text") )
81                                 ctx->curitem->title_format = FEED_ITEM_TITLE_TEXT;
82                         else if( !strcmp(a, "html") )
83                                 ctx->curitem->title_format = FEED_ITEM_TITLE_HTML;
84                         else if( !strcmp(a, "xhtml") )
85                                 ctx->curitem->title_format = FEED_ITEM_TITLE_XHTML;
86                         else
87                                 ctx->curitem->title_format = FEED_ITEM_TITLE_UNKNOWN;
88                 } else if (!strcmp(el, "content") ) {
89                         a = feed_parser_get_attribute_value(attr, "type");
90                         if (a && !strcmp(a, "xhtml")) {
91                                 ctx->curitem->xhtml_content = TRUE;
92                                 ctx->location = FEED_LOC_ATOM10_CONTENT;
93                         }
94                 }
95         }
96
97         ctx->depth++;
98 }
99
100 void feed_parser_atom10_end(void *data, const gchar *el)
101 {
102         FeedParserCtx *ctx = (FeedParserCtx *)data;
103         Feed *feed = ctx->feed;
104         gchar *text = NULL, *tmp;
105
106         if( ctx->str != NULL )
107                 text = ctx->str->str;
108         else
109                 text = "";
110
111         switch( ctx->depth ) {
112
113                 case 0:
114                         /* Just in case. */
115                         break;
116
117                 case 1:
118
119                         if( !strcmp(el, "feed") ) {
120                                 /* We have finished parsing the feed, reverse the list
121                                  * so it's not upside down. */
122                                 feed->items = g_slist_reverse(ctx->feed->items);
123                         }
124
125                         break;
126
127                 case 2:
128
129                         /* decide if we just received </entry>, so we can
130                          * add a complete item to feed */
131                         if( !strcmp(el, "entry") ) {
132
133                                 /* Fix up URL, if it is relative */
134                                 if (ctx->curitem->url != NULL &&
135                                                 !strstr("://", ctx->curitem->url) &&
136                                                 ctx->feed->link != NULL) {
137                                         tmp = g_strconcat(ctx->feed->link,
138                                                         (ctx->curitem->url[0] == '/' ? "" : "/"),
139                                                         ctx->curitem->url, NULL);
140                                         feed_item_set_url(ctx->curitem, tmp);
141                                         g_free(tmp);
142                                 }
143
144                                 /* append the complete feed item */
145                                 if( ctx->curitem->id && ctx->curitem->title
146                                                 && ctx->curitem->date_modified ) {
147                                         feed->items = 
148                                                 g_slist_prepend(feed->items, (gpointer)ctx->curitem);
149                                 }
150                                 
151                                 /* since it's in the linked list, lose this pointer */
152                                 ctx->curitem = NULL;
153
154                         } else if( !strcmp(el, "title") ) {     /* so it wasn't end of item */
155                                 FILL(feed->title)
156                         } else if( !strcmp(el, "summary" ) ) {
157                                 FILL(feed->description)
158                         } else if( !strcmp(el, "updated" ) ) {
159                                 feed->date = parseISO8601Date(text);
160                         }
161                         /* FIXME: add more later */
162
163                         break;
164
165                 case 3:
166
167                         if( ctx->curitem == NULL )
168                                 break;
169
170                         switch(ctx->location) {
171
172                                 /* We're in feed/entry */
173                                 case FEED_LOC_ATOM10_ENTRY:
174                                         if( !strcmp(el, "title") ) {
175                                                 FILL(ctx->curitem->title)
176                                         } else if( !strcmp(el, "summary") ) {
177                                                 FILL(ctx->curitem->summary)
178                                         } else if( !strcmp(el, "content") ) {
179                                                 if (!ctx->curitem->xhtml_content)
180                                                         FILL(ctx->curitem->text)
181                                         } else if( !strcmp(el, "id") ) {
182                                                 FILL(ctx->curitem->id)
183                                                 feed_item_set_id_permalink(ctx->curitem, TRUE);
184                                         } else if( !strcmp(el, "published") ) {
185                                                 ctx->curitem->date_published = parseISO8601Date(text);
186                                         } else if( !strcmp(el, "updated") ) {
187                                                 ctx->curitem->date_modified = parseISO8601Date(text);
188                                         }
189
190                                         break;
191
192                                 /* We're in feed/author or about to leave feed/entry/author */
193                                 case FEED_LOC_ATOM10_AUTHOR:
194                                         if( !strcmp(el, "author" ) ) {
195                                                 /* We just finished parsing <author> */
196                                                 ctx->curitem->author = g_strdup_printf("%s%s%s%s%s",
197                                                                 ctx->name ? ctx->name : "",
198                                                                 ctx->name && ctx->mail ? " <" : ctx->mail ? "<" : "",
199                                                                 ctx->mail ? ctx->mail : "",
200                                                                 ctx->mail ? ">" : "",
201                                                                 !ctx->name && !ctx->mail ? "N/A" : "");
202                                                 ctx->location = FEED_LOC_ATOM10_ENTRY;
203                                         } else if( !strcmp(el, "name") ) {
204                                                 FILL(feed->author)
205                                         }
206
207                                         break;
208                         }
209
210                         break;
211
212                 case 4:
213
214                         if( ctx->curitem == NULL )
215                                 break;
216
217                         switch(ctx->location) {
218
219                                 /* We're in feed/entry/author */
220                                 case FEED_LOC_ATOM10_AUTHOR:
221                                         if( !strcmp(el, "name") ) {
222                                                 FILL(ctx->name)
223                                         } else if( !strcmp(el, "email") ) {
224                                                 FILL(ctx->mail)
225                                         }
226
227                                         break;
228
229                                 /* We're in feed/entry/source */
230                                 case FEED_LOC_ATOM10_SOURCE:
231                                         if( !strcmp(el, "title" ) ) {
232                                                 FILL(ctx->curitem->sourcetitle)
233                                         } else if( !strcmp(el, "id" ) ) {
234                                                 FILL(ctx->curitem->sourceid)
235                                         } else if( !strcmp(el, "updated" ) ) {
236                                                 ctx->curitem->sourcedate = parseISO8601Date(text);
237                                         }
238
239                                         break;
240
241                                 case FEED_LOC_ATOM10_CONTENT:
242                                         if (!strcmp(el, "div") && ctx->curitem->xhtml_content)
243                                                 FILL(ctx->curitem->text)
244                                         break;
245
246                                 }
247
248
249                         break;
250         }
251
252         if( ctx->str != NULL ) {
253                 g_string_free(ctx->str, TRUE);
254                 ctx->str = NULL;
255         }
256         ctx->str = NULL;
257
258         ctx->depth--;
259 }