while (*parser->bufp != '\0') {
switch (*parser->bufp) {
- case '<':
- if (parser->str->len == 0)
- html_parse_tag(parser);
- else
- return parser->str->str;
- break;
+ case '<': {
+ HTMLState st;
+ st = html_parse_tag(parser);
+ /* when we see an href, we need to flush the str
+ * buffer. Then collect all the chars until we
+ * see the end anchor tag
+ */
+ if (HTML_HREF_BEG == st || HTML_HREF == st)
+ return parser->str->str;
+ } break;
case '&':
html_parse_special(parser);
break;
g_free(parser->href);
parser->href =
g_strdup(((HTMLAttr *)tag->attr->data)->value);
- parser->state = HTML_HREF;
+ parser->state = HTML_HREF_BEG;
}
} else if (!strcmp(tag->name, "/a")) {
- g_free(parser->href);
- parser->href = NULL;
- parser->state = HTML_NORMAL;
+ parser->state = HTML_HREF;
} else if (!strcmp(tag->name, "p")) {
parser->space = FALSE;
if (!parser->empty_line) {
html_append_char(parser, ch);
parser->state = HTML_NORMAL;
return;
+ } else {
+ char *symb = NULL;
+ switch (ch) {
+ /* http://www.w3schools.com/html/html_entitiesref.asp */
+ case 338: /* capital ligature OE Œ */
+ symb = "OE"; break;
+ case 339: /* small ligature OE œ */
+ symb = "oe"; break;
+ case 352: /* capital S w/caron Š */
+ case 353: /* small S w/caron š */
+ case 376: /* cap Y w/ diaeres Ÿ */
+ break;
+ case 710: /* circumflex accent ˆ */
+ symb = "^"; break;
+ case 732: /* small tilde ˜ */
+ symb = "~"; break;
+ case 8194: /* en space   */
+ case 8195: /* em space   */
+ case 8201: /* thin space   */
+ symb = " "; break;
+ case 8204: /* zero width non-joiner ‌ */
+ case 8205: /* zero width joiner ‍ */
+ case 8206: /* l-t-r mark ‎ */
+ case 8207: /* r-t-l mark &rlm; */
+ break;
+ case 8211: /* en dash – */
+ symb = "-"; break;
+ case 8212: /* em dash — */
+ symb = "--"; break;
+ case 8216: /* l single quot mark ‘ */
+ symb = "`"; break;
+ case 8217: /* r single quot mark ’ */
+ symb = "'"; break;
+ case 8218: /* single low-9 quot ‚ */
+ symb = ","; break;
+ case 8220: /* l double quot mark “ */
+ symb = "``"; break;
+ case 8221: /* r double quot mark ” */
+ symb = "''"; break;
+ case 8222: /* double low-9 quot „ */
+ symb = ",,"; break;
+ case 8224: /* dagger † */
+ case 8225: /* double dagger ‡ */
+ break;
+ case 8230: /* horizontal ellipsis … */
+ symb = "..."; break;
+ case 8240: /* per mille ‰ */
+ symb = "\%o"; break;
+ case 8249: /* l-pointing angle quot ‹ */
+ symb = "<"; break;
+ case 8250: /* r-pointing angle quot › */
+ symb = ">"; break;
+ case 8364: /* euro € */
+ symb = "&euro"; break;
+ case 8482: /* trademark ™ */
+ symb = "(TM)"; break;
+ default: break;
+ }
+ if (symb) {
+ html_append_str(parser, symb, -1);
+ parser->state = HTML_NORMAL;
+ return;
+ }
}
}