while (*parser->bufp != '\0') {
switch (*parser->bufp) {
- case '<':
- if (parser->str->len == 0)
- html_parse_tag(parser);
- else
+ case '<': {
+ HTMLState st;
+ st = html_parse_tag(parser);
+ /* When html_parse_tag() reports an href state (anchor
+ * start, HTML_HREF_BEG, or anchor end, HTML_HREF), flush
+ * the str buffer by returning it; the chars up to the
+ * end anchor tag are then collected on their own. */
+ if (HTML_HREF_BEG == st || HTML_HREF == st)
return parser->str->str;
+ }
break;
case '&':
html_parse_special(parser);
{
HTMLTag *tag;
gchar *tmp;
- gchar *tmpp;
+ guchar *tmpp;
g_return_val_if_fail(str != NULL, NULL);
g_free(parser->href);
parser->href =
g_strdup(((HTMLAttr *)tag->attr->data)->value);
- parser->state = HTML_HREF;
+ parser->state = HTML_HREF_BEG;
}
} else if (!strcmp(tag->name, "/a")) {
- g_free(parser->href);
- parser->href = NULL;
- parser->state = HTML_NORMAL;
+ parser->state = HTML_HREF;
} else if (!strcmp(tag->name, "p")) {
parser->space = FALSE;
if (!parser->empty_line) {
!strcmp(tag->name, "li") ||
!strcmp(tag->name, "table") ||
!strcmp(tag->name, "tr") ||
- (tag->name[0] == 'h' && isdigit(tag->name[1]))) {
+ (tag->name[0] == 'h' && isdigit((guchar)tag->name[1]))) {
if (!parser->newline) {
parser->space = FALSE;
html_append_char(parser, '\n');
} else if (!strcmp(tag->name, "/table") ||
(tag->name[0] == '/' &&
tag->name[1] == 'h' &&
- isdigit(tag->name[1]))) {
+ isdigit((guchar)tag->name[1]))) {
if (!parser->empty_line) {
parser->space = FALSE;
if (!parser->newline) html_append_char(parser, '\n');
html_append_str(parser, val, -1);
parser->state = HTML_NORMAL;
return;
- } else if (symbol_name[1] == '#' && isdigit(symbol_name[2])) {
+ } else if (symbol_name[1] == '#' && isdigit((guchar)symbol_name[2])) {
gint ch;
ch = atoi(symbol_name + 2);
html_append_char(parser, ch);
parser->state = HTML_NORMAL;
return;
+ } else {
+ char *symb = NULL;
+ switch (ch) {
+ /* http://www.w3schools.com/html/html_entitiesref.asp */
+ case 338: /* capital ligature OE Œ */
+ symb = "OE";
+ break;
+ case 339: /* small ligature OE œ */
+ symb = "oe";
+ break;
+ case 352: /* capital S w/caron Š */
+ case 353: /* small S w/caron š */
+ case 376: /* cap Y w/ diaeres Ÿ */
+ break;
+ case 710: /* circumflex accent ˆ */
+ symb = "^";
+ break;
+ case 732: /* small tilde ˜ */
+ symb = "~";
+ break;
+ case 8194: /* en space   */
+ case 8195: /* em space   */
+ case 8201: /* thin space   */
+ symb = " ";
+ break;
+ case 8204: /* zero width non-joiner ‌ */
+ case 8205: /* zero width joiner ‍ */
+ case 8206: /* l-t-r mark ‎ */
+ case 8207: /* r-t-l mark &rlm */
+ break;
+ case 8211: /* en dash – */
+ symb = "-";
+ break;
+ case 8212: /* em dash — */
+ symb = "--";
+ break;
+ case 8216: /* l single quot mark ‘ */
+ symb = "`";
+ break;
+ case 8217: /* r single quot mark ’ */
+ symb = "'";
+ break;
+ case 8218: /* single low-9 quot ‚ */
+ symb = ",";
+ break;
+ case 8220: /* l double quot mark “ */
+ symb = "``";
+ break;
+ case 8221: /* r double quot mark ” */
+ symb = "''";
+ break;
+ case 8222: /* double low-9 quot „ */
+ symb = ",,";
+ break;
+ case 8224: /* dagger † */
+ case 8225: /* double dagger ‡ */
+ break;
+ case 8230: /* horizontal ellipsis … */
+ symb = "...";
+ break;
+ case 8240: /* per mile ‰ */
+ symb = "\%o";
+ break;
+ case 8249: /* l-pointing angle quot ‹ */
+ symb = "<";
+ break;
+ case 8250: /* r-pointing angle quot › */
+ symb = ">";
+ break;
+ case 8364: /* euro € */
+ symb = "EUR";
+ break;
+ case 8482: /* trademark ™ */
+ symb = "(TM)";
+ break;
+ default:
+ break;
+ }
+ if (symb) {
+ html_append_str(parser, symb, -1);
+ parser->state = HTML_NORMAL;
+ return;
+ }
}
}