--- /dev/null
+#include "html.h"\r
+#include "utf8_strings.h"\r
+\r
+\r
+// Points the decoder at `val` and appends every value produced by
+// get_char() to m_str, stopping at the first 0 that get_char() returns.
+litehtml::utf8_to_wchar::utf8_to_wchar(const char* val)
+{
+    m_utf8 = (const byte*) val;
+
+    for (ucode_t decoded = get_char(); decoded; decoded = get_char())
+    {
+        m_str += decoded;
+    }
+}
+\r
+// Decodes one UTF-8 sequence, pulling bytes through getb(), and returns the
+// resulting code point.
+//
+// Returns 0 when the first byte obtained from getb() is 0, and '?' when the
+// first byte matches none of the 1- to 4-byte lead-byte patterns.
+// Continuation bytes are passed through get_next_utf8(), which is defined
+// elsewhere.
+// NOTE(review): nothing visible here verifies that continuation bytes have
+// the 10xxxxxx form, that a sequence is not cut short, or that an encoding
+// is not overlong - confirm against getb()/get_next_utf8().
+litehtml::ucode_t litehtml::utf8_to_wchar::get_char()
+{
+    const ucode_t lead = getb();
+
+    if (lead == 0)
+    {
+        return 0;
+    }
+
+    // The high bits of the lead byte select the sequence length.
+
+    // 1 byte: 0xxxxxxx -> 0xxxxxxx
+    if ((lead & 0x80) == 0)
+    {
+        return lead;
+    }
+
+    // 2 bytes: 110yyyyy 10xxxxxx -> yyyyyxxxxxx
+    if ((lead & 0xe0) == 0xc0)
+    {
+        ucode_t code = (lead & 0x1f) << 6;
+        code |= get_next_utf8(getb());
+        return code;
+    }
+
+    // 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx -> zzzzyyyyyyxxxxxx
+    if ((lead & 0xf0) == 0xe0)
+    {
+        ucode_t code = (lead & 0x0f) << 12;
+        code |= get_next_utf8(getb()) << 6;
+        code |= get_next_utf8(getb());
+        return code;
+    }
+
+    // 4 bytes: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
+    //          -> uuuuuzzzzyyyyyyxxxxxx (21 bits)
+    if ((lead & 0xf8) == 0xf0)
+    {
+        // The three continuation bytes are read strictly in stream order.
+        int second = get_next_utf8(getb());
+        int third = get_next_utf8(getb());
+        int fourth = get_next_utf8(getb());
+        return ((lead & 7) << 18) | ((second & 0x3f) << 12) |
+               ((third & 0x3f) << 6) | (fourth & 0x3f);
+    }
+
+    // Not a valid lead byte for any UTF-8 sequence.
+    return '?';
+}
+\r
+// Encodes the zero-terminated wide string `val` as UTF-8, appending the
+// bytes to m_str.
+//
+// Each element is normally taken as one code point. A UTF-16 high surrogate
+// (0xD800-0xDBFF) immediately followed by a low surrogate (0xDC00-0xDFFF) is
+// combined into the supplementary code point the pair encodes; without this,
+// every character above U+FFFF would be lost where wchar_t holds UTF-16
+// code units.
+//
+// Unpaired surrogates and values above 0x10FFFF produce no output.
+// A null `val` appends nothing.
+litehtml::wchar_to_utf8::wchar_to_utf8(const wchar_t* val)
+{
+    if (!val)
+    {
+        return;
+    }
+
+    for (const wchar_t* p = val; *p; ++p)
+    {
+        unsigned int code = *p;
+
+        if (0xd800 <= code && code <= 0xdbff)
+        {
+            // *p is non-zero, so p[1] is at worst the terminator and is
+            // always safe to read.
+            const unsigned int low = p[1];
+            if (0xdc00 <= low && low <= 0xdfff)
+            {
+                code = 0x10000 + ((code - 0xd800) << 10) + (low - 0xdc00);
+                ++p; // the low surrogate has been consumed as well
+            }
+        }
+
+        if (code <= 0x7F)
+        {
+            // 1 byte: 0xxxxxxx
+            m_str += static_cast<char>(code);
+        }
+        else if (code <= 0x7FF)
+        {
+            // 2 bytes: 110yyyyy 10xxxxxx
+            m_str += static_cast<char>((code >> 6) + 192);
+            m_str += static_cast<char>((code & 63) + 128);
+        }
+        else if (0xd800 <= code && code <= 0xdfff)
+        {
+            // Unpaired surrogate: not a valid code point, emit nothing.
+        }
+        else if (code <= 0xFFFF)
+        {
+            // 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
+            m_str += static_cast<char>((code >> 12) + 224);
+            m_str += static_cast<char>(((code >> 6) & 63) + 128);
+            m_str += static_cast<char>((code & 63) + 128);
+        }
+        else if (code <= 0x10FFFF)
+        {
+            // 4 bytes: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
+            m_str += static_cast<char>((code >> 18) + 240);
+            m_str += static_cast<char>(((code >> 12) & 63) + 128);
+            m_str += static_cast<char>(((code >> 6) & 63) + 128);
+            m_str += static_cast<char>((code & 63) + 128);
+        }
+        // Anything above 0x10FFFF is outside Unicode and is dropped.
+    }
+}