Initial commit of litehtml_viewer
[claws.git] / src / plugins / litehtml_viewer / litehtml / utf8_strings.cpp
diff --git a/src/plugins/litehtml_viewer/litehtml/utf8_strings.cpp b/src/plugins/litehtml_viewer/litehtml/utf8_strings.cpp
new file mode 100644 (file)
index 0000000..fcb18c8
--- /dev/null
@@ -0,0 +1,97 @@
+#include "html.h"\r
+#include "utf8_strings.h"\r
+\r
+\r
+litehtml::utf8_to_wchar::utf8_to_wchar(const char* val)\r
+{\r
+       m_utf8 = (const byte*) val;\r
+       while (true)\r
+       {\r
+               ucode_t wch = get_char();\r
+               if (!wch) break;\r
+               m_str += wch;\r
+       }\r
+}\r
+\r
+litehtml::ucode_t litehtml::utf8_to_wchar::get_char()\r
+{\r
+       ucode_t b1 = getb();\r
+\r
+       if (!b1)\r
+       {\r
+               return 0;\r
+       }\r
+\r
+       // Determine whether we are dealing\r
+       // with a one-, two-, three-, or four-\r
+       // byte sequence.\r
+       if ((b1 & 0x80) == 0)\r
+       {\r
+               // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx\r
+               return b1;\r
+       }\r
+       else if ((b1 & 0xe0) == 0xc0)\r
+       {\r
+               // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx\r
+               ucode_t r = (b1 & 0x1f) << 6;\r
+               r |= get_next_utf8(getb());\r
+               return r;\r
+       }\r
+       else if ((b1 & 0xf0) == 0xe0)\r
+       {\r
+               // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx\r
+               ucode_t r = (b1 & 0x0f) << 12;\r
+               r |= get_next_utf8(getb()) << 6;\r
+               r |= get_next_utf8(getb());\r
+               return r;\r
+       }\r
+       else if ((b1 & 0xf8) == 0xf0)\r
+       {\r
+               // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx\r
+               //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx\r
+               // (uuuuu = wwww + 1)\r
+               int b2 = get_next_utf8(getb());\r
+               int b3 = get_next_utf8(getb());\r
+               int b4 = get_next_utf8(getb());\r
+               return ((b1 & 7) << 18) | ((b2 & 0x3f) << 12) |\r
+                       ((b3 & 0x3f) << 6) | (b4 & 0x3f);\r
+       }\r
+\r
+       //bad start for UTF-8 multi-byte sequence\r
+       return '?';\r
+}\r
+\r
+litehtml::wchar_to_utf8::wchar_to_utf8(const wchar_t* val)\r
+{\r
+       unsigned int code;\r
+       for (int i = 0; val[i]; i++)\r
+       {\r
+               code = val[i];\r
+               if (code <= 0x7F)\r
+               {\r
+                       m_str += (char)code;\r
+               }\r
+               else if (code <= 0x7FF)\r
+               {\r
+                       m_str += (code >> 6) + 192;\r
+                       m_str += (code & 63) + 128;\r
+               }\r
+               else if (0xd800 <= code && code <= 0xdfff)\r
+               {\r
+                       //invalid block of utf8\r
+               }\r
+               else if (code <= 0xFFFF)\r
+               {\r
+                       m_str += (code >> 12) + 224;\r
+                       m_str += ((code >> 6) & 63) + 128;\r
+                       m_str += (code & 63) + 128;\r
+               }\r
+               else if (code <= 0x10FFFF)\r
+               {\r
+                       m_str += (code >> 18) + 240;\r
+                       m_str += ((code >> 12) & 63) + 128;\r
+                       m_str += ((code >> 6) & 63) + 128;\r
+                       m_str += (code & 63) + 128;\r
+               }\r
+       }\r
+}\r