sync with 0.8.6cvs22
[claws.git] / src / codeconv.c
1 /*
2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 1999-2002 Hiroyuki Yamamoto
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18  */
19
20 #ifdef HAVE_CONFIG_H
21 #  include "config.h"
22 #endif
23
24 #include <glib.h>
25 #include <string.h>
26 #include <ctype.h>
27 #include <stdlib.h>
28
29 #if (HAVE_WCTYPE_H && HAVE_WCHAR_H)
30 #  include <wchar.h>
31 #  include <wctype.h>
32 #endif
33
34 #if HAVE_LOCALE_H
35 #  include <locale.h>
36 #endif
37
38 #if HAVE_LIBJCONV
39 #  include <jconv.h>
40 #endif
41
42 #include "intl.h"
43 #include "codeconv.h"
44 #include "unmime.h"
45 #include "base64.h"
46 #include "quoted-printable.h"
47 #include "utils.h"
48 #include "prefs_common.h"
49
/*
 * Shift state of an ISO-2022-JP byte stream.  The converters in this file
 * track which character set the most recent escape sequence selected.
 */
typedef enum {
        JIS_ASCII    = 0,       /* single-byte text (ESC ( B) */
        JIS_KANJI    = 1,       /* double-byte kanji (ESC $ B) */
        JIS_HWKANA   = 2,       /* half-width kana (ESC ( I) */
        JIS_AUXKANJI = 3        /* auxiliary kanji (ESC $ ( D) */
} JISState;
57
/* Character written in place of a byte that cannot be represented. */
#define SUBST_CHAR      '_'
/* ASCII escape, the first byte of every ISO-2022-JP shift sequence. */
#define ESC             '\033'

/* Low eight bits of c, so the tests below work for signed and unsigned char. */
#define CONV_BYTE(c)    ((c) & 0xff)
/* True when the low byte of c lies in [lo, hi].  Evaluates c twice. */
#define CONV_BYTE_IN(c, lo, hi) \
        (CONV_BYTE(c) >= (lo) && CONV_BYTE(c) <= (hi))

/* EUC-JP: byte of a double-byte kanji */
#define iseuckanji(c) \
        (CONV_BYTE_IN(c, 0xa1, 0xfe))
/* EUC-JP: prefix byte announcing a half-width kana */
#define iseuchwkana1(c) \
        (CONV_BYTE(c) == 0x8e)
/* EUC-JP: half-width kana byte following the 0x8e prefix */
#define iseuchwkana2(c) \
        (CONV_BYTE_IN(c, 0xa1, 0xdf))
/* EUC-JP: prefix byte announcing an auxiliary kanji */
#define iseucaux(c) \
        (CONV_BYTE(c) == 0x8f)
/* EUC-JP: lead byte treated as unprintable by conv_unreadable_eucjp() */
#define isunprintableeuckanji(c) \
        (CONV_BYTE_IN(c, 0xa9, 0xaf))
/* Shift_JIS: first byte of a double-byte character */
#define issjiskanji1(c) \
        (CONV_BYTE_IN(c, 0x81, 0x9f) || CONV_BYTE_IN(c, 0xe0, 0xfc))
/* Shift_JIS: second byte of a double-byte character */
#define issjiskanji2(c) \
        (CONV_BYTE_IN(c, 0x40, 0x7e) || CONV_BYTE_IN(c, 0x80, 0xfc))
/* Shift_JIS: single-byte half-width kana */
#define issjishwkana(c) \
        (CONV_BYTE_IN(c, 0xa1, 0xdf))
79
/*
 * Shift helpers for the ISO-2022-JP encoder.  Each macro expects two
 * variables in the calling scope: `out', the write pointer into the output
 * buffer, and `state', the current JISState.  When the requested state is
 * not already active the escape sequence is appended through `out' and
 * `state' is updated; otherwise nothing is written.
 *
 * The bodies are wrapped in do { } while (0) so that each macro is exactly
 * one statement and can be used safely inside an unbraced if/else.
 */

/* Select double-byte kanji: ESC $ B (3 bytes). */
#define K_IN()                                  \
        do {                                    \
                if (state != JIS_KANJI) {       \
                        *out++ = ESC;           \
                        *out++ = '$';           \
                        *out++ = 'B';           \
                        state = JIS_KANJI;      \
                }                               \
        } while (0)

/* Return to ASCII: ESC ( B (3 bytes). */
#define K_OUT()                                 \
        do {                                    \
                if (state != JIS_ASCII) {       \
                        *out++ = ESC;           \
                        *out++ = '(';           \
                        *out++ = 'B';           \
                        state = JIS_ASCII;      \
                }                               \
        } while (0)

/* Select half-width kana: ESC ( I (3 bytes). */
#define HW_IN()                                 \
        do {                                    \
                if (state != JIS_HWKANA) {      \
                        *out++ = ESC;           \
                        *out++ = '(';           \
                        *out++ = 'I';           \
                        state = JIS_HWKANA;     \
                }                               \
        } while (0)

/* Select auxiliary kanji: ESC $ ( D (4 bytes). */
#define AUX_IN()                                \
        do {                                    \
                if (state != JIS_AUXKANJI) {    \
                        *out++ = ESC;           \
                        *out++ = '$';           \
                        *out++ = '(';           \
                        *out++ = 'D';           \
                        state = JIS_AUXKANJI;   \
                }                               \
        } while (0)
112
113 void conv_jistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
114 {
115         const guchar *in = inbuf;
116         guchar *out = outbuf;
117         JISState state = JIS_ASCII;
118
119         while (*in != '\0') {
120                 if (*in == ESC) {
121                         in++;
122                         if (*in == '$') {
123                                 if (*(in + 1) == '@' || *(in + 1) == 'B') {
124                                         state = JIS_KANJI;
125                                         in += 2;
126                                 } else if (*(in + 1) == '(' &&
127                                            *(in + 2) == 'D') {
128                                         state = JIS_AUXKANJI;
129                                         in += 3;
130                                 } else {
131                                         /* unknown escape sequence */
132                                         state = JIS_ASCII;
133                                 }
134                         } else if (*in == '(') {
135                                 if (*(in + 1) == 'B' || *(in + 1) == 'J') {
136                                         state = JIS_ASCII;
137                                         in += 2;
138                                 } else if (*(in + 1) == 'I') {
139                                         state = JIS_HWKANA;
140                                         in += 2;
141                                 } else {
142                                         /* unknown escape sequence */
143                                         state = JIS_ASCII;
144                                 }
145                         } else {
146                                 /* unknown escape sequence */
147                                 state = JIS_ASCII;
148                         }
149                 } else if (*in == 0x0e) {
150                         state = JIS_HWKANA;
151                         in++;
152                 } else if (*in == 0x0f) {
153                         state = JIS_ASCII;
154                         in++;
155                 } else {
156                         switch (state) {
157                         case JIS_ASCII:
158                                 *out++ = *in++;
159                                 break;
160                         case JIS_KANJI:
161                                 *out++ = *in++ | 0x80;
162                                 if (*in == '\0') break;
163                                 *out++ = *in++ | 0x80;
164                                 break;
165                         case JIS_HWKANA:
166                                 *out++ = 0x8e;
167                                 *out++ = *in++ | 0x80;
168                                 break;
169                         case JIS_AUXKANJI:
170                                 *out++ = 0x8f;
171                                 *out++ = *in++ | 0x80;
172                                 if (*in == '\0') break;
173                                 *out++ = *in++ | 0x80;
174                                 break;
175                         }
176                 }
177         }
178
179         *out = '\0';
180 }
181
182 void conv_euctojis(gchar *outbuf, gint outlen, const gchar *inbuf)
183 {
184         const guchar *in = inbuf;
185         guchar *out = outbuf;
186         JISState state = JIS_ASCII;
187
188         while (*in != '\0') {
189                 if (isascii(*in)) {
190                         K_OUT();
191                         *out++ = *in++;
192                 } else if (iseuckanji(*in)) {
193                         if (iseuckanji(*(in + 1))) {
194                                 K_IN();
195                                 *out++ = *in++ & 0x7f;
196                                 *out++ = *in++ & 0x7f;
197                         } else {
198                                 K_OUT();
199                                 *out++ = SUBST_CHAR;
200                                 in++;
201                                 if (*in != '\0' && !isascii(*in)) {
202                                         *out++ = SUBST_CHAR;
203                                         in++;
204                                 }
205                         }
206                 } else if (iseuchwkana1(*in)) {
207                         in++;
208                         if (iseuchwkana2(*in)) {
209                                 HW_IN();
210                                 *out++ = *in++ & 0x7f;
211                         } else {
212                                 K_OUT();
213                                 if (*in != '\0' && !isascii(*in)) {
214                                         *out++ = SUBST_CHAR;
215                                         in++;
216                                 }
217                         }
218                 } else if (iseucaux(*in)) {
219                         in++;
220                         if (iseuckanji(*in) && iseuckanji(*(in + 1))) {
221                                 AUX_IN();
222                                 *out++ = *in++ & 0x7f;
223                                 *out++ = *in++ & 0x7f;
224                         } else {
225                                 K_OUT();
226                                 if (*in != '\0' && !isascii(*in)) {
227                                         *out++ = SUBST_CHAR;
228                                         in++;
229                                         if (*in != '\0' && !isascii(*in)) {
230                                                 *out++ = SUBST_CHAR;
231                                                 in++;
232                                         }
233                                 }
234                         }
235                 } else {
236                         K_OUT();
237                         *out++ = SUBST_CHAR;
238                         in++;
239                 }
240         }
241
242         K_OUT();
243         *out = '\0';
244 }
245
246 void conv_sjistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
247 {
248         const guchar *in = inbuf;
249         guchar *out = outbuf;
250
251         while (*in != '\0') {
252                 if (isascii(*in)) {
253                         *out++ = *in++;
254                 } else if (issjiskanji1(*in)) {
255                         if (issjiskanji2(*(in + 1))) {
256                                 guchar out1 = *in;
257                                 guchar out2 = *(in + 1);
258                                 guchar row;
259
260                                 row = out1 < 0xa0 ? 0x70 : 0xb0;
261                                 if (out2 < 0x9f) {
262                                         out1 = (out1 - row) * 2 - 1;
263                                         out2 -= out2 > 0x7f ? 0x20 : 0x1f;
264                                 } else {
265                                         out1 = (out1 - row) * 2;
266                                         out2 -= 0x7e;
267                                 }
268
269                                 *out++ = out1 | 0x80;
270                                 *out++ = out2 | 0x80;
271                                 in += 2;
272                         } else {
273                                 *out++ = SUBST_CHAR;
274                                 in++;
275                                 if (*in != '\0' && !isascii(*in)) {
276                                         *out++ = SUBST_CHAR;
277                                         in++;
278                                 }
279                         }
280                 } else if (issjishwkana(*in)) {
281                         *out++ = 0x8e;
282                         *out++ = *in++;
283                 } else {
284                         *out++ = SUBST_CHAR;
285                         in++;
286                 }
287         }
288
289         *out = '\0';
290 }
291
292 void conv_anytoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
293 {
294         switch (conv_guess_encoding(inbuf)) {
295         case C_ISO_2022_JP:
296                 conv_jistoeuc(outbuf, outlen, inbuf);
297                 break;
298         case C_SHIFT_JIS:
299                 conv_sjistoeuc(outbuf, outlen, inbuf);
300                 break;
301         default:
302                 strncpy2(outbuf, inbuf, outlen);
303                 break;
304         }
305 }
306
307 void conv_anytojis(gchar *outbuf, gint outlen, const gchar *inbuf)
308 {
309         switch (conv_guess_encoding(inbuf)) {
310         case C_EUC_JP:
311                 conv_euctojis(outbuf, outlen, inbuf);
312                 break;
313         default:
314                 strncpy2(outbuf, inbuf, outlen);
315                 break;
316         }
317 }
318
319 void conv_unreadable_eucjp(gchar *str)
320 {
321         register guchar *p = str;
322
323         while (*p != '\0') {
324                 if (isascii(*p)) {
325                         /* convert CR+LF -> LF */
326                         if (*p == '\r' && *(p + 1) == '\n')
327                                 memmove(p, p + 1, strlen(p));
328                         /* printable 7 bit code */
329                         p++;
330                 } else if (iseuckanji(*p)) {
331                         if (iseuckanji(*(p + 1)) && !isunprintableeuckanji(*p))
332                                 /* printable euc-jp code */
333                                 p += 2;
334                         else {
335                                 /* substitute unprintable code */
336                                 *p++ = SUBST_CHAR;
337                                 if (*p != '\0') {
338                                         if (isascii(*p))
339                                                 p++;
340                                         else
341                                                 *p++ = SUBST_CHAR;
342                                 }
343                         }
344                 } else if (iseuchwkana1(*p)) {
345                         if (iseuchwkana2(*(p + 1)))
346                                 /* euc-jp hankaku kana */
347                                 p += 2;
348                         else
349                                 *p++ = SUBST_CHAR;
350                 } else if (iseucaux(*p)) {
351                         if (iseuckanji(*(p + 1)) && iseuckanji(*(p + 2))) {
352                                 /* auxiliary kanji */
353                                 p += 3;
354                         } else
355                                 *p++ = SUBST_CHAR;
356                 } else
357                         /* substitute unprintable 1 byte code */
358                         *p++ = SUBST_CHAR;
359         }
360 }
361
362 void conv_unreadable_8bit(gchar *str)
363 {
364         register guchar *p = str;
365
366         while (*p != '\0') {
367                 /* convert CR+LF -> LF */
368                 if (*p == '\r' && *(p + 1) == '\n')
369                         memmove(p, p + 1, strlen(p));
370                 else if (!isascii(*p)) *p = SUBST_CHAR;
371                 p++;
372         }
373 }
374
375 void conv_unreadable_latin(gchar *str)
376 {
377         register guchar *p = str;
378
379         while (*p != '\0') {
380                 /* convert CR+LF -> LF */
381                 if (*p == '\r' && *(p + 1) == '\n')
382                         memmove(p, p + 1, strlen(p));
383                 else if ((*p & 0xff) >= 0x80 && (*p & 0xff) <= 0x9f)
384                         *p = SUBST_CHAR;
385                 p++;
386         }
387 }
388
389 #define NCV     '\0'
390
391 void conv_mb_alnum(gchar *str)
392 {
393         static guchar char_tbl[] = {
394                 /* 0xa0 - 0xaf */
395                 NCV, ' ', NCV, NCV, ',', '.', NCV, ':',
396                 ';', '?', '!', NCV, NCV, NCV, NCV, NCV,
397                 /* 0xb0 - 0xbf */
398                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
399                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
400                 /* 0xc0 - 0xcf */
401                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
402                 NCV, NCV, '(', ')', NCV, NCV, '[', ']',
403                 /* 0xd0 - 0xdf */
404                 '{', '}', NCV, NCV, NCV, NCV, NCV, NCV,
405                 NCV, NCV, NCV, NCV, '+', '-', NCV, NCV,
406                 /* 0xe0 - 0xef */
407                 NCV, '=', NCV, '<', '>', NCV, NCV, NCV,
408                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV
409         };
410
411         register guchar *p = str;
412         register gint len;
413
414         len = strlen(str);
415
416         while (len > 1) {
417                 if (*p == 0xa3) {
418                         register guchar ch = *(p + 1);
419
420                         if (ch >= 0xb0 && ch <= 0xfa) {
421                                 /* [a-zA-Z] */
422                                 *p = ch & 0x7f;
423                                 p++;
424                                 len--;
425                                 memmove(p, p + 1, len);
426                                 len--;
427                         } else  {
428                                 p += 2;
429                                 len -= 2;
430                         }
431                 } else if (*p == 0xa1) {
432                         register guchar ch = *(p + 1);
433
434                         if (ch >= 0xa0 && ch <= 0xef &&
435                             NCV != char_tbl[ch - 0xa0]) {
436                                 *p = char_tbl[ch - 0xa0];
437                                 p++;
438                                 len--;
439                                 memmove(p, p + 1, len);
440                                 len--;
441                         } else {
442                                 p += 2;
443                                 len -= 2;
444                         }
445                 } else if (iseuckanji(*p)) {
446                         p += 2;
447                         len -= 2;
448                 } else {
449                         p++;
450                         len--;
451                 }
452         }
453 }
454
455 CharSet conv_guess_encoding(const gchar *str)
456 {
457         const guchar *p = str;
458         CharSet guessed = C_US_ASCII;
459
460         while (*p != '\0') {
461                 if (*p == ESC && (*(p + 1) == '$' || *(p + 1) == '(')) {
462                         if (guessed == C_US_ASCII)
463                                 return C_ISO_2022_JP;
464                         p += 2;
465                 } else if (isascii(*p)) {
466                         p++;
467                 } else if (iseuckanji(*p) && iseuckanji(*(p + 1))) {
468                         if (*p >= 0xfd && *p <= 0xfe)
469                                 return C_EUC_JP;
470                         else if (guessed == C_SHIFT_JIS) {
471                                 if ((issjiskanji1(*p) &&
472                                      issjiskanji2(*(p + 1))) ||
473                                     issjishwkana(*p))
474                                         guessed = C_SHIFT_JIS;
475                                 else
476                                         guessed = C_EUC_JP;
477                         } else
478                                 guessed = C_EUC_JP;
479                         p += 2;
480                 } else if (issjiskanji1(*p) && issjiskanji2(*(p + 1))) {
481                         if (iseuchwkana1(*p) && iseuchwkana2(*(p + 1)))
482                                 guessed = C_SHIFT_JIS;
483                         else
484                                 return C_SHIFT_JIS;
485                         p += 2;
486                 } else if (issjishwkana(*p)) {
487                         guessed = C_SHIFT_JIS;
488                         p++;
489                 } else {
490                         p++;
491                 }
492         }
493
494         return guessed;
495 }
496
497 void conv_jistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
498 {
499         conv_jistoeuc(outbuf, outlen, inbuf);
500         conv_unreadable_eucjp(outbuf);
501 }
502
503 void conv_sjistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
504 {
505         conv_sjistoeuc(outbuf, outlen, inbuf);
506         conv_unreadable_eucjp(outbuf);
507 }
508
509 void conv_euctodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
510 {
511         strncpy2(outbuf, inbuf, outlen);
512         conv_unreadable_eucjp(outbuf);
513 }
514
515 void conv_anytodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
516 {
517         conv_anytoeuc(outbuf, outlen, inbuf);
518         conv_unreadable_eucjp(outbuf);
519 }
520
521 void conv_ustodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
522 {
523         strncpy2(outbuf, inbuf, outlen);
524         conv_unreadable_8bit(outbuf);
525 }
526
527 void conv_latintodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
528 {
529         strncpy2(outbuf, inbuf, outlen);
530         conv_unreadable_latin(outbuf);
531 }
532
533 void conv_noconv(gchar *outbuf, gint outlen, const gchar *inbuf)
534 {
535         strncpy2(outbuf, inbuf, outlen);
536 }
537
538 CodeConverter *conv_code_converter_new(const gchar *charset)
539 {
540         CodeConverter *conv;
541
542         conv = g_new0(CodeConverter, 1);
543 #if !HAVE_LIBJCONV
544         conv->code_conv_func = conv_get_code_conv_func(charset);
545 #endif
546         conv->charset_str = g_strdup(charset);
547         conv->charset = conv_get_charset_from_str(charset);
548
549         return conv;
550 }
551
552 void conv_code_converter_destroy(CodeConverter *conv)
553 {
554         g_free(conv->charset_str);
555         g_free(conv);
556 }
557
558 gint conv_convert(CodeConverter *conv, gchar *outbuf, gint outlen,
559                   const gchar *inbuf)
560 {
561 #if HAVE_LIBJCONV
562         gchar *str;
563
564         str = conv_codeset_strdup(inbuf, conv->charset_str, NULL);
565         if (!str)
566                 return -1;
567         else {
568                 strncpy2(outbuf, str, outlen);
569                 g_free(str);
570         }
571 #else /* !HAVE_LIBJCONV */
572         conv->code_conv_func(outbuf, outlen, inbuf);
573 #endif
574
575         return 0;
576 }
577
578 gchar *conv_codeset_strdup(const gchar *inbuf,
579                            const gchar *src_codeset, const gchar *dest_codeset)
580 {
581         gchar *buf;
582         size_t len;
583 #if HAVE_LIBJCONV
584         gint actual_codeset;
585         const gchar *const *codesets;
586         gint n_codesets;
587 #else /* !HAVE_LIBJCONV */
588         CharSet src_charset = C_AUTO, dest_charset = C_AUTO;
589 #endif
590
591         if (!dest_codeset) {
592                 CodeConvFunc func;
593
594                 func = conv_get_code_conv_func(src_codeset);
595                 if (func != conv_noconv) {
596                         if (func == conv_jistodisp ||
597                             func == conv_sjistodisp ||
598                             func == conv_anytodisp)
599                                 len = strlen(inbuf) * 2 + 1;
600                         else
601                                 len = strlen(inbuf) + 1;
602                         buf = g_malloc(len);
603                         if (!buf) return NULL;
604                         func(buf, len, inbuf);
605                         buf = g_realloc(buf, strlen(buf) + 1);
606                         return buf;
607                 }
608         }
609
610         /* don't convert if src and dest codeset are identical */
611         if (src_codeset && dest_codeset &&
612             !strcasecmp(src_codeset, dest_codeset))
613                 return g_strdup(inbuf);
614
615 #if HAVE_LIBJCONV
616         if (src_codeset) {
617                 codesets = &src_codeset;
618                 n_codesets = 1;
619         } else
620                 codesets = jconv_info_get_pref_codesets(&n_codesets);
621         if (!dest_codeset) {
622                 dest_codeset = conv_get_current_charset_str();
623                 /* don't convert if current codeset is US-ASCII */
624                 if (!strcasecmp(dest_codeset, CS_US_ASCII))
625                         return g_strdup(inbuf);
626         }
627
628         if (jconv_alloc_conv(inbuf, strlen(inbuf), &buf, &len,
629                              codesets, n_codesets,
630                              &actual_codeset, dest_codeset)
631             == 0)
632                 return buf;
633         else {
634 #if 0
635                 g_warning("code conversion from %s to %s failed\n",
636                           codesets && codesets[0] ? codesets[0] : "(unknown)",
637                           dest_codeset);
638 #endif /* 0 */
639                 return NULL;
640         }
641 #else /* !HAVE_LIBJCONV */
642         if (src_codeset) {
643                 if (!strcasecmp(src_codeset, CS_EUC_JP) ||
644                     !strcasecmp(src_codeset, CS_EUCJP))
645                         src_charset = C_EUC_JP;
646                 else if (!strcasecmp(src_codeset, CS_SHIFT_JIS) ||
647                          !strcasecmp(src_codeset, "SHIFT-JIS") ||
648                          !strcasecmp(src_codeset, "SJIS"))
649                         src_charset = C_SHIFT_JIS;
650                 if (dest_codeset && !strcasecmp(dest_codeset, CS_ISO_2022_JP))
651                         dest_charset = C_ISO_2022_JP;
652         }
653
654         if ((src_charset == C_EUC_JP || src_charset == C_SHIFT_JIS) &&
655             dest_charset == C_ISO_2022_JP) {
656                 len = (strlen(inbuf) + 1) * 3;
657                 buf = g_malloc(len);
658                 if (buf) {
659                         if (src_charset == C_EUC_JP)
660                                 conv_euctojis(buf, len, inbuf);
661                         else
662                                 conv_anytojis(buf, len, inbuf);
663                         buf = g_realloc(buf, strlen(buf) + 1);
664                 }
665         } else
666                 buf = g_strdup(inbuf);
667
668         return buf;
669 #endif /* !HAVE_LIBJCONV */
670 }
671
672 CodeConvFunc conv_get_code_conv_func(const gchar *charset)
673 {
674         CodeConvFunc code_conv;
675         CharSet cur_charset;
676
677         if (!charset) {
678                 cur_charset = conv_get_current_charset();
679                 if (cur_charset == C_EUC_JP || cur_charset == C_SHIFT_JIS)
680                         return conv_anytodisp;
681                 else
682                         return conv_noconv;
683         }
684
685         if (!strcasecmp(charset, CS_ISO_2022_JP) ||
686             !strcasecmp(charset, CS_ISO_2022_JP_2))
687                 code_conv = conv_jistodisp;
688         else if (!strcasecmp(charset, CS_US_ASCII))
689                 code_conv = conv_ustodisp;
690         else if (!strncasecmp(charset, CS_ISO_8859_1, 10))
691                 code_conv = conv_latintodisp;
692 #if !HAVE_LIBJCONV
693         else if (!strncasecmp(charset, "ISO-8859-", 9))
694                 code_conv = conv_latintodisp;
695 #endif
696         else if (!strcasecmp(charset, CS_SHIFT_JIS) ||
697                  !strcasecmp(charset, "SHIFT-JIS")  ||
698                  !strcasecmp(charset, "SJIS")       ||
699                  !strcasecmp(charset, "X-SJIS"))
700                 code_conv = conv_sjistodisp;
701         else if (!strcasecmp(charset, CS_EUC_JP) ||
702                  !strcasecmp(charset, CS_EUCJP))
703                 code_conv = conv_euctodisp;
704         else
705                 code_conv = conv_noconv;
706
707         return code_conv;
708 }
709
710 static const struct {
711         CharSet charset;
712         gchar *const name;
713 } charsets[] = {
714         {C_US_ASCII,            CS_US_ASCII},
715         {C_US_ASCII,            CS_ANSI_X3_4_1968},
716         {C_UTF_8,               CS_UTF_8},
717         {C_ISO_8859_1,          CS_ISO_8859_1},
718         {C_ISO_8859_2,          CS_ISO_8859_2},
719         {C_ISO_8859_4,          CS_ISO_8859_4},
720         {C_ISO_8859_5,          CS_ISO_8859_5},
721         {C_ISO_8859_7,          CS_ISO_8859_7},
722         {C_ISO_8859_8,          CS_ISO_8859_8},
723         {C_ISO_8859_9,          CS_ISO_8859_9},
724         {C_ISO_8859_11,         CS_ISO_8859_11},
725         {C_ISO_8859_13,         CS_ISO_8859_13},
726         {C_ISO_8859_15,         CS_ISO_8859_15},
727         {C_BALTIC,              CS_BALTIC},
728         {C_CP1251,              CS_CP1251},
729         {C_WINDOWS_1251,        CS_WINDOWS_1251},
730         {C_KOI8_R,              CS_KOI8_R},
731         {C_KOI8_U,              CS_KOI8_U},
732         {C_ISO_2022_JP,         CS_ISO_2022_JP},
733         {C_ISO_2022_JP_2,       CS_ISO_2022_JP_2},
734         {C_EUC_JP,              CS_EUC_JP},
735         {C_EUC_JP,              CS_EUCJP},
736         {C_SHIFT_JIS,           CS_SHIFT_JIS},
737         {C_ISO_2022_KR,         CS_ISO_2022_KR},
738         {C_EUC_KR,              CS_EUC_KR},
739         {C_ISO_2022_CN,         CS_ISO_2022_CN},
740         {C_EUC_CN,              CS_EUC_CN},
741         {C_GB2312,              CS_GB2312},
742         {C_EUC_TW,              CS_EUC_TW},
743         {C_BIG5,                CS_BIG5},
744         {C_TIS_620,             CS_TIS_620},
745         {C_WINDOWS_874,         CS_WINDOWS_874},
746 };
747
748 #if !HAVE_LIBJCONV
749 static const struct {
750         gchar *const locale;
751         CharSet charset;
752         CharSet out_charset;
753 } locale_table[] = {
754         {"ja_JP.eucJP"  , C_EUC_JP      , C_ISO_2022_JP},
755         {"ja_JP.ujis"   , C_EUC_JP      , C_ISO_2022_JP},
756         {"ja_JP.EUC"    , C_EUC_JP      , C_ISO_2022_JP},
757         {"ja_JP.SJIS"   , C_SHIFT_JIS   , C_ISO_2022_JP},
758         {"ja_JP.JIS"    , C_ISO_2022_JP , C_ISO_2022_JP},
759         {"ja_JP"        , C_EUC_JP      , C_ISO_2022_JP},
760         {"ko_KR"        , C_EUC_KR      , C_EUC_KR},
761         {"zh_CN.GB2312" , C_GB2312      , C_GB2312},
762         {"zh_CN"        , C_GB2312      , C_GB2312},
763         {"zh_TW.eucTW"  , C_EUC_TW      , C_BIG5},
764         {"zh_TW.Big5"   , C_BIG5        , C_BIG5},
765         {"zh_TW"        , C_BIG5        , C_BIG5},
766
767         {"ru_RU.KOI8-R" , C_KOI8_R      , C_ISO_8859_5},
768         {"ru_RU.CP1251" , C_WINDOWS_1251, C_ISO_8859_5},
769
770         {"bg_BG"        , C_WINDOWS_1251, C_WINDOWS_1251},
771
772         {"en_US"        , C_ISO_8859_1  , C_ISO_8859_1},
773         {"ca_ES"        , C_ISO_8859_1  , C_ISO_8859_1},
774         {"da_DK"        , C_ISO_8859_1  , C_ISO_8859_1},
775         {"de_DE"        , C_ISO_8859_1  , C_ISO_8859_1},
776         {"nl_NL"        , C_ISO_8859_1  , C_ISO_8859_1},
777         {"et_EE"        , C_ISO_8859_1  , C_ISO_8859_1},
778         {"fi_FI"        , C_ISO_8859_1  , C_ISO_8859_1},
779         {"fr_FR"        , C_ISO_8859_1  , C_ISO_8859_1},
780         {"is_IS"        , C_ISO_8859_1  , C_ISO_8859_1},
781         {"it_IT"        , C_ISO_8859_1  , C_ISO_8859_1},
782         {"no_NO"        , C_ISO_8859_1  , C_ISO_8859_1},
783         {"pt_PT"        , C_ISO_8859_1  , C_ISO_8859_1},
784         {"pt_BR"        , C_ISO_8859_1  , C_ISO_8859_1},
785         {"es_ES"        , C_ISO_8859_1  , C_ISO_8859_1},
786         {"sv_SE"        , C_ISO_8859_1  , C_ISO_8859_1},
787
788         {"hr_HR"        , C_ISO_8859_2  , C_ISO_8859_2},
789         {"hu_HU"        , C_ISO_8859_2  , C_ISO_8859_2},
790         {"pl_PL"        , C_ISO_8859_2  , C_ISO_8859_2},
791         {"ro_RO"        , C_ISO_8859_2  , C_ISO_8859_2},
792         {"sk_SK"        , C_ISO_8859_2  , C_ISO_8859_2},
793         {"sl_SI"        , C_ISO_8859_2  , C_ISO_8859_2},
794         {"ru_RU"        , C_ISO_8859_5  , C_ISO_8859_5},
795         {"el_GR"        , C_ISO_8859_7  , C_ISO_8859_7},
796         {"iw_IL"        , C_ISO_8859_8  , C_ISO_8859_8},
797         {"tr_TR"        , C_ISO_8859_9  , C_ISO_8859_9},
798
799         {"th_TH"        , C_TIS_620     , C_TIS_620},
800         /* {"th_TH"     , C_WINDOWS_874}, */
801         /* {"th_TH"     , C_ISO_8859_11}, */
802
803         {"lt_LT.iso88594"       , C_ISO_8859_4  , C_ISO_8859_4},
804         {"lt_LT.ISO8859-4"      , C_ISO_8859_4  , C_ISO_8859_4},
805         {"lt_LT.ISO_8859-4"     , C_ISO_8859_4  , C_ISO_8859_4},
806         {"lt_LT"                , C_ISO_8859_13 , C_ISO_8859_13},
807         {"lv_LV"                , C_ISO_8859_13 , C_ISO_8859_13},
808
809         {"C"                    , C_US_ASCII    , C_US_ASCII},
810         {"POSIX"                , C_US_ASCII    , C_US_ASCII},
811         {"ANSI_X3.4-1968"       , C_US_ASCII    , C_US_ASCII},
812 };
813 #endif /* !HAVE_LIBJCONV */
814
815 const gchar *conv_get_charset_str(CharSet charset)
816 {
817         gint i;
818
819         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
820                 if (charsets[i].charset == charset)
821                         return charsets[i].name;
822         }
823
824         return NULL;
825 }
826
827 CharSet conv_get_charset_from_str(const gchar *charset)
828 {
829         gint i;
830
831         if (!charset) return C_AUTO;
832
833         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
834                 if (!strcasecmp(charsets[i].name, charset))
835                         return charsets[i].charset;
836         }
837
838         return C_AUTO;
839 }
840
841 CharSet conv_get_current_charset(void)
842 {
843         static CharSet cur_charset = -1;
844         gint i;
845
846 #if HAVE_LIBJCONV
847         const gchar *cur_codeset;
848 #else
849         const gchar *cur_locale;
850 #endif
851
852         if (cur_charset != -1)
853                 return cur_charset;
854
855 #if HAVE_LIBJCONV
856         cur_codeset = jconv_info_get_current_codeset();
857         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
858                 if (!strcasecmp(cur_codeset, charsets[i].name)) {
859                         cur_charset = charsets[i].charset;
860                         return cur_charset;
861                 }
862         }
863 #else
864         cur_locale = conv_get_current_locale();
865         if (!cur_locale) {
866                 cur_charset = C_US_ASCII;
867                 return cur_charset;
868         }
869
870         if (strcasestr(cur_locale, "UTF-8")) {
871                 cur_charset = C_UTF_8;
872                 return cur_charset;
873         }
874
875         for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
876                 const gchar *p;
877
878                 /* "ja_JP.EUC" matches with "ja_JP.eucJP" and "ja_JP.EUC" */
879                 /* "ja_JP" matches with "ja_JP.xxxx" and "ja" */
880                 if (!strncasecmp(cur_locale, locale_table[i].locale,
881                                  strlen(locale_table[i].locale))) {
882                         cur_charset = locale_table[i].charset;
883                         return cur_charset;
884                 } else if ((p = strchr(locale_table[i].locale, '_')) &&
885                          !strchr(p + 1, '.')) {
886                         if (strlen(cur_locale) == 2 &&
887                             !strncasecmp(cur_locale, locale_table[i].locale, 2)) {
888                                 cur_charset = locale_table[i].charset;
889                                 return cur_charset;
890                         }
891                 }
892         }
893 #endif
894
895         cur_charset = C_AUTO;
896         return cur_charset;
897 }
898
899 const gchar *conv_get_current_charset_str(void)
900 {
901         static const gchar *codeset = NULL;
902
903         if (!codeset)
904                 codeset = conv_get_charset_str(conv_get_current_charset());
905
906         return codeset ? codeset : "US-ASCII";
907 }
908
909 CharSet conv_get_outgoing_charset(void)
910 {
911         static CharSet out_charset = -1;
912         gint i;
913
914 #if HAVE_LIBJCONV
915         gint j, n_pref_codesets;
916         const gchar *const *pref_codesets;
917 #else
918         const gchar *cur_locale;
919 #endif
920
921         if (out_charset != -1)
922                 return out_charset;
923
924 #if HAVE_LIBJCONV
925         /* skip US-ASCII and UTF-8 */
926         pref_codesets = jconv_info_get_pref_codesets(&n_pref_codesets);
927         for (i = 0; i < n_pref_codesets; i++) {
928                 for (j = 3; j < sizeof(charsets) / sizeof(charsets[0]); j++) {
929                         if (!strcasecmp(pref_codesets[i], charsets[j].name)) {
930                                 out_charset = charsets[j].charset;
931                                 return out_charset;
932                         }
933                 }
934         }
935
936         for (i = 0; i < n_pref_codesets; i++) {
937                 if (!strcasecmp(pref_codesets[i], "UTF-8")) {
938                         out_charset = C_UTF_8;
939                         return out_charset;
940                 }
941         }
942
943         out_charset = C_AUTO;
944 #else
945         cur_locale = conv_get_current_locale();
946         if (!cur_locale) {
947                 out_charset = C_AUTO;
948                 return out_charset;
949         }
950
951         for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
952                 const gchar *p;
953
954                 if (!strncasecmp(cur_locale, locale_table[i].locale,
955                                  strlen(locale_table[i].locale))) {
956                         out_charset = locale_table[i].out_charset;
957                         break;
958                 } else if ((p = strchr(locale_table[i].locale, '_')) &&
959                          !strchr(p + 1, '.')) {
960                         if (strlen(cur_locale) == 2 &&
961                             !strncasecmp(cur_locale, locale_table[i].locale, 2)) {
962                                 out_charset = locale_table[i].out_charset;
963                                 break;
964                         }
965                 }
966         }
967
968         /* encoding conversion without libjconv is only supported
969            on Japanese locale for now */
970         if (out_charset == C_ISO_2022_JP)
971                 return out_charset;
972
973         out_charset = conv_get_current_charset();
974 #endif
975
976         return out_charset;
977 }
978
979 const gchar *conv_get_outgoing_charset_str(void)
980 {
981         CharSet out_charset;
982         const gchar *str;
983
984         if (prefs_common.outgoing_charset) {
985                 if (!isalpha(prefs_common.outgoing_charset[0])) {
986                         g_free(prefs_common.outgoing_charset);
987                         prefs_common.outgoing_charset = g_strdup(CS_AUTO);
988                 } else if (strcmp(prefs_common.outgoing_charset, CS_AUTO) != 0)
989                         return prefs_common.outgoing_charset;
990         }
991
992         out_charset = conv_get_outgoing_charset();
993         str = conv_get_charset_str(out_charset);
994
995         return str ? str : "US-ASCII";
996 }
997
998 const gchar *conv_get_current_locale(void)
999 {
1000         gchar *cur_locale;
1001
1002         cur_locale = g_getenv("LC_ALL");
1003         if (!cur_locale) cur_locale = g_getenv("LC_CTYPE");
1004         if (!cur_locale) cur_locale = g_getenv("LANG");
1005         if (!cur_locale) cur_locale = setlocale(LC_CTYPE, NULL);
1006
1007         debug_print("current locale: %s\n",
1008                     cur_locale ? cur_locale : "(none)");
1009
1010         return cur_locale;
1011 }
1012
1013 void conv_unmime_header_overwrite(gchar *str)
1014 {
1015         gchar *buf;
1016         gint buflen;
1017         CharSet cur_charset;
1018
1019         cur_charset = conv_get_current_charset();
1020
1021         if (cur_charset == C_EUC_JP) {
1022                 buflen = strlen(str) * 2 + 1;
1023                 Xalloca(buf, buflen, return);
1024                 conv_anytodisp(buf, buflen, str);
1025                 unmime_header(str, buf);
1026         } else {
1027                 buflen = strlen(str) + 1;
1028                 Xalloca(buf, buflen, return);
1029                 unmime_header(buf, str);
1030                 strncpy2(str, buf, buflen);
1031         }
1032 }
1033
1034 void conv_unmime_header(gchar *outbuf, gint outlen, const gchar *str,
1035                         const gchar *charset)
1036 {
1037         CharSet cur_charset;
1038
1039         cur_charset = conv_get_current_charset();
1040
1041         if (cur_charset == C_EUC_JP) {
1042                 gchar *buf;
1043                 gint buflen;
1044
1045                 buflen = strlen(str) * 2 + 1;
1046                 Xalloca(buf, buflen, return);
1047                 conv_anytodisp(buf, buflen, str);
1048                 unmime_header(outbuf, buf);
1049         } else
1050                 unmime_header(outbuf, str);
1051 }
1052
/* Maximum number of characters on one line of an encoded header. */
#define MAX_LINELEN     76
/* Delimiters that open and close a MIME encoded-word. */
#define MIMESEP_BEGIN   "=?"
#define MIMESEP_END     "?="

/* Length of the Base64 text for `len` input bytes: four characters per
   group of three bytes, a trailing partial group counting as a full one. */
#define B64LEN(len)     ((len) / 3 * 4 + ((len) % 3 ? 4 : 0))

/*
 * Helper for conv_encode_header().  Expects `dest`, `destp`, `len` and
 * `left` to be in scope where it is expanded, inside a void function.
 *
 * - If fewer than MAX_LINELEN + 2 bytes remain in `dest`, the output is
 *   terminated and the enclosing function returns.
 * - Otherwise, if `cond` holds, a folded line break ("\n ") is emitted,
 *   replacing one trailing whitespace character if there is one, and `left`
 *   is reset for the new line.
 *
 * Wrapped in do { } while (0) so it behaves as a single statement, and the
 * isspace() argument is cast to unsigned char as <ctype.h> requires.
 */
#define LBREAK_IF_REQUIRED(cond)                                \
do {                                                            \
        if (len - (destp - dest) < MAX_LINELEN + 2) {           \
                *destp = '\0';                                  \
                return;                                         \
        }                                                       \
                                                                \
        if (cond) {                                             \
                if (destp > dest &&                             \
                    isspace((unsigned char)*(destp - 1)))       \
                        destp--;                                \
                *destp++ = '\n';                                \
                *destp++ = ' ';                                 \
                left = MAX_LINELEN - 1;                         \
        }                                                       \
} while (0)
1074
1075 void conv_encode_header(gchar *dest, gint len, const gchar *src,
1076                         gint header_len)
1077 {
1078         const gchar *cur_encoding;
1079         const gchar *out_encoding;
1080         gint mimestr_len;
1081         gchar *mimesep_enc;
1082         gint left;
1083         const gchar *srcp = src;
1084         gchar *destp = dest;
1085         gboolean use_base64;
1086
1087         if (MB_CUR_MAX > 1) {
1088                 use_base64 = TRUE;
1089                 mimesep_enc = "?B?";
1090         } else {
1091                 use_base64 = FALSE;
1092                 mimesep_enc = "?Q?";
1093         }
1094
1095         cur_encoding = conv_get_current_charset_str();
1096         out_encoding = conv_get_outgoing_charset_str();
1097         if (!strcmp(out_encoding, "US-ASCII"))
1098                 out_encoding = "ISO-8859-1";
1099
1100         mimestr_len = strlen(MIMESEP_BEGIN) + strlen(out_encoding) +
1101                 strlen(mimesep_enc) + strlen(MIMESEP_END);
1102
1103         left = MAX_LINELEN - header_len;
1104
1105         while (*srcp) {
1106                 LBREAK_IF_REQUIRED(left <= 0);
1107
1108                 while (isspace(*srcp)) {
1109                         *destp++ = *srcp++;
1110                         left--;
1111                         LBREAK_IF_REQUIRED(left <= 0);
1112                 }
1113
1114                 /* output as it is if the next word is ASCII string */
1115                 if (!is_next_nonascii(srcp)) {
1116                         gint word_len;
1117
1118                         word_len = get_next_word_len(srcp);
1119                         LBREAK_IF_REQUIRED(left < word_len);
1120                         while(*srcp && !isspace(*srcp)) {
1121                                 *destp++ = *srcp++;
1122                                 left--;
1123                                 LBREAK_IF_REQUIRED(left <= 0);
1124                         }
1125
1126                         continue;
1127                 }
1128
1129                 while (1) {
1130                         gint mb_len = 0;
1131                         gint cur_len = 0;
1132                         gchar *part_str;
1133                         gchar *out_str;
1134                         gchar *enc_str;
1135                         const gchar *p = srcp;
1136                         gint out_str_len;
1137                         gint out_enc_str_len;
1138                         gint mime_block_len;
1139                         gboolean cont = FALSE;
1140
1141                         while (*p != '\0') {
1142                                 if (isspace(*p) && !is_next_nonascii(p + 1))
1143                                         break;
1144
1145                                 mb_len = mblen(p, MB_CUR_MAX);
1146                                 if (mb_len < 0) {
1147                                         g_warning("invalid multibyte character encountered\n");
1148                                         break;
1149                                 }
1150
1151                                 Xstrndup_a(part_str, srcp, cur_len + mb_len, );
1152                                 out_str = conv_codeset_strdup
1153                                         (part_str, cur_encoding, out_encoding);
1154                                 out_str_len = strlen(out_str);
1155
1156                                 if (use_base64)
1157                                         out_enc_str_len = B64LEN(out_str_len);
1158                                 else
1159                                         out_enc_str_len =
1160                                                 qp_get_q_encoding_len(out_str);
1161
1162                                 g_free(out_str);
1163
1164                                 if (mimestr_len + out_enc_str_len <= left) {
1165                                         cur_len += mb_len;
1166                                         p += mb_len;
1167                                 } else if (cur_len == 0) {
1168                                         LBREAK_IF_REQUIRED(1);
1169                                         continue;
1170                                 } else {
1171                                         cont = TRUE;
1172                                         break;
1173                                 }
1174                         }
1175
1176                         if (cur_len > 0) {
1177                                 Xstrndup_a(part_str, srcp, cur_len, );
1178                                 out_str = conv_codeset_strdup
1179                                         (part_str, cur_encoding, out_encoding);
1180                                 out_str_len = strlen(out_str);
1181
1182                                 if (use_base64)
1183                                         out_enc_str_len = B64LEN(out_str_len);
1184                                 else
1185                                         out_enc_str_len =
1186                                                 qp_get_q_encoding_len(out_str);
1187
1188                                 Xalloca(enc_str, out_enc_str_len + 1, );
1189                                 if (use_base64)
1190                                         base64_encode(enc_str, out_str, out_str_len);
1191                                 else
1192                                         qp_q_encode(enc_str, out_str);
1193
1194                                 g_free(out_str);
1195
1196                                 /* output MIME-encoded string block */
1197                                 mime_block_len = mimestr_len + strlen(enc_str);
1198                                 g_snprintf(destp, mime_block_len + 1,
1199                                            MIMESEP_BEGIN "%s%s%s" MIMESEP_END,
1200                                            out_encoding, mimesep_enc, enc_str);
1201                                 destp += mime_block_len;
1202                                 srcp += cur_len;
1203
1204                                 left -= mime_block_len;
1205                         }
1206
1207                         LBREAK_IF_REQUIRED(cont);
1208
1209                         if (cur_len == 0)
1210                                 break;
1211                 }
1212         }
1213
1214         *destp = '\0';
1215 }
1216
1217 #undef LBREAK_IF_REQUIRED