src/codeconv.c

   1 /*
   2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
   3  * Copyright (C) 1999-2002 Hiroyuki Yamamoto
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 #  include "config.h"
  22 #endif
  23
  24 #include <glib.h>
  25 #include <string.h>
  26 #include <ctype.h>
  27 #include <stdlib.h>
  28
  29 #if (HAVE_WCTYPE_H && HAVE_WCHAR_H)
  30 #  include <wchar.h>
  31 #  include <wctype.h>
  32 #endif
  33
  34 #if HAVE_LOCALE_H
  35 #  include <locale.h>
  36 #endif
  37
  38 #if HAVE_LIBJCONV
  39 #  include <jconv.h>
  40 #endif
  41
  42 #include "intl.h"
  43 #include "codeconv.h"
  44 #include "unmime.h"
  45 #include "base64.h"
  46 #include "quoted-printable.h"
  47 #include "utils.h"
  48 #include "prefs_common.h"
  49
  50 typedef enum
  51 {
  52         JIS_ASCII,
  53         JIS_KANJI,
  54         JIS_HWKANA,
  55         JIS_AUXKANJI
  56 } JISState;
  57
  58 #define SUBST_CHAR      '_'
  59 #define ESC             '\033'
  60
  61 #define iseuckanji(c) \
  62         (((c) & 0xff) >= 0xa1 && ((c) & 0xff) <= 0xfe)
  63 #define iseuchwkana1(c) \
  64         (((c) & 0xff) == 0x8e)
  65 #define iseuchwkana2(c) \
  66         (((c) & 0xff) >= 0xa1 && ((c) & 0xff) <= 0xdf)
  67 #define iseucaux(c) \
  68         (((c) & 0xff) == 0x8f)
  69 #define isunprintableeuckanji(c) \
  70         (((c) & 0xff) >= 0xa9 && ((c) & 0xff) <= 0xaf)
  71 #define issjiskanji1(c) \
  72         ((((c) & 0xff) >= 0x81 && ((c) & 0xff) <= 0x9f) || \
  73          (((c) & 0xff) >= 0xe0 && ((c) & 0xff) <= 0xfc))
  74 #define issjiskanji2(c) \
  75         ((((c) & 0xff) >= 0x40 && ((c) & 0xff) <= 0x7e) || \
  76          (((c) & 0xff) >= 0x80 && ((c) & 0xff) <= 0xfc))
  77 #define issjishwkana(c) \
  78         (((c) & 0xff) >= 0xa1 && ((c) & 0xff) <= 0xdf)
  79
  80 #define K_IN()                          \
  81         if (state != JIS_KANJI) {       \
  82                 *out++ = ESC;           \
  83                 *out++ = '$';           \
  84                 *out++ = 'B';           \
  85                 state = JIS_KANJI;      \
  86         }
  87
  88 #define K_OUT()                         \
  89         if (state != JIS_ASCII) {       \
  90                 *out++ = ESC;           \
  91                 *out++ = '(';           \
  92                 *out++ = 'B';           \
  93                 state = JIS_ASCII;      \
  94         }
  95
  96 #define HW_IN()                         \
  97         if (state != JIS_HWKANA) {      \
  98                 *out++ = ESC;           \
  99                 *out++ = '(';           \
 100                 *out++ = 'I';           \
 101                 state = JIS_HWKANA;     \
 102         }
 103
 104 #define AUX_IN()                        \
 105         if (state != JIS_AUXKANJI) {    \
 106                 *out++ = ESC;           \
 107                 *out++ = '$';           \
 108                 *out++ = '(';           \
 109                 *out++ = 'D';           \
 110                 state = JIS_AUXKANJI;   \
 111         }
 112
 113 void conv_jistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
 114 {
 115         const guchar *in = inbuf;
 116         guchar *out = outbuf;
 117         JISState state = JIS_ASCII;
 118
 119         while (*in != '\0') {
 120                 if (*in == ESC) {
 121                         in++;
 122                         if (*in == '$') {
 123                                 if (*(in + 1) == '@' || *(in + 1) == 'B') {
 124                                         state = JIS_KANJI;
 125                                         in += 2;
 126                                 } else if (*(in + 1) == '(' &&
 127                                            *(in + 2) == 'D') {
 128                                         state = JIS_AUXKANJI;
 129                                         in += 3;
 130                                 } else {
 131                                         /* unknown escape sequence */
 132                                         state = JIS_ASCII;
 133                                 }
 134                         } else if (*in == '(') {
 135                                 if (*(in + 1) == 'B' || *(in + 1) == 'J') {
 136                                         state = JIS_ASCII;
 137                                         in += 2;
 138                                 } else if (*(in + 1) == 'I') {
 139                                         state = JIS_HWKANA;
 140                                         in += 2;
 141                                 } else {
 142                                         /* unknown escape sequence */
 143                                         state = JIS_ASCII;
 144                                 }
 145                         } else {
 146                                 /* unknown escape sequence */
 147                                 state = JIS_ASCII;
 148                         }
 149                 } else if (*in == 0x0e) {
 150                         state = JIS_HWKANA;
 151                         in++;
 152                 } else if (*in == 0x0f) {
 153                         state = JIS_ASCII;
 154                         in++;
 155                 } else {
 156                         switch (state) {
 157                         case JIS_ASCII:
 158                                 *out++ = *in++;
 159                                 break;
 160                         case JIS_KANJI:
 161                                 *out++ = *in++ | 0x80;
 162                                 if (*in == '\0') break;
 163                                 *out++ = *in++ | 0x80;
 164                                 break;
 165                         case JIS_HWKANA:
 166                                 *out++ = 0x8e;
 167                                 *out++ = *in++ | 0x80;
 168                                 break;
 169                         case JIS_AUXKANJI:
 170                                 *out++ = 0x8f;
 171                                 *out++ = *in++ | 0x80;
 172                                 if (*in == '\0') break;
 173                                 *out++ = *in++ | 0x80;
 174                                 break;
 175                         }
 176                 }
 177         }
 178
 179         *out = '\0';
 180 }
 181
 182 void conv_euctojis(gchar *outbuf, gint outlen, const gchar *inbuf)
 183 {
 184         const guchar *in = inbuf;
 185         guchar *out = outbuf;
 186         JISState state = JIS_ASCII;
 187
 188         while (*in != '\0') {
 189                 if (isascii(*in)) {
 190                         K_OUT();
 191                         *out++ = *in++;
 192                 } else if (iseuckanji(*in)) {
 193                         if (iseuckanji(*(in + 1))) {
 194                                 K_IN();
 195                                 *out++ = *in++ & 0x7f;
 196                                 *out++ = *in++ & 0x7f;
 197                         } else {
 198                                 K_OUT();
 199                                 *out++ = SUBST_CHAR;
 200                                 in++;
 201                                 if (*in != '\0' && !isascii(*in)) {
 202                                         *out++ = SUBST_CHAR;
 203                                         in++;
 204                                 }
 205                         }
 206                 } else if (iseuchwkana1(*in)) {
 207                         in++;
 208                         if (iseuchwkana2(*in)) {
 209                                 HW_IN();
 210                                 *out++ = *in++ & 0x7f;
 211                         } else {
 212                                 K_OUT();
 213                                 if (*in != '\0' && !isascii(*in)) {
 214                                         *out++ = SUBST_CHAR;
 215                                         in++;
 216                                 }
 217                         }
 218                 } else if (iseucaux(*in)) {
 219                         in++;
 220                         if (iseuckanji(*in) && iseuckanji(*(in + 1))) {
 221                                 AUX_IN();
 222                                 *out++ = *in++ & 0x7f;
 223                                 *out++ = *in++ & 0x7f;
 224                         } else {
 225                                 K_OUT();
 226                                 if (*in != '\0' && !isascii(*in)) {
 227                                         *out++ = SUBST_CHAR;
 228                                         in++;
 229                                         if (*in != '\0' && !isascii(*in)) {
 230                                                 *out++ = SUBST_CHAR;
 231                                                 in++;
 232                                         }
 233                                 }
 234                         }
 235                 } else {
 236                         K_OUT();
 237                         *out++ = SUBST_CHAR;
 238                         in++;
 239                 }
 240         }
 241
 242         K_OUT();
 243         *out = '\0';
 244 }
 245
 246 void conv_sjistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
 247 {
 248         const guchar *in = inbuf;
 249         guchar *out = outbuf;
 250
 251         while (*in != '\0') {
 252                 if (isascii(*in)) {
 253                         *out++ = *in++;
 254                 } else if (issjiskanji1(*in)) {
 255                         if (issjiskanji2(*(in + 1))) {
 256                                 guchar out1 = *in;
 257                                 guchar out2 = *(in + 1);
 258                                 guchar row;
 259
 260                                 row = out1 < 0xa0 ? 0x70 : 0xb0;
 261                                 if (out2 < 0x9f) {
 262                                         out1 = (out1 - row) * 2 - 1;
 263                                         out2 -= out2 > 0x7f ? 0x20 : 0x1f;
 264                                 } else {
 265                                         out1 = (out1 - row) * 2;
 266                                         out2 -= 0x7e;
 267                                 }
 268
 269                                 *out++ = out1 | 0x80;
 270                                 *out++ = out2 | 0x80;
 271                                 in += 2;
 272                         } else {
 273                                 *out++ = SUBST_CHAR;
 274                                 in++;
 275                                 if (*in != '\0' && !isascii(*in)) {
 276                                         *out++ = SUBST_CHAR;
 277                                         in++;
 278                                 }
 279                         }
 280                 } else if (issjishwkana(*in)) {
 281                         *out++ = 0x8e;
 282                         *out++ = *in++;
 283                 } else {
 284                         *out++ = SUBST_CHAR;
 285                         in++;
 286                 }
 287         }
 288
 289         *out = '\0';
 290 }
 291
 292 void conv_anytoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
 293 {
 294         switch (conv_guess_encoding(inbuf)) {
 295         case C_ISO_2022_JP:
 296                 conv_jistoeuc(outbuf, outlen, inbuf);
 297                 break;
 298         case C_SHIFT_JIS:
 299                 conv_sjistoeuc(outbuf, outlen, inbuf);
 300                 break;
 301         default:
 302                 strncpy2(outbuf, inbuf, outlen);
 303                 break;
 304         }
 305 }
 306
 307 void conv_anytojis(gchar *outbuf, gint outlen, const gchar *inbuf)
 308 {
 309         switch (conv_guess_encoding(inbuf)) {
 310         case C_EUC_JP:
 311                 conv_euctojis(outbuf, outlen, inbuf);
 312                 break;
 313         default:
 314                 strncpy2(outbuf, inbuf, outlen);
 315                 break;
 316         }
 317 }
 318
 319 void conv_unreadable_eucjp(gchar *str)
 320 {
 321         register guchar *p = str;
 322
 323         while (*p != '\0') {
 324                 if (isascii(*p)) {
 325                         /* convert CR+LF -> LF */
 326                         if (*p == '\r' && *(p + 1) == '\n')
 327                                 memmove(p, p + 1, strlen(p));
 328                         /* printable 7 bit code */
 329                         p++;
 330                 } else if (iseuckanji(*p)) {
 331                         if (iseuckanji(*(p + 1)) && !isunprintableeuckanji(*p))
 332                                 /* printable euc-jp code */
 333                                 p += 2;
 334                         else {
 335                                 /* substitute unprintable code */
 336                                 *p++ = SUBST_CHAR;
 337                                 if (*p != '\0') {
 338                                         if (isascii(*p))
 339                                                 p++;
 340                                         else
 341                                                 *p++ = SUBST_CHAR;
 342                                 }
 343                         }
 344                 } else if (iseuchwkana1(*p)) {
 345                         if (iseuchwkana2(*(p + 1)))
 346                                 /* euc-jp hankaku kana */
 347                                 p += 2;
 348                         else
 349                                 *p++ = SUBST_CHAR;
 350                 } else if (iseucaux(*p)) {
 351                         if (iseuckanji(*(p + 1)) && iseuckanji(*(p + 2))) {
 352                                 /* auxiliary kanji */
 353                                 p += 3;
 354                         } else
 355                                 *p++ = SUBST_CHAR;
 356                 } else
 357                         /* substitute unprintable 1 byte code */
 358                         *p++ = SUBST_CHAR;
 359         }
 360 }
 361
 362 void conv_unreadable_8bit(gchar *str)
 363 {
 364         register guchar *p = str;
 365
 366         while (*p != '\0') {
 367                 /* convert CR+LF -> LF */
 368                 if (*p == '\r' && *(p + 1) == '\n')
 369                         memmove(p, p + 1, strlen(p));
 370                 else if (!isascii(*p)) *p = SUBST_CHAR;
 371                 p++;
 372         }
 373 }
 374
 375 void conv_unreadable_latin(gchar *str)
 376 {
 377         register guchar *p = str;
 378
 379         while (*p != '\0') {
 380                 /* convert CR+LF -> LF */
 381                 if (*p == '\r' && *(p + 1) == '\n')
 382                         memmove(p, p + 1, strlen(p));
 383                 else if ((*p & 0xff) >= 0x80 && (*p & 0xff) <= 0x9f)
 384                         *p = SUBST_CHAR;
 385                 p++;
 386         }
 387 }
 388
 389 #define NCV     '\0'
 390
 391 void conv_mb_alnum(gchar *str)
 392 {
 393         static guchar char_tbl[] = {
 394                 /* 0xa0 - 0xaf */
 395                 NCV, ' ', NCV, NCV, ',', '.', NCV, ':',
 396                 ';', '?', '!', NCV, NCV, NCV, NCV, NCV,
 397                 /* 0xb0 - 0xbf */
 398                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
 399                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
 400                 /* 0xc0 - 0xcf */
 401                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
 402                 NCV, NCV, '(', ')', NCV, NCV, '[', ']',
 403                 /* 0xd0 - 0xdf */
 404                 '{', '}', NCV, NCV, NCV, NCV, NCV, NCV,
 405                 NCV, NCV, NCV, NCV, '+', '-', NCV, NCV,
 406                 /* 0xe0 - 0xef */
 407                 NCV, '=', NCV, '<', '>', NCV, NCV, NCV,
 408                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV
 409         };
 410
 411         register guchar *p = str;
 412         register gint len;
 413
 414         len = strlen(str);
 415
 416         while (len > 1) {
 417                 if (*p == 0xa3) {
 418                         register guchar ch = *(p + 1);
 419
 420                         if (ch >= 0xb0 && ch <= 0xfa) {
 421                                 /* [a-zA-Z] */
 422                                 *p = ch & 0x7f;
 423                                 p++;
 424                                 len--;
 425                                 memmove(p, p + 1, len);
 426                                 len--;
 427                         } else  {
 428                                 p += 2;
 429                                 len -= 2;
 430                         }
 431                 } else if (*p == 0xa1) {
 432                         register guchar ch = *(p + 1);
 433
 434                         if (ch >= 0xa0 && ch <= 0xef &&
 435                             NCV != char_tbl[ch - 0xa0]) {
 436                                 *p = char_tbl[ch - 0xa0];
 437                                 p++;
 438                                 len--;
 439                                 memmove(p, p + 1, len);
 440                                 len--;
 441                         } else {
 442                                 p += 2;
 443                                 len -= 2;
 444                         }
 445                 } else if (iseuckanji(*p)) {
 446                         p += 2;
 447                         len -= 2;
 448                 } else {
 449                         p++;
 450                         len--;
 451                 }
 452         }
 453 }
 454
 455 CharSet conv_guess_encoding(const gchar *str)
 456 {
 457         const guchar *p = str;
 458         CharSet guessed = C_US_ASCII;
 459
 460         while (*p != '\0') {
 461                 if (*p == ESC && (*(p + 1) == '$' || *(p + 1) == '(')) {
 462                         if (guessed == C_US_ASCII)
 463                                 return C_ISO_2022_JP;
 464                         p += 2;
 465                 } else if (isascii(*p)) {
 466                         p++;
 467                 } else if (iseuckanji(*p) && iseuckanji(*(p + 1))) {
 468                         if (*p >= 0xfd && *p <= 0xfe)
 469                                 return C_EUC_JP;
 470                         else if (guessed == C_SHIFT_JIS) {
 471                                 if ((issjiskanji1(*p) &&
 472                                      issjiskanji2(*(p + 1))) ||
 473                                     issjishwkana(*p))
 474                                         guessed = C_SHIFT_JIS;
 475                                 else
 476                                         guessed = C_EUC_JP;
 477                         } else
 478                                 guessed = C_EUC_JP;
 479                         p += 2;
 480                 } else if (issjiskanji1(*p) && issjiskanji2(*(p + 1))) {
 481                         if (iseuchwkana1(*p) && iseuchwkana2(*(p + 1)))
 482                                 guessed = C_SHIFT_JIS;
 483                         else
 484                                 return C_SHIFT_JIS;
 485                         p += 2;
 486                 } else if (issjishwkana(*p)) {
 487                         guessed = C_SHIFT_JIS;
 488                         p++;
 489                 } else {
 490                         p++;
 491                 }
 492         }
 493
 494         return guessed;
 495 }
 496
 497 void conv_jistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 498 {
             /* ISO-2022-JP -> display: convert inbuf to EUC-JP in outbuf
                (outlen is forwarded to conv_jistoeuc()), then let
                conv_unreadable_eucjp() clean up outbuf in place. */
 499         conv_jistoeuc(outbuf, outlen, inbuf);
 500         conv_unreadable_eucjp(outbuf);
 501 }
 502
 503 void conv_sjistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 504 {
             /* Shift_JIS -> display: convert inbuf to EUC-JP in outbuf
                (outlen is forwarded to conv_sjistoeuc()), then let
                conv_unreadable_eucjp() clean up outbuf in place. */
 505         conv_sjistoeuc(outbuf, outlen, inbuf);
 506         conv_unreadable_eucjp(outbuf);
 507 }
 508
 509 void conv_euctodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 510 {
             /* EUC-JP -> display: copy inbuf to outbuf with strncpy2()
                (presumably a bounded, NUL-terminating copy - see utils.h),
                then let conv_unreadable_eucjp() clean up outbuf in place. */
 511         strncpy2(outbuf, inbuf, outlen);
 512         conv_unreadable_eucjp(outbuf);
 513 }
 514
 515 void conv_anytodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 516 {
             /* unknown Japanese encoding -> display: conv_anytoeuc()
                guesses the encoding of inbuf and stores EUC-JP in outbuf,
                then conv_unreadable_eucjp() cleans up outbuf in place. */
 517         conv_anytoeuc(outbuf, outlen, inbuf);
 518         conv_unreadable_eucjp(outbuf);
 519 }
 520
 521 void conv_ustodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 522 {
             /* US-ASCII -> display: copy inbuf to outbuf with strncpy2(),
                then let conv_unreadable_8bit() replace every non-ASCII
                byte in outbuf. */
 523         strncpy2(outbuf, inbuf, outlen);
 524         conv_unreadable_8bit(outbuf);
 525 }
 526
 527 void conv_latintodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 528 {
             /* 8-bit Latin text -> display: copy inbuf to outbuf with
                strncpy2(), then let conv_unreadable_latin() replace the
                bytes 0x80 - 0x9f in outbuf. */
 529         strncpy2(outbuf, inbuf, outlen);
 530         conv_unreadable_latin(outbuf);
 531 }
 532
 533 void conv_noconv(gchar *outbuf, gint outlen, const gchar *inbuf)
 534 {
             /* Identity "conversion": only copies inbuf to outbuf.  Its
                address is also compared against in this file to detect
                that no converter applies. */
 535         strncpy2(outbuf, inbuf, outlen);
 536 }
 537
 538 CodeConverter *conv_code_converter_new(const gchar *charset)
 539 {
 540         CodeConverter *conv;
 541
 542         conv = g_new0(CodeConverter, 1);
 543 #if !HAVE_LIBJCONV
 544         conv->code_conv_func = conv_get_code_conv_func(charset);
 545 #endif
 546         conv->charset_str = g_strdup(charset);
 547         conv->charset = conv_get_charset_from_str(charset);
 548
 549         return conv;
 550 }
 551
 552 void conv_code_converter_destroy(CodeConverter *conv)
 553 {
 554         g_free(conv->charset_str);
 555         g_free(conv);
 556 }
 557
 558 gint conv_convert(CodeConverter *conv, gchar *outbuf, gint outlen,
 559                   const gchar *inbuf)
 560 {
 561 #if HAVE_LIBJCONV
 562         gchar *str;
 563
 564         str = conv_codeset_strdup(inbuf, conv->charset_str, NULL);
 565         if (!str)
 566                 return -1;
 567         else {
 568                 strncpy2(outbuf, str, outlen);
 569                 g_free(str);
 570         }
 571 #else /* !HAVE_LIBJCONV */
 572         conv->code_conv_func(outbuf, outlen, inbuf);
 573 #endif
 574
 575         return 0;
 576 }
 577
 578 gchar *conv_codeset_strdup(const gchar *inbuf,
 579                            const gchar *src_codeset, const gchar *dest_codeset)
 580 {
 581         gchar *buf;
 582         size_t len;
 583 #if HAVE_LIBJCONV
 584         gint actual_codeset;
 585         const gchar *const *codesets;
 586         gint n_codesets;
 587 #else /* !HAVE_LIBJCONV */
 588         CharSet src_charset = C_AUTO, dest_charset = C_AUTO;
 589 #endif
 590
 591         if (!dest_codeset) {
 592                 CodeConvFunc func;
 593
 594                 func = conv_get_code_conv_func(src_codeset);
 595                 if (func != conv_noconv) {
 596                         if (func == conv_jistodisp ||
 597                             func == conv_sjistodisp ||
 598                             func == conv_anytodisp)
 599                                 len = strlen(inbuf) * 2 + 1;
 600                         else
 601                                 len = strlen(inbuf) + 1;
 602                         buf = g_malloc(len);
 603                         if (!buf) return NULL;
 604                         func(buf, len, inbuf);
 605                         buf = g_realloc(buf, strlen(buf) + 1);
 606                         return buf;
 607                 }
 608         }
 609
 610         /* don't convert if src and dest codeset are identical */
 611         if (src_codeset && dest_codeset &&
 612             !strcasecmp(src_codeset, dest_codeset))
 613                 return g_strdup(inbuf);
 614
 615 #if HAVE_LIBJCONV
 616         if (src_codeset) {
 617                 codesets = &src_codeset;
 618                 n_codesets = 1;
 619         } else
 620                 codesets = jconv_info_get_pref_codesets(&n_codesets);
 621         if (!dest_codeset) {
 622                 dest_codeset = conv_get_current_charset_str();
 623                 /* don't convert if current codeset is US-ASCII */
 624                 if (!strcasecmp(dest_codeset, CS_US_ASCII))
 625                         return g_strdup(inbuf);
 626         }
 627
 628         if (jconv_alloc_conv(inbuf, strlen(inbuf), &buf, &len,
 629                              codesets, n_codesets,
 630                              &actual_codeset, dest_codeset)
 631             == 0)
 632                 return buf;
 633         else {
 634 #if 0
 635                 g_warning("code conversion from %s to %s failed\n",
 636                           codesets && codesets[0] ? codesets[0] : "(unknown)",
 637                           dest_codeset);
 638 #endif /* 0 */
 639                 return NULL;
 640         }
 641 #else /* !HAVE_LIBJCONV */
 642         if (src_codeset) {
 643                 if (!strcasecmp(src_codeset, CS_EUC_JP) ||
 644                     !strcasecmp(src_codeset, CS_EUCJP))
 645                         src_charset = C_EUC_JP;
 646                 else if (!strcasecmp(src_codeset, CS_SHIFT_JIS) ||
 647                          !strcasecmp(src_codeset, "SHIFT-JIS") ||
 648                          !strcasecmp(src_codeset, "SJIS"))
 649                         src_charset = C_SHIFT_JIS;
 650                 if (dest_codeset && !strcasecmp(dest_codeset, CS_ISO_2022_JP))
 651                         dest_charset = C_ISO_2022_JP;
 652         }
 653
 654         if ((src_charset == C_EUC_JP || src_charset == C_SHIFT_JIS) &&
 655             dest_charset == C_ISO_2022_JP) {
 656                 len = (strlen(inbuf) + 1) * 3;
 657                 buf = g_malloc(len);
 658                 if (buf) {
 659                         if (src_charset == C_EUC_JP)
 660                                 conv_euctojis(buf, len, inbuf);
 661                         else
 662                                 conv_anytojis(buf, len, inbuf);
 663                         buf = g_realloc(buf, strlen(buf) + 1);
 664                 }
 665         } else
 666                 buf = g_strdup(inbuf);
 667
 668         return buf;
 669 #endif /* !HAVE_LIBJCONV */
 670 }
 671
 672 CodeConvFunc conv_get_code_conv_func(const gchar *charset)
 673 {
 674         CodeConvFunc code_conv;
 675         CharSet cur_charset;
 676
 677         if (!charset) {
 678                 cur_charset = conv_get_current_charset();
 679                 if (cur_charset == C_EUC_JP || cur_charset == C_SHIFT_JIS)
 680                         return conv_anytodisp;
 681                 else
 682                         return conv_noconv;
 683         }
 684
 685         if (!strcasecmp(charset, CS_ISO_2022_JP) ||
 686             !strcasecmp(charset, CS_ISO_2022_JP_2))
 687                 code_conv = conv_jistodisp;
 688         else if (!strcasecmp(charset, CS_US_ASCII))
 689                 code_conv = conv_ustodisp;
 690         else if (!strncasecmp(charset, CS_ISO_8859_1, 10))
 691                 code_conv = conv_latintodisp;
 692 #if !HAVE_LIBJCONV
 693         else if (!strncasecmp(charset, "ISO-8859-", 9))
 694                 code_conv = conv_latintodisp;
 695 #endif
 696         else if (!strcasecmp(charset, CS_SHIFT_JIS) ||
 697                  !strcasecmp(charset, "SHIFT-JIS")  ||
 698                  !strcasecmp(charset, "SJIS")       ||
 699                  !strcasecmp(charset, "X-SJIS"))
 700                 code_conv = conv_sjistodisp;
 701         else if (!strcasecmp(charset, CS_EUC_JP) ||
 702                  !strcasecmp(charset, CS_EUCJP))
 703                 code_conv = conv_euctodisp;
 704         else
 705                 code_conv = conv_noconv;
 706
 707         return code_conv;
 708 }
 709
 710 static const struct {
     /*
      * Table pairing each CharSet value with a charset name.
      * conv_get_charset_str() returns the name of the first entry whose
      * CharSet matches, so where a CharSet is listed more than once
      * (C_US_ASCII, C_EUC_JP) the first name is the one it reports.
      * conv_get_charset_from_str() compares names case-insensitively.
      */
 711         CharSet charset;
 712         gchar *const name;
 713 } charsets[] = {
 714         {C_US_ASCII,            CS_US_ASCII},
 715         {C_US_ASCII,            CS_ANSI_X3_4_1968},
 716         {C_UTF_8,               CS_UTF_8},
 717         {C_ISO_8859_1,          CS_ISO_8859_1},
 718         {C_ISO_8859_2,          CS_ISO_8859_2},
 719         {C_ISO_8859_4,          CS_ISO_8859_4},
 720         {C_ISO_8859_5,          CS_ISO_8859_5},
 721         {C_ISO_8859_7,          CS_ISO_8859_7},
 722         {C_ISO_8859_8,          CS_ISO_8859_8},
 723         {C_ISO_8859_9,          CS_ISO_8859_9},
 724         {C_ISO_8859_11,         CS_ISO_8859_11},
 725         {C_ISO_8859_13,         CS_ISO_8859_13},
 726         {C_ISO_8859_15,         CS_ISO_8859_15},
 727         {C_BALTIC,              CS_BALTIC},
 728         {C_CP1251,              CS_CP1251},
 729         {C_WINDOWS_1251,        CS_WINDOWS_1251},
 730         {C_KOI8_R,              CS_KOI8_R},
 731         {C_KOI8_U,              CS_KOI8_U},
 732         {C_ISO_2022_JP,         CS_ISO_2022_JP},
 733         {C_ISO_2022_JP_2,       CS_ISO_2022_JP_2},
 734         {C_EUC_JP,              CS_EUC_JP},
 735         {C_EUC_JP,              CS_EUCJP},
 736         {C_SHIFT_JIS,           CS_SHIFT_JIS},
 737         {C_ISO_2022_KR,         CS_ISO_2022_KR},
 738         {C_EUC_KR,              CS_EUC_KR},
 739         {C_ISO_2022_CN,         CS_ISO_2022_CN},
 740         {C_EUC_CN,              CS_EUC_CN},
 741         {C_GB2312,              CS_GB2312},
 742         {C_EUC_TW,              CS_EUC_TW},
 743         {C_BIG5,                CS_BIG5},
 744         {C_TIS_620,             CS_TIS_620},
 745         {C_WINDOWS_874,         CS_WINDOWS_874},
 746 };
 747
 748 #if !HAVE_LIBJCONV
     /*
      * Locale name -> charset table, compiled only when libjconv is not
      * available.  For several languages the entries with an explicit
      * codeset suffix ("ja_JP.eucJP", "zh_TW.Big5", ...) are listed
      * before the bare locale name.
      * NOTE(review): the code reading this table is outside this part of
      * the file; `charset' is presumably the encoding used by the locale
      * and `out_charset' the one preferred for outgoing text - confirm
      * against the lookup functions.
      */
 749 static const struct {
 750         gchar *const locale;
 751         CharSet charset;
 752         CharSet out_charset;
 753 } locale_table[] = {
 754         {"ja_JP.eucJP"  , C_EUC_JP      , C_ISO_2022_JP},
 755         {"ja_JP.ujis"   , C_EUC_JP      , C_ISO_2022_JP},
 756         {"ja_JP.EUC"    , C_EUC_JP      , C_ISO_2022_JP},
 757         {"ja_JP.SJIS"   , C_SHIFT_JIS   , C_ISO_2022_JP},
 758         {"ja_JP.JIS"    , C_ISO_2022_JP , C_ISO_2022_JP},
 759         {"ja_JP"        , C_EUC_JP      , C_ISO_2022_JP},
 760         {"ko_KR"        , C_EUC_KR      , C_EUC_KR},
 761         {"zh_CN.GB2312" , C_GB2312      , C_GB2312},
 762         {"zh_CN"        , C_GB2312      , C_GB2312},
 763         {"zh_TW.eucTW"  , C_EUC_TW      , C_BIG5},
 764         {"zh_TW.Big5"   , C_BIG5        , C_BIG5},
 765         {"zh_TW"        , C_BIG5        , C_BIG5},
 766
 767         {"ru_RU.KOI8-R" , C_KOI8_R      , C_ISO_8859_5},
 768         {"ru_RU.CP1251" , C_WINDOWS_1251, C_ISO_8859_5},
 769
 770         {"bg_BG"        , C_WINDOWS_1251, C_WINDOWS_1251},
 771
 772         {"en_US"        , C_ISO_8859_1  , C_ISO_8859_1},
 773         {"ca_ES"        , C_ISO_8859_1  , C_ISO_8859_1},
 774         {"da_DK"        , C_ISO_8859_1  , C_ISO_8859_1},
 775         {"de_DE"        , C_ISO_8859_1  , C_ISO_8859_1},
 776         {"nl_NL"        , C_ISO_8859_1  , C_ISO_8859_1},
 777         {"et_EE"        , C_ISO_8859_1  , C_ISO_8859_1},
 778         {"fi_FI"        , C_ISO_8859_1  , C_ISO_8859_1},
 779         {"fr_FR"        , C_ISO_8859_1  , C_ISO_8859_1},
 780         {"is_IS"        , C_ISO_8859_1  , C_ISO_8859_1},
 781         {"it_IT"        , C_ISO_8859_1  , C_ISO_8859_1},
 782         {"no_NO"        , C_ISO_8859_1  , C_ISO_8859_1},
 783         {"pt_PT"        , C_ISO_8859_1  , C_ISO_8859_1},
 784         {"pt_BR"        , C_ISO_8859_1  , C_ISO_8859_1},
 785         {"es_ES"        , C_ISO_8859_1  , C_ISO_8859_1},
 786         {"sv_SE"        , C_ISO_8859_1  , C_ISO_8859_1},
 787
 788         {"hr_HR"        , C_ISO_8859_2  , C_ISO_8859_2},
 789         {"hu_HU"        , C_ISO_8859_2  , C_ISO_8859_2},
 790         {"pl_PL"        , C_ISO_8859_2  , C_ISO_8859_2},
 791         {"ro_RO"        , C_ISO_8859_2  , C_ISO_8859_2},
 792         {"sk_SK"        , C_ISO_8859_2  , C_ISO_8859_2},
 793         {"sl_SI"        , C_ISO_8859_2  , C_ISO_8859_2},
 794         {"ru_RU"        , C_ISO_8859_5  , C_ISO_8859_5},
 795         {"el_GR"        , C_ISO_8859_7  , C_ISO_8859_7},
 796         {"iw_IL"        , C_ISO_8859_8  , C_ISO_8859_8},
 797         {"tr_TR"        , C_ISO_8859_9  , C_ISO_8859_9},
 798
 799         {"th_TH"        , C_TIS_620     , C_TIS_620},
 800         /* {"th_TH"     , C_WINDOWS_874}, */
 801         /* {"th_TH"     , C_ISO_8859_11}, */
 802
 803         {"lt_LT.iso88594"       , C_ISO_8859_4  , C_ISO_8859_4},
 804         {"lt_LT.ISO8859-4"      , C_ISO_8859_4  , C_ISO_8859_4},
 805         {"lt_LT.ISO_8859-4"     , C_ISO_8859_4  , C_ISO_8859_4},
 806         {"lt_LT"                , C_ISO_8859_13 , C_ISO_8859_13},
 807         {"lv_LV"                , C_ISO_8859_13 , C_ISO_8859_13},
 808
 809         {"C"                    , C_US_ASCII    , C_US_ASCII},
 810         {"POSIX"                , C_US_ASCII    , C_US_ASCII},
 811         {"ANSI_X3.4-1968"       , C_US_ASCII    , C_US_ASCII},
 812 };
 813 #endif /* !HAVE_LIBJCONV */
 814
 815 const gchar *conv_get_charset_str(CharSet charset)
 816 {
 817         gint i;
 818
 819         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
 820                 if (charsets[i].charset == charset)
 821                         return charsets[i].name;
 822         }
 823
 824         return NULL;
 825 }
 826
 827 CharSet conv_get_charset_from_str(const gchar *charset)
 828 {
 829         gint i;
 830
 831         if (!charset) return C_AUTO;
 832
 833         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
 834                 if (!strcasecmp(charsets[i].name, charset))
 835                         return charsets[i].charset;
 836         }
 837
 838         return C_AUTO;
 839 }
 840
 841 CharSet conv_get_current_charset(void)
 842 {
 843         static CharSet cur_charset = -1;
 844         gint i;
 845
 846 #if HAVE_LIBJCONV
 847         const gchar *cur_codeset;
 848 #else
 849         const gchar *cur_locale;
 850 #endif
 851
 852         if (cur_charset != -1)
 853                 return cur_charset;
 854
 855 #if HAVE_LIBJCONV
 856         cur_codeset = jconv_info_get_current_codeset();
 857         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
 858                 if (!strcasecmp(cur_codeset, charsets[i].name)) {
 859                         cur_charset = charsets[i].charset;
 860                         return cur_charset;
 861                 }
 862         }
 863 #else
 864         cur_locale = conv_get_current_locale();
 865         if (!cur_locale) {
 866                 cur_charset = C_US_ASCII;
 867                 return cur_charset;
 868         }
 869
 870         if (strcasestr(cur_locale, "UTF-8")) {
 871                 cur_charset = C_UTF_8;
 872                 return cur_charset;
 873         }
 874
 875         for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
 876                 const gchar *p;
 877
 878                 /* "ja_JP.EUC" matches with "ja_JP.eucJP" and "ja_JP.EUC" */
 879                 /* "ja_JP" matches with "ja_JP.xxxx" and "ja" */
 880                 if (!strncasecmp(cur_locale, locale_table[i].locale,
 881                                  strlen(locale_table[i].locale))) {
 882                         cur_charset = locale_table[i].charset;
 883                         return cur_charset;
 884                 } else if ((p = strchr(locale_table[i].locale, '_')) &&
 885                          !strchr(p + 1, '.')) {
 886                         if (strlen(cur_locale) == 2 &&
 887                             !strncasecmp(cur_locale, locale_table[i].locale, 2)) {
 888                                 cur_charset = locale_table[i].charset;
 889                                 return cur_charset;
 890                         }
 891                 }
 892         }
 893 #endif
 894
 895         cur_charset = C_AUTO;
 896         return cur_charset;
 897 }
 898
 899 const gchar *conv_get_current_charset_str(void)
 900 {
 901         static const gchar *codeset = NULL;
 902
 903         if (!codeset)
 904                 codeset = conv_get_charset_str(conv_get_current_charset());
 905
 906         return codeset ? codeset : "US-ASCII";
 907 }
 908
 909 CharSet conv_get_outgoing_charset(void)
 910 {
 911         static CharSet out_charset = -1;
 912         gint i;
 913
 914 #if HAVE_LIBJCONV
 915         gint j, n_pref_codesets;
 916         const gchar *const *pref_codesets;
 917 #else
 918         const gchar *cur_locale;
 919 #endif
 920
 921         if (out_charset != -1)
 922                 return out_charset;
 923
 924 #if HAVE_LIBJCONV
 925         /* skip US-ASCII and UTF-8 */
 926         pref_codesets = jconv_info_get_pref_codesets(&n_pref_codesets);
 927         for (i = 0; i < n_pref_codesets; i++) {
 928                 for (j = 3; j < sizeof(charsets) / sizeof(charsets[0]); j++) {
 929                         if (!strcasecmp(pref_codesets[i], charsets[j].name)) {
 930                                 out_charset = charsets[j].charset;
 931                                 return out_charset;
 932                         }
 933                 }
 934         }
 935
 936         for (i = 0; i < n_pref_codesets; i++) {
 937                 if (!strcasecmp(pref_codesets[i], "UTF-8")) {
 938                         out_charset = C_UTF_8;
 939                         return out_charset;
 940                 }
 941         }
 942
 943         out_charset = C_AUTO;
 944 #else
 945         cur_locale = conv_get_current_locale();
 946         if (!cur_locale) {
 947                 out_charset = C_AUTO;
 948                 return out_charset;
 949         }
 950
 951         for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
 952                 const gchar *p;
 953
 954                 if (!strncasecmp(cur_locale, locale_table[i].locale,
 955                                  strlen(locale_table[i].locale))) {
 956                         out_charset = locale_table[i].out_charset;
 957                         break;
 958                 } else if ((p = strchr(locale_table[i].locale, '_')) &&
 959                          !strchr(p + 1, '.')) {
 960                         if (strlen(cur_locale) == 2 &&
 961                             !strncasecmp(cur_locale, locale_table[i].locale, 2)) {
 962                                 out_charset = locale_table[i].out_charset;
 963                                 break;
 964                         }
 965                 }
 966         }
 967
 968         /* encoding conversion without libjconv is only supported
 969            on Japanese locale for now */
 970         if (out_charset == C_ISO_2022_JP)
 971                 return out_charset;
 972
 973         out_charset = conv_get_current_charset();
 974 #endif
 975
 976         return out_charset;
 977 }
 978
 979 const gchar *conv_get_outgoing_charset_str(void)
 980 {
 981         CharSet out_charset;
 982         const gchar *str;
 983
 984         if (prefs_common.outgoing_charset) {
 985                 if (!isalpha(prefs_common.outgoing_charset[0])) {
 986                         g_free(prefs_common.outgoing_charset);
 987                         prefs_common.outgoing_charset = g_strdup(CS_AUTO);
 988                 } else if (strcmp(prefs_common.outgoing_charset, CS_AUTO) != 0)
 989                         return prefs_common.outgoing_charset;
 990         }
 991
 992         out_charset = conv_get_outgoing_charset();
 993         str = conv_get_charset_str(out_charset);
 994
 995         return str ? str : "US-ASCII";
 996 }
 997
 998 const gchar *conv_get_current_locale(void)
 999 {
1000         gchar *cur_locale;
1001
1002         cur_locale = g_getenv("LC_ALL");
1003         if (!cur_locale) cur_locale = g_getenv("LC_CTYPE");
1004         if (!cur_locale) cur_locale = g_getenv("LANG");
1005         if (!cur_locale) cur_locale = setlocale(LC_CTYPE, NULL);
1006
1007         debug_print("current locale: %s\n",
1008                     cur_locale ? cur_locale : "(none)");
1009
1010         return cur_locale;
1011 }
1012
1013 void conv_unmime_header_overwrite(gchar *str)
1014 {
1015         gchar *buf;
1016         gint buflen;
1017         CharSet cur_charset;
1018
1019         cur_charset = conv_get_current_charset();
1020
1021         if (cur_charset == C_EUC_JP) {
1022                 buflen = strlen(str) * 2 + 1;
1023                 Xalloca(buf, buflen, return);
1024                 conv_anytodisp(buf, buflen, str);
1025                 unmime_header(str, buf);
1026         } else {
1027                 buflen = strlen(str) + 1;
1028                 Xalloca(buf, buflen, return);
1029                 unmime_header(buf, str);
1030                 strncpy2(str, buf, buflen);
1031         }
1032 }
1033
1034 void conv_unmime_header(gchar *outbuf, gint outlen, const gchar *str,
1035                         const gchar *charset)
1036 {
1037         CharSet cur_charset;
1038
1039         cur_charset = conv_get_current_charset();
1040
1041         if (cur_charset == C_EUC_JP) {
1042                 gchar *buf;
1043                 gint buflen;
1044
1045                 buflen = strlen(str) * 2 + 1;
1046                 Xalloca(buf, buflen, return);
1047                 conv_anytodisp(buf, buflen, str);
1048                 unmime_header(outbuf, buf);
1049         } else
1050                 unmime_header(outbuf, str);
1051 }
1052
/* maximum number of columns on one header line */
#define MAX_LINELEN     76
/* delimiters that open and close a MIME encoded block */
#define MIMESEP_BEGIN   "=?"
#define MIMESEP_END     "?="

/* length of the Base64 text produced for `len' input bytes
   (every started group of 3 bytes becomes 4 characters) */
#define B64LEN(len)     ((len) / 3 * 4 + ((len) % 3 ? 4 : 0))

/*
 * Helper for conv_encode_header(); it operates directly on that
 * function's variables dest, len, destp, srcp and left.
 *
 * If fewer than MAX_LINELEN + 2 bytes remain in the destination buffer,
 * the output is terminated and the enclosing function returns.
 * Otherwise, when `cond' is true, the line is folded: one whitespace
 * character is dropped (the one just written, or else the next one in
 * the source), "\n " is emitted, and the column budget `left' is reset
 * to MAX_LINELEN - 1.
 *
 * The characters handed to isspace() are cast to unsigned char because
 * the <ctype.h> functions are undefined for negative values, which
 * non-ASCII bytes produce where char is signed.
 */
#define LBREAK_IF_REQUIRED(cond)                                \
do {                                                            \
        if (len - (destp - dest) < MAX_LINELEN + 2) {           \
                *destp = '\0';                                  \
                return;                                         \
        }                                                       \
                                                                \
        if (cond) {                                             \
                if (destp > dest &&                             \
                    isspace((unsigned char)*(destp - 1)))       \
                        destp--;                                \
                else if (isspace((unsigned char)*srcp))         \
                        srcp++;                                 \
                *destp++ = '\n';                                \
                *destp++ = ' ';                                 \
                left = MAX_LINELEN - 1;                         \
        }                                                       \
} while (0)
1076
1077 void conv_encode_header(gchar *dest, gint len, const gchar *src,
1078                         gint header_len)
1079 {
1080         const gchar *cur_encoding;
1081         const gchar *out_encoding;
1082         gint mimestr_len;
1083         gchar *mimesep_enc;
1084         gint left;
1085         const gchar *srcp = src;
1086         gchar *destp = dest;
1087         gboolean use_base64;
1088
1089         if (MB_CUR_MAX > 1) {
1090                 use_base64 = TRUE;
1091                 mimesep_enc = "?B?";
1092         } else {
1093                 use_base64 = FALSE;
1094                 mimesep_enc = "?Q?";
1095         }
1096
1097         cur_encoding = conv_get_current_charset_str();
1098         out_encoding = conv_get_outgoing_charset_str();
1099         if (!strcmp(out_encoding, "US-ASCII"))
1100                 out_encoding = "ISO-8859-1";
1101
1102         mimestr_len = strlen(MIMESEP_BEGIN) + strlen(out_encoding) +
1103                 strlen(mimesep_enc) + strlen(MIMESEP_END);
1104
1105         left = MAX_LINELEN - header_len;
1106
1107         while (*srcp) {
1108                 LBREAK_IF_REQUIRED(left <= 0);
1109
1110                 while (isspace(*srcp)) {
1111                         *destp++ = *srcp++;
1112                         left--;
1113                         LBREAK_IF_REQUIRED(left <= 0);
1114                 }
1115
1116                 /* output as it is if the next word is ASCII string */
1117                 if (!is_next_nonascii(srcp)) {
1118                         gint word_len;
1119
1120                         word_len = get_next_word_len(srcp);
1121                         LBREAK_IF_REQUIRED(left < word_len);
1122                         while(*srcp && !isspace(*srcp)) {
1123                                 *destp++ = *srcp++;
1124                                 left--;
1125                                 LBREAK_IF_REQUIRED(left <= 0);
1126                         }
1127
1128                         continue;
1129                 }
1130
1131                 while (1) {
1132                         gint mb_len = 0;
1133                         gint cur_len = 0;
1134                         gchar *part_str;
1135                         gchar *out_str;
1136                         gchar *enc_str;
1137                         const gchar *p = srcp;
1138                         gint out_str_len;
1139                         gint out_enc_str_len;
1140                         gint mime_block_len;
1141                         gboolean cont = FALSE;
1142
1143                         while (*p != '\0') {
1144                                 if (isspace(*p) && !is_next_nonascii(p + 1))
1145                                         break;
1146
1147                                 mb_len = mblen(p, MB_CUR_MAX);
1148                                 if (mb_len < 0) {
1149                                         g_warning("invalid multibyte character encountered\n");
1150                                         break;
1151                                 }
1152
1153                                 Xstrndup_a(part_str, srcp, cur_len + mb_len, );
1154                                 out_str = conv_codeset_strdup
1155                                         (part_str, cur_encoding, out_encoding);
1156                                 out_str_len = strlen(out_str);
1157
1158                                 if (use_base64)
1159                                         out_enc_str_len = B64LEN(out_str_len);
1160                                 else
1161                                         out_enc_str_len =
1162                                                 qp_get_q_encoding_len(out_str);
1163
1164                                 g_free(out_str);
1165
1166                                 if (mimestr_len + out_enc_str_len <= left) {
1167                                         cur_len += mb_len;
1168                                         p += mb_len;
1169                                 } else if (cur_len == 0) {
1170                                         LBREAK_IF_REQUIRED(1);
1171                                         continue;
1172                                 } else {
1173                                         cont = TRUE;
1174                                         break;
1175                                 }
1176                         }
1177
1178                         if (cur_len > 0) {
1179                                 Xstrndup_a(part_str, srcp, cur_len, );
1180                                 out_str = conv_codeset_strdup
1181                                         (part_str, cur_encoding, out_encoding);
1182                                 out_str_len = strlen(out_str);
1183
1184                                 if (use_base64)
1185                                         out_enc_str_len = B64LEN(out_str_len);
1186                                 else
1187                                         out_enc_str_len =
1188                                                 qp_get_q_encoding_len(out_str);
1189
1190                                 Xalloca(enc_str, out_enc_str_len + 1, );
1191                                 if (use_base64)
1192                                         base64_encode(enc_str, out_str, out_str_len);
1193                                 else
1194                                         qp_q_encode(enc_str, out_str);
1195
1196                                 g_free(out_str);
1197
1198                                 /* output MIME-encoded string block */
1199                                 mime_block_len = mimestr_len + strlen(enc_str);
1200                                 g_snprintf(destp, mime_block_len + 1,
1201                                            MIMESEP_BEGIN "%s%s%s" MIMESEP_END,
1202                                            out_encoding, mimesep_enc, enc_str);
1203                                 destp += mime_block_len;
1204                                 srcp += cur_len;
1205
1206                                 left -= mime_block_len;
1207                         }
1208
1209                         LBREAK_IF_REQUIRED(cont);
1210
1211                         if (cur_len == 0)
1212                                 break;
1213                 }
1214         }
1215
1216         *destp = '\0';
1217 }
1218
1219 #undef LBREAK_IF_REQUIRED