src/codeconv.c

   1 /*
   2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
   3  * Copyright (C) 1999-2003 Hiroyuki Yamamoto
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  18  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 #  include "config.h"
  22 #endif
  23
  24 #include <glib.h>
  25 #include <string.h>
  26 #include <ctype.h>
  27 #include <stdlib.h>
  28 #include <errno.h>
  29
  30 #if HAVE_LOCALE_H
  31 #  include <locale.h>
  32 #endif
  33
  34 #if HAVE_ICONV
  35 #  include <iconv.h>
  36 #endif
  37
  38 #include "intl.h"
  39 #include "codeconv.h"
  40 #include "unmime.h"
  41 #include "base64.h"
  42 #include "quoted-printable.h"
  43 #include "utils.h"
  44 #include "prefs_common.h"
  45
  46 typedef enum
  47 {
  48         JIS_ASCII,
  49         JIS_KANJI,
  50         JIS_HWKANA,
  51         JIS_AUXKANJI
  52 } JISState;
  53
  54 #define SUBST_CHAR      '_'
  55 #define ESC             '\033'
  56
  57 #define iseuckanji(c) \
  58         (((c) & 0xff) >= 0xa1 && ((c) & 0xff) <= 0xfe)
  59 #define iseuchwkana1(c) \
  60         (((c) & 0xff) == 0x8e)
  61 #define iseuchwkana2(c) \
  62         (((c) & 0xff) >= 0xa1 && ((c) & 0xff) <= 0xdf)
  63 #define iseucaux(c) \
  64         (((c) & 0xff) == 0x8f)
  65 #define isunprintableeuckanji(c) \
  66         (((c) & 0xff) >= 0xa9 && ((c) & 0xff) <= 0xaf)
  67 #define issjiskanji1(c) \
  68         ((((c) & 0xff) >= 0x81 && ((c) & 0xff) <= 0x9f) || \
  69          (((c) & 0xff) >= 0xe0 && ((c) & 0xff) <= 0xfc))
  70 #define issjiskanji2(c) \
  71         ((((c) & 0xff) >= 0x40 && ((c) & 0xff) <= 0x7e) || \
  72          (((c) & 0xff) >= 0x80 && ((c) & 0xff) <= 0xfc))
  73 #define issjishwkana(c) \
  74         (((c) & 0xff) >= 0xa1 && ((c) & 0xff) <= 0xdf)
  75
  76 #define K_IN()                          \
  77         if (state != JIS_KANJI) {       \
  78                 *out++ = ESC;           \
  79                 *out++ = '$';           \
  80                 *out++ = 'B';           \
  81                 state = JIS_KANJI;      \
  82         }
  83
  84 #define K_OUT()                         \
  85         if (state != JIS_ASCII) {       \
  86                 *out++ = ESC;           \
  87                 *out++ = '(';           \
  88                 *out++ = 'B';           \
  89                 state = JIS_ASCII;      \
  90         }
  91
  92 #define HW_IN()                         \
  93         if (state != JIS_HWKANA) {      \
  94                 *out++ = ESC;           \
  95                 *out++ = '(';           \
  96                 *out++ = 'I';           \
  97                 state = JIS_HWKANA;     \
  98         }
  99
 100 #define AUX_IN()                        \
 101         if (state != JIS_AUXKANJI) {    \
 102                 *out++ = ESC;           \
 103                 *out++ = '$';           \
 104                 *out++ = '(';           \
 105                 *out++ = 'D';           \
 106                 state = JIS_AUXKANJI;   \
 107         }
 108
 109 void conv_jistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
 110 {
 111         const guchar *in = inbuf;
 112         guchar *out = outbuf;
 113         JISState state = JIS_ASCII;
 114
 115         while (*in != '\0') {
 116                 if (*in == ESC) {
 117                         in++;
 118                         if (*in == '$') {
 119                                 if (*(in + 1) == '@' || *(in + 1) == 'B') {
 120                                         state = JIS_KANJI;
 121                                         in += 2;
 122                                 } else if (*(in + 1) == '(' &&
 123                                            *(in + 2) == 'D') {
 124                                         state = JIS_AUXKANJI;
 125                                         in += 3;
 126                                 } else {
 127                                         /* unknown escape sequence */
 128                                         state = JIS_ASCII;
 129                                 }
 130                         } else if (*in == '(') {
 131                                 if (*(in + 1) == 'B' || *(in + 1) == 'J') {
 132                                         state = JIS_ASCII;
 133                                         in += 2;
 134                                 } else if (*(in + 1) == 'I') {
 135                                         state = JIS_HWKANA;
 136                                         in += 2;
 137                                 } else {
 138                                         /* unknown escape sequence */
 139                                         state = JIS_ASCII;
 140                                 }
 141                         } else {
 142                                 /* unknown escape sequence */
 143                                 state = JIS_ASCII;
 144                         }
 145                 } else if (*in == 0x0e) {
 146                         state = JIS_HWKANA;
 147                         in++;
 148                 } else if (*in == 0x0f) {
 149                         state = JIS_ASCII;
 150                         in++;
 151                 } else {
 152                         switch (state) {
 153                         case JIS_ASCII:
 154                                 *out++ = *in++;
 155                                 break;
 156                         case JIS_KANJI:
 157                                 *out++ = *in++ | 0x80;
 158                                 if (*in == '\0') break;
 159                                 *out++ = *in++ | 0x80;
 160                                 break;
 161                         case JIS_HWKANA:
 162                                 *out++ = 0x8e;
 163                                 *out++ = *in++ | 0x80;
 164                                 break;
 165                         case JIS_AUXKANJI:
 166                                 *out++ = 0x8f;
 167                                 *out++ = *in++ | 0x80;
 168                                 if (*in == '\0') break;
 169                                 *out++ = *in++ | 0x80;
 170                                 break;
 171                         }
 172                 }
 173         }
 174
 175         *out = '\0';
 176 }
 177
 178 void conv_euctojis(gchar *outbuf, gint outlen, const gchar *inbuf)
 179 {
 180         const guchar *in = inbuf;
 181         guchar *out = outbuf;
 182         JISState state = JIS_ASCII;
 183
 184         while (*in != '\0') {
 185                 if (isascii(*in)) {
 186                         K_OUT();
 187                         *out++ = *in++;
 188                 } else if (iseuckanji(*in)) {
 189                         if (iseuckanji(*(in + 1))) {
 190                                 K_IN();
 191                                 *out++ = *in++ & 0x7f;
 192                                 *out++ = *in++ & 0x7f;
 193                         } else {
 194                                 K_OUT();
 195                                 *out++ = SUBST_CHAR;
 196                                 in++;
 197                                 if (*in != '\0' && !isascii(*in)) {
 198                                         *out++ = SUBST_CHAR;
 199                                         in++;
 200                                 }
 201                         }
 202                 } else if (iseuchwkana1(*in)) {
 203                         in++;
 204                         if (iseuchwkana2(*in)) {
 205                                 HW_IN();
 206                                 *out++ = *in++ & 0x7f;
 207                         } else {
 208                                 K_OUT();
 209                                 if (*in != '\0' && !isascii(*in)) {
 210                                         *out++ = SUBST_CHAR;
 211                                         in++;
 212                                 }
 213                         }
 214                 } else if (iseucaux(*in)) {
 215                         in++;
 216                         if (iseuckanji(*in) && iseuckanji(*(in + 1))) {
 217                                 AUX_IN();
 218                                 *out++ = *in++ & 0x7f;
 219                                 *out++ = *in++ & 0x7f;
 220                         } else {
 221                                 K_OUT();
 222                                 if (*in != '\0' && !isascii(*in)) {
 223                                         *out++ = SUBST_CHAR;
 224                                         in++;
 225                                         if (*in != '\0' && !isascii(*in)) {
 226                                                 *out++ = SUBST_CHAR;
 227                                                 in++;
 228                                         }
 229                                 }
 230                         }
 231                 } else {
 232                         K_OUT();
 233                         *out++ = SUBST_CHAR;
 234                         in++;
 235                 }
 236         }
 237
 238         K_OUT();
 239         *out = '\0';
 240 }
 241
 242 void conv_sjistoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
 243 {
 244         const guchar *in = inbuf;
 245         guchar *out = outbuf;
 246
 247         while (*in != '\0') {
 248                 if (isascii(*in)) {
 249                         *out++ = *in++;
 250                 } else if (issjiskanji1(*in)) {
 251                         if (issjiskanji2(*(in + 1))) {
 252                                 guchar out1 = *in;
 253                                 guchar out2 = *(in + 1);
 254                                 guchar row;
 255
 256                                 row = out1 < 0xa0 ? 0x70 : 0xb0;
 257                                 if (out2 < 0x9f) {
 258                                         out1 = (out1 - row) * 2 - 1;
 259                                         out2 -= out2 > 0x7f ? 0x20 : 0x1f;
 260                                 } else {
 261                                         out1 = (out1 - row) * 2;
 262                                         out2 -= 0x7e;
 263                                 }
 264
 265                                 *out++ = out1 | 0x80;
 266                                 *out++ = out2 | 0x80;
 267                                 in += 2;
 268                         } else {
 269                                 *out++ = SUBST_CHAR;
 270                                 in++;
 271                                 if (*in != '\0' && !isascii(*in)) {
 272                                         *out++ = SUBST_CHAR;
 273                                         in++;
 274                                 }
 275                         }
 276                 } else if (issjishwkana(*in)) {
 277                         *out++ = 0x8e;
 278                         *out++ = *in++;
 279                 } else {
 280                         *out++ = SUBST_CHAR;
 281                         in++;
 282                 }
 283         }
 284
 285         *out = '\0';
 286 }
 287
 288 void conv_anytoeuc(gchar *outbuf, gint outlen, const gchar *inbuf)
 289 {
 290         switch (conv_guess_encoding(inbuf)) {
 291         case C_ISO_2022_JP:
 292                 conv_jistoeuc(outbuf, outlen, inbuf);
 293                 break;
 294         case C_SHIFT_JIS:
 295                 conv_sjistoeuc(outbuf, outlen, inbuf);
 296                 break;
 297         default:
 298                 strncpy2(outbuf, inbuf, outlen);
 299                 break;
 300         }
 301 }
 302
 303 void conv_anytojis(gchar *outbuf, gint outlen, const gchar *inbuf)
 304 {
 305         switch (conv_guess_encoding(inbuf)) {
 306         case C_EUC_JP:
 307                 conv_euctojis(outbuf, outlen, inbuf);
 308                 break;
 309         default:
 310                 strncpy2(outbuf, inbuf, outlen);
 311                 break;
 312         }
 313 }
 314
 315 void conv_unreadable_eucjp(gchar *str)
 316 {
 317         register guchar *p = str;
 318
 319         while (*p != '\0') {
 320                 if (isascii(*p)) {
 321                         /* convert CR+LF -> LF */
 322                         if (*p == '\r' && *(p + 1) == '\n')
 323                                 memmove(p, p + 1, strlen(p));
 324                         /* printable 7 bit code */
 325                         p++;
 326                 } else if (iseuckanji(*p)) {
 327                         if (iseuckanji(*(p + 1)) && !isunprintableeuckanji(*p))
 328                                 /* printable euc-jp code */
 329                                 p += 2;
 330                         else {
 331                                 /* substitute unprintable code */
 332                                 *p++ = SUBST_CHAR;
 333                                 if (*p != '\0') {
 334                                         if (isascii(*p))
 335                                                 p++;
 336                                         else
 337                                                 *p++ = SUBST_CHAR;
 338                                 }
 339                         }
 340                 } else if (iseuchwkana1(*p)) {
 341                         if (iseuchwkana2(*(p + 1)))
 342                                 /* euc-jp hankaku kana */
 343                                 p += 2;
 344                         else
 345                                 *p++ = SUBST_CHAR;
 346                 } else if (iseucaux(*p)) {
 347                         if (iseuckanji(*(p + 1)) && iseuckanji(*(p + 2))) {
 348                                 /* auxiliary kanji */
 349                                 p += 3;
 350                         } else
 351                                 *p++ = SUBST_CHAR;
 352                 } else
 353                         /* substitute unprintable 1 byte code */
 354                         *p++ = SUBST_CHAR;
 355         }
 356 }
 357
 358 void conv_unreadable_8bit(gchar *str)
 359 {
 360         register guchar *p = str;
 361
 362         while (*p != '\0') {
 363                 /* convert CR+LF -> LF */
 364                 if (*p == '\r' && *(p + 1) == '\n')
 365                         memmove(p, p + 1, strlen(p));
 366                 else if (!isascii(*p)) *p = SUBST_CHAR;
 367                 p++;
 368         }
 369 }
 370
 371 void conv_unreadable_latin(gchar *str)
 372 {
 373         register guchar *p = str;
 374
 375         while (*p != '\0') {
 376                 /* convert CR+LF -> LF */
 377                 if (*p == '\r' && *(p + 1) == '\n')
 378                         memmove(p, p + 1, strlen(p));
 379                 else if ((*p & 0xff) >= 0x80 && (*p & 0xff) <= 0x9f)
 380                         *p = SUBST_CHAR;
 381                 p++;
 382         }
 383 }
 384
 385 #define NCV     '\0'
 386
 387 void conv_mb_alnum(gchar *str)
 388 {
 389         static guchar char_tbl[] = {
 390                 /* 0xa0 - 0xaf */
 391                 NCV, ' ', NCV, NCV, ',', '.', NCV, ':',
 392                 ';', '?', '!', NCV, NCV, NCV, NCV, NCV,
 393                 /* 0xb0 - 0xbf */
 394                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
 395                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
 396                 /* 0xc0 - 0xcf */
 397                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV,
 398                 NCV, NCV, '(', ')', NCV, NCV, '[', ']',
 399                 /* 0xd0 - 0xdf */
 400                 '{', '}', NCV, NCV, NCV, NCV, NCV, NCV,
 401                 NCV, NCV, NCV, NCV, '+', '-', NCV, NCV,
 402                 /* 0xe0 - 0xef */
 403                 NCV, '=', NCV, '<', '>', NCV, NCV, NCV,
 404                 NCV, NCV, NCV, NCV, NCV, NCV, NCV, NCV
 405         };
 406
 407         register guchar *p = str;
 408         register gint len;
 409
 410         len = strlen(str);
 411
 412         while (len > 1) {
 413                 if (*p == 0xa3) {
 414                         register guchar ch = *(p + 1);
 415
 416                         if (ch >= 0xb0 && ch <= 0xfa) {
 417                                 /* [a-zA-Z] */
 418                                 *p = ch & 0x7f;
 419                                 p++;
 420                                 len--;
 421                                 memmove(p, p + 1, len);
 422                                 len--;
 423                         } else  {
 424                                 p += 2;
 425                                 len -= 2;
 426                         }
 427                 } else if (*p == 0xa1) {
 428                         register guchar ch = *(p + 1);
 429
 430                         if (ch >= 0xa0 && ch <= 0xef &&
 431                             NCV != char_tbl[ch - 0xa0]) {
 432                                 *p = char_tbl[ch - 0xa0];
 433                                 p++;
 434                                 len--;
 435                                 memmove(p, p + 1, len);
 436                                 len--;
 437                         } else {
 438                                 p += 2;
 439                                 len -= 2;
 440                         }
 441                 } else if (iseuckanji(*p)) {
 442                         p += 2;
 443                         len -= 2;
 444                 } else {
 445                         p++;
 446                         len--;
 447                 }
 448         }
 449 }
 450
 451 CharSet conv_guess_encoding(const gchar *str)
 452 {
 453         const guchar *p = str;
 454         CharSet guessed = C_US_ASCII;
 455
 456         while (*p != '\0') {
 457                 if (*p == ESC && (*(p + 1) == '$' || *(p + 1) == '(')) {
 458                         if (guessed == C_US_ASCII)
 459                                 return C_ISO_2022_JP;
 460                         p += 2;
 461                 } else if (isascii(*p)) {
 462                         p++;
 463                 } else if (iseuckanji(*p) && iseuckanji(*(p + 1))) {
 464                         if (*p >= 0xfd && *p <= 0xfe)
 465                                 return C_EUC_JP;
 466                         else if (guessed == C_SHIFT_JIS) {
 467                                 if ((issjiskanji1(*p) &&
 468                                      issjiskanji2(*(p + 1))) ||
 469                                     issjishwkana(*p))
 470                                         guessed = C_SHIFT_JIS;
 471                                 else
 472                                         guessed = C_EUC_JP;
 473                         } else
 474                                 guessed = C_EUC_JP;
 475                         p += 2;
 476                 } else if (issjiskanji1(*p) && issjiskanji2(*(p + 1))) {
 477                         if (iseuchwkana1(*p) && iseuchwkana2(*(p + 1)))
 478                                 guessed = C_SHIFT_JIS;
 479                         else
 480                                 return C_SHIFT_JIS;
 481                         p += 2;
 482                 } else if (issjishwkana(*p)) {
 483                         guessed = C_SHIFT_JIS;
 484                         p++;
 485                 } else {
 486                         p++;
 487                 }
 488         }
 489
 490         return guessed;
 491 }
 492
 493 void conv_jistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 494 {
 495         conv_jistoeuc(outbuf, outlen, inbuf);
 496         conv_unreadable_eucjp(outbuf);
 497 }
 498
 499 void conv_sjistodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 500 {
 501         conv_sjistoeuc(outbuf, outlen, inbuf);
 502         conv_unreadable_eucjp(outbuf);
 503 }
 504
 505 void conv_euctodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 506 {
 507         strncpy2(outbuf, inbuf, outlen);
 508         conv_unreadable_eucjp(outbuf);
 509 }
 510
 511 void conv_anytodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 512 {
 513         conv_anytoeuc(outbuf, outlen, inbuf);
 514         conv_unreadable_eucjp(outbuf);
 515 }
 516
 517 void conv_ustodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 518 {
 519         strncpy2(outbuf, inbuf, outlen);
 520         conv_unreadable_8bit(outbuf);
 521 }
 522
 523 void conv_latintodisp(gchar *outbuf, gint outlen, const gchar *inbuf)
 524 {
 525         strncpy2(outbuf, inbuf, outlen);
 526         conv_unreadable_latin(outbuf);
 527 }
 528
 529 void conv_noconv(gchar *outbuf, gint outlen, const gchar *inbuf)
 530 {
 531         strncpy2(outbuf, inbuf, outlen);
 532 }
 533
 534 CodeConverter *conv_code_converter_new(const gchar *charset)
 535 {
 536         CodeConverter *conv;
 537
 538         conv = g_new0(CodeConverter, 1);
 539         conv->code_conv_func = conv_get_code_conv_func(charset, NULL);
 540         conv->charset_str = g_strdup(charset);
 541         conv->charset = conv_get_charset_from_str(charset);
 542
 543         return conv;
 544 }
 545
 546 void conv_code_converter_destroy(CodeConverter *conv)
 547 {
 548         g_free(conv->charset_str);
 549         g_free(conv);
 550 }
 551
 552 gint conv_convert(CodeConverter *conv, gchar *outbuf, gint outlen,
 553                   const gchar *inbuf)
 554 {
 555 #if HAVE_ICONV
 556         if (conv->code_conv_func != conv_noconv)
 557                 conv->code_conv_func(outbuf, outlen, inbuf);
 558         else {
 559                 gchar *str;
 560
 561                 str = conv_codeset_strdup(inbuf, conv->charset_str, NULL);
 562                 if (!str)
 563                         return -1;
 564                 else {
 565                         strncpy2(outbuf, str, outlen);
 566                         g_free(str);
 567                 }
 568         }
 569 #else /* !HAVE_ICONV */
 570         conv->code_conv_func(outbuf, outlen, inbuf);
 571 #endif
 572
 573         return 0;
 574 }
 575
 576 gchar *conv_codeset_strdup(const gchar *inbuf,
 577                            const gchar *src_code, const gchar *dest_code)
 578 {
 579         gchar *buf;
 580         size_t len;
 581         CodeConvFunc conv_func;
 582
 583         conv_func = conv_get_code_conv_func(src_code, dest_code);
 584         if (conv_func != conv_noconv) {
 585                 len = (strlen(inbuf) + 1) * 3;
 586                 buf = g_malloc(len);
 587                 if (!buf) return NULL;
 588
 589                 conv_func(buf, len, inbuf);
 590                 return g_realloc(buf, strlen(buf) + 1);
 591         }
 592
 593 #if HAVE_ICONV
 594         if (!src_code)
 595                 src_code = conv_get_outgoing_charset_str();
 596         if (!dest_code)
 597                 dest_code = conv_get_current_charset_str();
 598
 599         /* don't convert if current codeset is US-ASCII */
 600         if (!strcasecmp(dest_code, CS_US_ASCII))
 601                 return g_strdup(inbuf);
 602
 603         /* don't convert if src and dest codeset are identical */
 604         if (!strcasecmp(src_code, dest_code))
 605                 return g_strdup(inbuf);
 606
 607         return conv_iconv_strdup(inbuf, src_code, dest_code);
 608 #else
 609         return g_strdup(inbuf);
 610 #endif /* HAVE_ICONV */
 611 }
 612
 613 CodeConvFunc conv_get_code_conv_func(const gchar *src_charset_str,
 614                                      const gchar *dest_charset_str)
 615 {
 616         CodeConvFunc code_conv = conv_noconv;
 617         CharSet src_charset;
 618         CharSet dest_charset;
 619
 620         if (!src_charset_str)
 621                 src_charset = conv_get_current_charset();
 622         else
 623                 src_charset = conv_get_charset_from_str(src_charset_str);
 624
 625         /* auto detection mode */
 626         if (!src_charset_str && !dest_charset_str) {
 627                 if (src_charset == C_EUC_JP || src_charset == C_SHIFT_JIS)
 628                         return conv_anytodisp;
 629                 else
 630                         return conv_noconv;
 631         }
 632
 633         dest_charset = conv_get_charset_from_str(dest_charset_str);
 634
 635         switch (src_charset) {
 636         case C_ISO_2022_JP:
 637         case C_ISO_2022_JP_2:
 638                 if (dest_charset == C_AUTO)
 639                         code_conv = conv_jistodisp;
 640                 else if (dest_charset == C_EUC_JP)
 641                         code_conv = conv_jistoeuc;
 642                 break;
 643         case C_US_ASCII:
 644                 if (dest_charset == C_AUTO)
 645                         code_conv = conv_ustodisp;
 646                 break;
 647         case C_ISO_8859_1:
 648 #if !HAVE_ICONV
 649         case C_ISO_8859_2:
 650         case C_ISO_8859_4:
 651         case C_ISO_8859_5:
 652         case C_ISO_8859_7:
 653         case C_ISO_8859_8:
 654         case C_ISO_8859_9:
 655         case C_ISO_8859_11:
 656         case C_ISO_8859_13:
 657         case C_ISO_8859_15:
 658 #endif
 659                 if (dest_charset == C_AUTO)
 660                         code_conv = conv_latintodisp;
 661                 break;
 662         case C_SHIFT_JIS:
 663                 if (dest_charset == C_AUTO)
 664                         code_conv = conv_sjistodisp;
 665                 else if (dest_charset == C_EUC_JP)
 666                         code_conv = conv_sjistoeuc;
 667                 break;
 668         case C_EUC_JP:
 669                 if (dest_charset == C_AUTO)
 670                         code_conv = conv_euctodisp;
 671                 else if (dest_charset == C_ISO_2022_JP ||
 672                          dest_charset == C_ISO_2022_JP_2)
 673                         code_conv = conv_euctojis;
 674                 break;
 675         default:
 676                 break;
 677         }
 678
 679         return code_conv;
 680 }
 681
 682 #if HAVE_ICONV
 683 gchar *conv_iconv_strdup(const gchar *inbuf,
 684                          const gchar *src_code, const gchar *dest_code)
 685 {
 686         iconv_t cd;
 687         const gchar *inbuf_p;
 688         gchar *outbuf;
 689         gchar *outbuf_p;
 690         gint in_size;
 691         gint in_left;
 692         gint out_size;
 693         gint out_left;
 694         gint n_conv;
 695
 696         cd = iconv_open(dest_code, src_code);
 697         if (cd == (iconv_t)-1)
 698                 return NULL;
 699
 700         inbuf_p = inbuf;
 701         in_size = strlen(inbuf) + 1;
 702         in_left = in_size;
 703         out_size = in_size * 2;
 704         outbuf = g_malloc(out_size);
 705         outbuf_p = outbuf;
 706         out_left = out_size;
 707
 708         while ((n_conv = iconv(cd, (gchar **)&inbuf_p, &in_left,
 709                                &outbuf_p, &out_left)) < 0) {
 710                 if (EILSEQ == errno) {
 711                         *outbuf_p = '\0';
 712                         break;
 713                 } else if (EINVAL == errno) {
 714                         *outbuf_p = '\0';
 715                         break;
 716                 } else if (E2BIG == errno) {
 717                         out_size *= 2;
 718                         outbuf = g_realloc(outbuf, out_size);
 719                         inbuf_p = inbuf;
 720                         in_left = in_size;
 721                         outbuf_p = outbuf;
 722                         out_left = out_size;
 723                 } else {
 724                         g_warning("conv_iconv_strdup(): %s\n",
 725                                   g_strerror(errno));
 726                         *outbuf_p = '\0';
 727                         break;
 728                 }
 729         }
 730
 731         iconv_close(cd);
 732
 733         return outbuf;
 734 }
 735 #endif /* HAVE_ICONV */
 736
 737 static const struct {
 738         CharSet charset;
 739         gchar *const name;
 740 } charsets[] = {
 741         {C_US_ASCII,            CS_US_ASCII},
 742         {C_US_ASCII,            CS_ANSI_X3_4_1968},
 743         {C_UTF_8,               CS_UTF_8},
 744         {C_ISO_8859_1,          CS_ISO_8859_1},
 745         {C_ISO_8859_2,          CS_ISO_8859_2},
 746         {C_ISO_8859_4,          CS_ISO_8859_4},
 747         {C_ISO_8859_5,          CS_ISO_8859_5},
 748         {C_ISO_8859_7,          CS_ISO_8859_7},
 749         {C_ISO_8859_8,          CS_ISO_8859_8},
 750         {C_ISO_8859_9,          CS_ISO_8859_9},
 751         {C_ISO_8859_11,         CS_ISO_8859_11},
 752         {C_ISO_8859_13,         CS_ISO_8859_13},
 753         {C_ISO_8859_15,         CS_ISO_8859_15},
 754         {C_BALTIC,              CS_BALTIC},
 755         {C_CP1251,              CS_CP1251},
 756         {C_WINDOWS_1251,        CS_WINDOWS_1251},
 757         {C_KOI8_R,              CS_KOI8_R},
 758         {C_KOI8_U,              CS_KOI8_U},
 759         {C_ISO_2022_JP,         CS_ISO_2022_JP},
 760         {C_ISO_2022_JP_2,       CS_ISO_2022_JP_2},
 761         {C_EUC_JP,              CS_EUC_JP},
 762         {C_EUC_JP,              CS_EUCJP},
 763         {C_SHIFT_JIS,           CS_SHIFT_JIS},
 764         {C_SHIFT_JIS,           CS_SHIFT__JIS},
 765         {C_SHIFT_JIS,           CS_SJIS},
 766         {C_ISO_2022_KR,         CS_ISO_2022_KR},
 767         {C_EUC_KR,              CS_EUC_KR},
 768         {C_ISO_2022_CN,         CS_ISO_2022_CN},
 769         {C_EUC_CN,              CS_EUC_CN},
 770         {C_GB2312,              CS_GB2312},
 771         {C_EUC_TW,              CS_EUC_TW},
 772         {C_BIG5,                CS_BIG5},
 773         {C_TIS_620,             CS_TIS_620},
 774         {C_WINDOWS_874,         CS_WINDOWS_874},
 775 };
 776
 777 static const struct {
 778         gchar *const locale;
 779         CharSet charset;
 780         CharSet out_charset;
 781 } locale_table[] = {
 782         {"ja_JP.eucJP"  , C_EUC_JP      , C_ISO_2022_JP},
 783         {"ja_JP.ujis"   , C_EUC_JP      , C_ISO_2022_JP},
 784         {"ja_JP.EUC"    , C_EUC_JP      , C_ISO_2022_JP},
 785         {"ja_JP.SJIS"   , C_SHIFT_JIS   , C_ISO_2022_JP},
 786         {"ja_JP.JIS"    , C_ISO_2022_JP , C_ISO_2022_JP},
 787         {"ja_JP"        , C_EUC_JP      , C_ISO_2022_JP},
 788         {"ko_KR"        , C_EUC_KR      , C_EUC_KR},
 789         {"zh_CN.GB2312" , C_GB2312      , C_GB2312},
 790         {"zh_CN"        , C_GB2312      , C_GB2312},
 791         {"zh_TW.eucTW"  , C_EUC_TW      , C_BIG5},
 792         {"zh_TW.Big5"   , C_BIG5        , C_BIG5},
 793         {"zh_TW"        , C_BIG5        , C_BIG5},
 794
 795         {"ru_RU.KOI8-R" , C_KOI8_R      , C_ISO_8859_5},
 796         {"ru_RU.CP1251" , C_WINDOWS_1251, C_ISO_8859_5},
 797
 798         {"bg_BG"        , C_WINDOWS_1251, C_WINDOWS_1251},
 799
 800         {"en_US"        , C_ISO_8859_1  , C_ISO_8859_1},
 801         {"ca_ES"        , C_ISO_8859_1  , C_ISO_8859_1},
 802         {"da_DK"        , C_ISO_8859_1  , C_ISO_8859_1},
 803         {"de_DE"        , C_ISO_8859_1  , C_ISO_8859_1},
 804         {"nl_NL"        , C_ISO_8859_1  , C_ISO_8859_1},
 805         {"et_EE"        , C_ISO_8859_1  , C_ISO_8859_1},
 806         {"fi_FI"        , C_ISO_8859_1  , C_ISO_8859_1},
 807         {"fr_FR"        , C_ISO_8859_1  , C_ISO_8859_1},
 808         {"is_IS"        , C_ISO_8859_1  , C_ISO_8859_1},
 809         {"it_IT"        , C_ISO_8859_1  , C_ISO_8859_1},
 810         {"no_NO"        , C_ISO_8859_1  , C_ISO_8859_1},
 811         {"pt_PT"        , C_ISO_8859_1  , C_ISO_8859_1},
 812         {"pt_BR"        , C_ISO_8859_1  , C_ISO_8859_1},
 813         {"es_ES"        , C_ISO_8859_1  , C_ISO_8859_1},
 814         {"sv_SE"        , C_ISO_8859_1  , C_ISO_8859_1},
 815
 816         {"hr_HR"        , C_ISO_8859_2  , C_ISO_8859_2},
 817         {"hu_HU"        , C_ISO_8859_2  , C_ISO_8859_2},
 818         {"pl_PL"        , C_ISO_8859_2  , C_ISO_8859_2},
 819         {"ro_RO"        , C_ISO_8859_2  , C_ISO_8859_2},
 820         {"sk_SK"        , C_ISO_8859_2  , C_ISO_8859_2},
 821         {"sl_SI"        , C_ISO_8859_2  , C_ISO_8859_2},
 822         {"ru_RU"        , C_ISO_8859_5  , C_ISO_8859_5},
 823         {"el_GR"        , C_ISO_8859_7  , C_ISO_8859_7},
 824         {"iw_IL"        , C_ISO_8859_8  , C_ISO_8859_8},
 825         {"tr_TR"        , C_ISO_8859_9  , C_ISO_8859_9},
 826
 827         {"th_TH"        , C_TIS_620     , C_TIS_620},
 828         /* {"th_TH"     , C_WINDOWS_874}, */
 829         /* {"th_TH"     , C_ISO_8859_11}, */
 830
 831         {"lt_LT.iso88594"       , C_ISO_8859_4  , C_ISO_8859_4},
 832         {"lt_LT.ISO8859-4"      , C_ISO_8859_4  , C_ISO_8859_4},
 833         {"lt_LT.ISO_8859-4"     , C_ISO_8859_4  , C_ISO_8859_4},
 834         {"lt_LT"                , C_ISO_8859_13 , C_ISO_8859_13},
 835         {"lv_LV"                , C_ISO_8859_13 , C_ISO_8859_13},
 836
 837         {"C"                    , C_US_ASCII    , C_US_ASCII},
 838         {"POSIX"                , C_US_ASCII    , C_US_ASCII},
 839         {"ANSI_X3.4-1968"       , C_US_ASCII    , C_US_ASCII},
 840 };
 841
 842 const gchar *conv_get_charset_str(CharSet charset)
 843 {
 844         gint i;
 845
 846         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
 847                 if (charsets[i].charset == charset)
 848                         return charsets[i].name;
 849         }
 850
 851         return NULL;
 852 }
 853
 854 CharSet conv_get_charset_from_str(const gchar *charset)
 855 {
 856         gint i;
 857
 858         if (!charset) return C_AUTO;
 859
 860         for (i = 0; i < sizeof(charsets) / sizeof(charsets[0]); i++) {
 861                 if (!strcasecmp(charsets[i].name, charset))
 862                         return charsets[i].charset;
 863         }
 864
 865         return C_AUTO;
 866 }
 867
 868 CharSet conv_get_current_charset(void)
 869 {
 870         static CharSet cur_charset = -1;
 871         const gchar *cur_locale;
 872         gint i;
 873
 874         if (cur_charset != -1)
 875                 return cur_charset;
 876
 877         cur_locale = conv_get_current_locale();
 878         if (!cur_locale) {
 879                 cur_charset = C_US_ASCII;
 880                 return cur_charset;
 881         }
 882
 883         if (strcasestr(cur_locale, "UTF-8")) {
 884                 cur_charset = C_UTF_8;
 885                 return cur_charset;
 886         }
 887
 888         for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
 889                 const gchar *p;
 890
 891                 /* "ja_JP.EUC" matches with "ja_JP.eucJP", "ja_JP.EUC" and
 892                    "ja_JP". "ja_JP" matches with "ja_JP.xxxx" and "ja" */
 893                 if (!strncasecmp(cur_locale, locale_table[i].locale,
 894                                  strlen(locale_table[i].locale))) {
 895                         cur_charset = locale_table[i].charset;
 896                         return cur_charset;
 897                 } else if ((p = strchr(locale_table[i].locale, '_')) &&
 898                          !strchr(p + 1, '.')) {
 899                         if (strlen(cur_locale) == 2 &&
 900                             !strncasecmp(cur_locale, locale_table[i].locale, 2)) {
 901                                 cur_charset = locale_table[i].charset;
 902                                 return cur_charset;
 903                         }
 904                 }
 905         }
 906
 907         cur_charset = C_AUTO;
 908         return cur_charset;
 909 }
 910
 911 const gchar *conv_get_current_charset_str(void)
 912 {
 913         static const gchar *codeset = NULL;
 914
 915         if (!codeset)
 916                 codeset = conv_get_charset_str(conv_get_current_charset());
 917
 918         return codeset ? codeset : "US-ASCII";
 919 }
 920
 921 CharSet conv_get_outgoing_charset(void)
 922 {
 923         static CharSet out_charset = -1;
 924         const gchar *cur_locale;
 925         gint i;
 926
 927         if (out_charset != -1)
 928                 return out_charset;
 929
 930         cur_locale = conv_get_current_locale();
 931         if (!cur_locale) {
 932                 out_charset = C_AUTO;
 933                 return out_charset;
 934         }
 935
 936         for (i = 0; i < sizeof(locale_table) / sizeof(locale_table[0]); i++) {
 937                 const gchar *p;
 938
 939                 if (!strncasecmp(cur_locale, locale_table[i].locale,
 940                                  strlen(locale_table[i].locale))) {
 941                         out_charset = locale_table[i].out_charset;
 942                         break;
 943                 } else if ((p = strchr(locale_table[i].locale, '_')) &&
 944                          !strchr(p + 1, '.')) {
 945                         if (strlen(cur_locale) == 2 &&
 946                             !strncasecmp(cur_locale, locale_table[i].locale, 2)) {
 947                                 out_charset = locale_table[i].out_charset;
 948                                 break;
 949                         }
 950                 }
 951         }
 952
 953 #if !HAVE_ICONV
 954         /* encoding conversion without iconv() is only supported
 955            on Japanese locale for now */
 956         if (out_charset == C_ISO_2022_JP)
 957                 return out_charset;
 958         else
 959                 return conv_get_current_charset();
 960 #endif
 961
 962         return out_charset;
 963 }
 964
 965 const gchar *conv_get_outgoing_charset_str(void)
 966 {
 967         CharSet out_charset;
 968         const gchar *str;
 969
 970         if (prefs_common.outgoing_charset) {
 971                 if (!isalpha(prefs_common.outgoing_charset[0])) {
 972                         g_free(prefs_common.outgoing_charset);
 973                         prefs_common.outgoing_charset = g_strdup(CS_AUTO);
 974                 } else if (strcmp(prefs_common.outgoing_charset, CS_AUTO) != 0)
 975                         return prefs_common.outgoing_charset;
 976         }
 977
 978         out_charset = conv_get_outgoing_charset();
 979         str = conv_get_charset_str(out_charset);
 980
 981         return str ? str : "US-ASCII";
 982 }
 983
 984 const gchar *conv_get_current_locale(void)
 985 {
 986         gchar *cur_locale;
 987
 988         cur_locale = g_getenv("LC_ALL");
 989         if (!cur_locale) cur_locale = g_getenv("LC_CTYPE");
 990         if (!cur_locale) cur_locale = g_getenv("LANG");
 991         if (!cur_locale) cur_locale = setlocale(LC_CTYPE, NULL);
 992
 993         debug_print("current locale: %s\n",
 994                     cur_locale ? cur_locale : "(none)");
 995
 996         return cur_locale;
 997 }
 998
 999 void conv_unmime_header_overwrite(gchar *str)
1000 {
1001         gchar *buf;
1002         gint buflen;
1003         CharSet cur_charset;
1004
1005         cur_charset = conv_get_current_charset();
1006
1007         if (cur_charset == C_EUC_JP) {
1008                 buflen = strlen(str) * 2 + 1;
1009                 Xalloca(buf, buflen, return);
1010                 conv_anytodisp(buf, buflen, str);
1011                 unmime_header(str, buf);
1012         } else {
1013                 buflen = strlen(str) + 1;
1014                 Xalloca(buf, buflen, return);
1015                 unmime_header(buf, str);
1016                 strncpy2(str, buf, buflen);
1017         }
1018 }
1019
1020 void conv_unmime_header(gchar *outbuf, gint outlen, const gchar *str,
1021                         const gchar *charset)
1022 {
1023         CharSet cur_charset;
1024
1025         cur_charset = conv_get_current_charset();
1026
1027         if (cur_charset == C_EUC_JP) {
1028                 gchar *buf;
1029                 gint buflen;
1030
1031                 buflen = strlen(str) * 2 + 1;
1032                 Xalloca(buf, buflen, return);
1033                 conv_anytodisp(buf, buflen, str);
1034                 unmime_header(outbuf, buf);
1035         } else
1036                 unmime_header(outbuf, str);
1037 }
1038
/* maximum length of one output header line */
#define MAX_LINELEN     76
/* delimiters that open and close a MIME encoded-word */
#define MIMESEP_BEGIN   "=?"
#define MIMESEP_END     "?="

/* number of characters produced by Base64-encoding `len` bytes
   (4 characters per started group of 3 bytes) */
#define B64LEN(len)     ((len) / 3 * 4 + ((len) % 3 ? 4 : 0))

/*
 * Helper for conv_encode_header().  It works directly on that
 * function's variables `dest`, `len`, `destp`, `srcp` and `left`.
 *
 *  1. If fewer than MAX_LINELEN + 2 bytes of the output buffer remain,
 *     the output is terminated and the ENCLOSING FUNCTION RETURNS.
 *  2. Otherwise, if `cond` holds and input remains, one whitespace
 *     character is dropped around the break point (from the end of the
 *     output if there is one, else from the start of the input) and,
 *     provided input still remains, a folded line break ("\n ") is
 *     emitted and `left` is reset for the new line.
 *
 * The isspace() arguments are cast to guchar because passing a
 * negative plain char to a <ctype.h> function is undefined.
 */
#define LBREAK_IF_REQUIRED(cond)                                \
do {                                                            \
        if (len - (destp - dest) < MAX_LINELEN + 2) {           \
                *destp = '\0';                                  \
                return;                                         \
        }                                                       \
                                                                \
        if ((cond) && *srcp) {                                  \
                if (destp > dest &&                             \
                    isspace((guchar)*(destp - 1)))              \
                        destp--;                                \
                else if (isspace((guchar)*srcp))                \
                        srcp++;                                 \
                if (*srcp) {                                    \
                        *destp++ = '\n';                        \
                        *destp++ = ' ';                         \
                        left = MAX_LINELEN - 1;                 \
                }                                               \
        }                                                       \
} while (0)
1064
1065 void conv_encode_header(gchar *dest, gint len, const gchar *src,
1066                         gint header_len)
1067 {
1068         const gchar *cur_encoding;
1069         const gchar *out_encoding;
1070         gint mimestr_len;
1071         gchar *mimesep_enc;
1072         gint left;
1073         const gchar *srcp = src;
1074         gchar *destp = dest;
1075         gboolean use_base64;
1076
1077         if (MB_CUR_MAX > 1) {
1078                 use_base64 = TRUE;
1079                 mimesep_enc = "?B?";
1080         } else {
1081                 use_base64 = FALSE;
1082                 mimesep_enc = "?Q?";
1083         }
1084
1085         cur_encoding = conv_get_current_charset_str();
1086         if (!strcmp(cur_encoding, "US-ASCII"))
1087                 cur_encoding = "ISO-8859-1";
1088         out_encoding = conv_get_outgoing_charset_str();
1089         if (!strcmp(out_encoding, "US-ASCII"))
1090                 out_encoding = "ISO-8859-1";
1091
1092         mimestr_len = strlen(MIMESEP_BEGIN) + strlen(out_encoding) +
1093                 strlen(mimesep_enc) + strlen(MIMESEP_END);
1094
1095         left = MAX_LINELEN - header_len;
1096
1097         while (*srcp) {
1098                 LBREAK_IF_REQUIRED(left <= 0);
1099
1100                 while (isspace(*srcp)) {
1101                         *destp++ = *srcp++;
1102                         left--;
1103                         LBREAK_IF_REQUIRED(left <= 0);
1104                 }
1105
1106                 /* output as it is if the next word is ASCII string */
1107                 if (!is_next_nonascii(srcp)) {
1108                         gint word_len;
1109
1110                         word_len = get_next_word_len(srcp);
1111                         LBREAK_IF_REQUIRED(left < word_len);
1112                         while (word_len > 0) {
1113                                 LBREAK_IF_REQUIRED(left <= 0);
1114                                 *destp++ = *srcp++;
1115                                 left--;
1116                                 word_len--;
1117                         }
1118
1119                         continue;
1120                 }
1121
1122                 while (1) {
1123                         gint mb_len = 0;
1124                         gint cur_len = 0;
1125                         gchar *part_str;
1126                         gchar *out_str;
1127                         gchar *enc_str;
1128                         const gchar *p = srcp;
1129                         gint out_str_len;
1130                         gint out_enc_str_len;
1131                         gint mime_block_len;
1132                         gboolean cont = FALSE;
1133
1134                         while (*p != '\0') {
1135                                 if (isspace(*p) && !is_next_nonascii(p + 1))
1136                                         break;
1137
1138                                 if (MB_CUR_MAX > 1) {
1139                                         mb_len = mblen(p, MB_CUR_MAX);
1140                                         if (mb_len < 0) {
1141                                                 g_warning("conv_encode_header(): invalid multibyte character encountered\n");
1142                                                 mb_len = 1;
1143                                         }
1144                                 } else
1145                                         mb_len = 1;
1146
1147                                 Xstrndup_a(part_str, srcp, cur_len + mb_len, );
1148                                 out_str = conv_codeset_strdup
1149                                         (part_str, cur_encoding, out_encoding);
1150                                 if (!out_str) {
1151                                         g_warning("conv_encode_header(): code conversion failed\n");
1152                                         out_str = g_strdup(out_str);
1153                                 }
1154                                 out_str_len = strlen(out_str);
1155
1156                                 if (use_base64)
1157                                         out_enc_str_len = B64LEN(out_str_len);
1158                                 else
1159                                         out_enc_str_len =
1160                                                 qp_get_q_encoding_len(out_str);
1161
1162                                 g_free(out_str);
1163
1164                                 if (mimestr_len + out_enc_str_len <= left) {
1165                                         cur_len += mb_len;
1166                                         p += mb_len;
1167                                 } else if (cur_len == 0) {
1168                                         LBREAK_IF_REQUIRED(1);
1169                                         continue;
1170                                 } else {
1171                                         cont = TRUE;
1172                                         break;
1173                                 }
1174                         }
1175
1176                         if (cur_len > 0) {
1177                                 Xstrndup_a(part_str, srcp, cur_len, );
1178                                 out_str = conv_codeset_strdup
1179                                         (part_str, cur_encoding, out_encoding);
1180                                 if (!out_str) {
1181                                         g_warning("conv_encode_header(): code conversion failed\n");
1182                                         out_str = g_strdup(out_str);
1183                                 }
1184                                 out_str_len = strlen(out_str);
1185
1186                                 if (use_base64)
1187                                         out_enc_str_len = B64LEN(out_str_len);
1188                                 else
1189                                         out_enc_str_len =
1190                                                 qp_get_q_encoding_len(out_str);
1191
1192                                 Xalloca(enc_str, out_enc_str_len + 1, );
1193                                 if (use_base64)
1194                                         base64_encode(enc_str, out_str, out_str_len);
1195                                 else
1196                                         qp_q_encode(enc_str, out_str);
1197
1198                                 g_free(out_str);
1199
1200                                 /* output MIME-encoded string block */
1201                                 mime_block_len = mimestr_len + strlen(enc_str);
1202                                 g_snprintf(destp, mime_block_len + 1,
1203                                            MIMESEP_BEGIN "%s%s%s" MIMESEP_END,
1204                                            out_encoding, mimesep_enc, enc_str);
1205                                 destp += mime_block_len;
1206                                 srcp += cur_len;
1207
1208                                 left -= mime_block_len;
1209                         }
1210
1211                         LBREAK_IF_REQUIRED(cont);
1212
1213                         if (cur_len == 0)
1214                                 break;
1215                 }
1216         }
1217
1218         *destp = '\0';
1219 }
1220
1221 #undef LBREAK_IF_REQUIRED