Improve address parsing.
[claws.git] / src / addrharvest.c
1 /*
2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 2002 Match Grun
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18  */
19
20 /*
21  * Functions for an E-Mail address harvester.
22  * Code still needs some work. Address parsing not strictly correct.
23  */
24
#include <sys/stat.h>
#include <ctype.h>
#include <dirent.h>
#include <glib.h>
#include <string.h>

#include "utils.h"
#include "mgutils.h"
#include "addrharvest.h"
#include "addritem.h"
34
/*
 * Mail header names of interest. One header table entry is built for each
 * of these names when a harvester is created.
 */
static gchar *_headerFrom_     = HEADER_FROM;
static gchar *_headerReplyTo_  = HEADER_REPLY_TO;
static gchar *_headerSender_   = HEADER_SENDER;
static gchar *_headerErrorsTo_ = HEADER_ERRORS_TO;
static gchar *_headerCC_       = HEADER_CC;
static gchar *_headerTo_       = HEADER_TO;

/* Capacity of the working buffer that holds one address while parsing */
#define ADDR_BUFFSIZE    1024
/* Size of the buffers used to read and decode one message header field */
#define MSG_BUFFSIZE     16384
/* Default number of addresses stored in a folder before a new one is started */
#define DFL_FOLDER_SIZE  20

/* Noise strings included by some other E-Mail clients; removed from names */
#define REM_NAME_STRING  "(E-mail)"
#define REM_NAME_STRING2 "(E-mail 2)"
50
51 /*
52  * Header entry.
53  */
54 typedef struct _HeaderEntry HeaderEntry;
55 struct _HeaderEntry {
56         gchar      *header;
57         gboolean   selected;
58         ItemFolder *folder;
59         gint       count;
60 };
61
62 /*
63  * Build header table entry.
64  * Enter: harvester Harvester object.
65  *        name      Header name.
66  */
67 static void *addrharvest_build_entry(
68                 AddressHarvester* harvester, gchar *name )
69 {
70         HeaderEntry *entry;
71
72         entry = g_new0( HeaderEntry, 1 );
73         entry->header = name;
74         entry->selected = FALSE;
75         entry->folder = NULL;
76         entry->count = 0;
77         harvester->headerTable = g_list_append( harvester->headerTable, entry );
78 }
79
80 static void addrharvest_print_hdrentry( HeaderEntry *entry, FILE *stream ) {
81         fprintf( stream, "Header Entry\n" );
82         fprintf( stream, "    name : %s\n", entry->header );
83         fprintf( stream, "selected : %s\n", entry->selected ? "yes" : "no" );
84 }
85
86 /*
87  * Free key in table.
88  */
89 static gint addrharvest_free_table_vis( gpointer key, gpointer value, gpointer data ) {
90         g_free( key );
91         key = NULL;
92         value = NULL;
93         return TRUE;
94 }
95
96 /*
97  * Free lookup table.
98  */
99 static void addrharvest_free_table( AddressHarvester* harvester ) {
100         GList *node;
101         HeaderEntry *entry;
102
103         /* Free header list */
104         node = harvester->headerTable;
105         while( node ) {
106                 entry = ( HeaderEntry * ) node->data;
107                 entry->header = NULL;
108                 entry->selected = FALSE;
109                 entry->folder = NULL;
110                 entry->count = 0;
111                 g_free( entry );
112                 node = g_list_next( node );
113         }
114         g_list_free( harvester->headerTable );
115         harvester->headerTable = NULL;
116
117         /* Free duplicate table */
118         g_hash_table_freeze( harvester->dupTable );
119         g_hash_table_foreach_remove( harvester->dupTable, addrharvest_free_table_vis, NULL );
120         g_hash_table_thaw( harvester->dupTable );
121         g_hash_table_destroy( harvester->dupTable );
122         harvester->dupTable = NULL;
123 }
124
125 /*
126 * Create new object.
127 * Return: Harvester.
128 */
129 AddressHarvester *addrharvest_create( void ) {
130         AddressHarvester *harvester;
131
132         harvester = g_new0( AddressHarvester, 1 );
133         harvester->path = NULL;
134         harvester->dupTable = g_hash_table_new( g_str_hash, g_str_equal );
135         harvester->folderSize = DFL_FOLDER_SIZE;
136         harvester->retVal = MGU_SUCCESS;
137
138         /* Build header table */
139         harvester->headerTable = NULL;
140         addrharvest_build_entry( harvester, _headerFrom_ );
141         addrharvest_build_entry( harvester, _headerReplyTo_ );
142         addrharvest_build_entry( harvester, _headerSender_ );
143         addrharvest_build_entry( harvester, _headerErrorsTo_ );
144         addrharvest_build_entry( harvester, _headerCC_ );
145         addrharvest_build_entry( harvester, _headerTo_ );
146
147         return harvester;
148 }
149
150 /*
151 * Properties...
152 */
153 /*
154  * Specify path to folder that will be harvested.
155  * Entry: harvester Harvester object.
156  *        value     Full directory path.
157  */
158 void addrharvest_set_path( AddressHarvester* harvester, const gchar *value ) {
159         g_return_if_fail( harvester != NULL );
160         harvester->path = mgu_replace_string( harvester->path, value );
161         g_strstrip( harvester->path );
162 }
163
164 /*
165  * Specify maximum folder size.
166  * Entry: harvester Harvester object.
167  *        value     Folder size.
168  */
169 void addrharvest_set_folder_size(
170         AddressHarvester* harvester, const gint value )
171 {
172         g_return_if_fail( harvester != NULL );
173         if( value > 0 ) {
174                 harvester->folderSize = value;
175         }
176 }
177
178 /*
179  * Search (case insensitive) for header entry with specified name.
180  * Enter: harvester Harvester.
181  *        name      Header name.
182  * Return: Header, or NULL if not found.
183  */
184 static HeaderEntry *addrharvest_find( 
185         AddressHarvester* harvester, const gchar *name ) {
186         HeaderEntry *retVal;
187         GList *node;
188
189         retVal = NULL;
190         node = harvester->headerTable;
191         while( node ) {
192                 HeaderEntry *entry;
193
194                 entry = node->data;
195                 if( g_strcasecmp( entry->header, name ) == 0 ) {
196                         retVal = entry;
197                         break;
198                 }
199                 node = g_list_next( node );
200         }
201         return retVal;
202 }
203
204 /*
205  * Set selection for specified heaader.
206  * Enter: harvester Harvester.
207  *        name      Header name.
208  *        value     Value to set.
209  */
210 void addrharvest_set_header(
211         AddressHarvester* harvester, const gchar *name, const gboolean value )
212 {
213         HeaderEntry *entry;
214
215         g_return_if_fail( harvester != NULL );
216         entry = addrharvest_find( harvester, name );
217         if( entry != NULL ) {
218                 entry->selected = value;
219         }
220 }
221
222 /*
223  * Get address count
224  * Enter: harvester Harvester.
225  *        name      Header name.
226  * Return: Address count, or -1 if header not found.
227  */
228 gint addrharvest_get_count( AddressHarvester* harvester, const gchar *name ) {
229         HeaderEntry *entry;
230         gint count;
231
232         count = -1;
233         g_return_val_if_fail( harvester != NULL, count );
234         entry = addrharvest_find( harvester, name );
235         if( entry != NULL ) {
236                 count = entry->count;
237         }
238         return count;
239 }
240
241 /*
242 * Free up object by releasing internal memory.
243 * Enter: harvester Harvester.
244 */
245 void addrharvest_free( AddressHarvester *harvester ) {
246         g_return_if_fail( harvester != NULL );
247
248         /* Free internal stuff */
249         addrharvest_free_table( harvester );
250         g_free( harvester->path );
251
252         /* Clear pointers */
253         harvester->path = NULL;
254         harvester->retVal = MGU_SUCCESS;
255         harvester->headerTable = NULL;
256
257         harvester->folderSize = 0;
258
259         /* Now release object */
260         g_free( harvester );
261 }
262
263 /*
264 * Display object to specified stream.
265 * Enter: harvester Harvester.
266 *        stream    Output stream.
267 */
268 void addrharvest_print( AddressHarvester *harvester, FILE *stream ) {
269         GList *node;
270         HeaderEntry *entry;
271
272         g_return_if_fail( harvester != NULL );
273         fprintf( stream, "Address Harvester:\n" );
274         fprintf( stream, " file path: '%s'\n", harvester->path );
275         fprintf( stream, "max folder: %d'\n", harvester->folderSize );
276
277         node = harvester->headerTable;
278         while( node ) {
279                 entry = node->data;
280                 fprintf( stream, "   header: %s", entry->header );
281                 fprintf( stream, "\t: %s", entry->selected ? "yes" : "no" );
282                 fprintf( stream, "\t: %d\n", entry->count );
283                 node = g_list_next( node );
284         }
285         fprintf( stream, "  ret val: %d\n", harvester->retVal );
286 }
287
288 /*
289  * Insert address into cache.
290  * Enter: harvester Harvester object.
291  *        entry     Header object.
292  *        cache     Address cache to load.
293  *        name      Name.
294  *        address   eMail address.
295  */
296 static void addrharvest_insert_cache(
297                 AddressHarvester *harvester, HeaderEntry *entry,
298                 AddressCache *cache, const gchar *name,
299                 const gchar *address )
300 {
301         ItemPerson *person;
302         ItemFolder *folder;
303         gchar *folderName;
304         gboolean newFolder;
305         gint cnt;
306         gchar *key, *value;
307
308         newFolder = FALSE;
309         folder = entry->folder;
310         if( folder == NULL ) {
311                 newFolder = TRUE;       /* No folder yet */
312         }
313         if( entry->count % harvester->folderSize == 0 ) {
314                 newFolder = TRUE;       /* Folder is full */
315         }
316
317         if( newFolder ) {
318                 cnt = 1 + ( entry->count / harvester->folderSize );
319                 folderName = g_strdup_printf( "%s (%d)", entry->header, cnt );
320                 folder = addritem_create_item_folder();
321                 addritem_folder_set_name( folder, folderName );
322                 addritem_folder_set_remarks( folder, "" );
323                 addrcache_id_folder( cache, folder );
324                 addrcache_add_folder( cache, folder );
325                 entry->folder = folder;
326                 g_free( folderName );
327         }
328
329         /* Insert address */
330         key = g_strdup( address );
331         g_strdown( key );
332         person = g_hash_table_lookup( harvester->dupTable, key );
333         if( person ) {
334                 /* Use longest name */
335                 value = ADDRITEM_NAME(person);
336                 if( strlen( name ) > strlen( value ) ) {
337                         addritem_person_set_common_name( person, name );
338                 }
339                 g_free( key );
340         }
341         else {
342                 /* Insert entry */
343                 person = addrcache_add_contact(
344                                 cache, folder, name, address, "" );
345                 g_hash_table_insert( harvester->dupTable, key, person );
346                 entry->count++;
347         }
348 }
349
/*
 * Remove every occurrence (case insensitive) of the specified string from
 * name. The name is modified in place; the text that follows each
 * occurrence is moved down over it, including the terminator.
 * Enter: name Name (modified).
 *        em   String to remove. An empty string removes nothing.
 */
static void addrharvest_del_email( gchar *name, const gchar *em ) {
        gchar *p;
        size_t emLen, i;

        if( name == NULL || em == NULL ) return;
        emLen = strlen( em );
        if( emLen == 0 ) return;        /* Would match everywhere, forever */

        p = name;
        while( *p ) {
                /* Compare em with the text at p; stops at first mismatch,
                 * which also covers reaching the end of name */
                for( i = 0; i < emLen; i++ ) {
                        if( tolower( ( unsigned char ) p[ i ] ) !=
                            tolower( ( unsigned char ) em[ i ] ) ) break;
                }
                if( i == emLen ) {
                        /* Close the gap with the whole remaining tail */
                        memmove( p, p + emLen, strlen( p + emLen ) + 1 );
                        /* Removal may have joined a new match; rescan */
                        p = name;
                }
                else {
                        p++;
                }
        }
}
364
/*
 * Find position of at (@) character in buffer.
 * Enter:  buffer Start of buffer.
 * Return: Position of at character, or NULL if not found.
 * Note: After the first 'at' character has been found, the text that
 * follows is scanned up to the first '>', ',' or newline. If another 'at'
 * character is met before one of those, that second one is returned
 * instead. This enables an address to be found where it is also used as
 * the name of the recipient. For example:
 *     "axle.rose@netscape.com" <axle.rose@netscape.com>
 * Here the 'at' character inside the angle brackets is returned.
 */
static gchar *addrharvest_find_at( const gchar *buffer ) {
        gchar *first;
        gchar *scan;

        first = strchr( buffer, '@' );
        if( first == NULL ) return NULL;

        /* Look ahead for a later one within the same address */
        for( scan = first + 1; *scan != '\0'; scan++ ) {
                switch( *scan ) {
                case '>':
                case ',':
                case '\n':
                        /* End of this address; keep the first one */
                        return first;
                case '@':
                        return scan;
                default:
                        break;
                }
        }
        return first;
}
403
/*
 * Find start and end of address string.
 *
 * The start is the first character of buf that is not a separator (comma,
 * semicolon, space, newline or carriage return). The end is the first
 * comma or semicolon found after the at character, or the end of the
 * string when there is none. Both searches stop at the string terminator.
 *
 * Enter: buf Start address of buffer to process (not modified).
 *        atp Pointer to email at (@) character within buf.
 *        bp  Pointer to start of email address (returned).
 *        ep  Pointer to end of email address (returned); this is the
 *            delimiter (or terminator) itself, one past the last character.
 */
static void addrharvest_find_address(
                const gchar *buf, const gchar *atp, const gchar **bp,
                const gchar **ep )
{
        /* Find first non-separator char */
        *bp = buf + strspn( buf, ",; \n\r" );

        /* Search forward for end of address */
        *ep = atp + 1 + strcspn( atp + 1, ",;" );
}
435
436 /*
437  * Extract E-Mail address from buffer. If found, address is removed from
438  * buffer.
439  * Enter:  buffer Address buffer.
440  * Return: E-Mail address, or NULL if none found. Must g_free() when done.
441  */
442 static gchar *addrharvest_extract_address( gchar *buffer ) {
443         gchar *addr;
444         gchar *atCh, *p, *bp, *ep;
445         gint len;
446
447         addr = NULL;
448         atCh = addrharvest_find_at( buffer );
449         if( atCh ) {
450                 /* Search back for start of address */
451                 bp = NULL;
452                 p = atCh;
453                 while( p >= buffer ) {
454                         bp = p;
455                         if( *p == '<' ) {
456                                 *p = ' ';
457                                 bp++;
458                                 break;
459                         }
460                         p--;
461                 }
462
463                 /* Search fwd for end */
464                 ep = NULL;
465                 ep = p = atCh;
466                 while( *p ) {
467                         if( *p == '>' ) {
468                                 *p = ' ';
469                                 break;
470                         }
471                         else if( *p == ' ' ) {
472                                 break;
473                         }
474                         ep = p;
475                         p++;
476                 }
477
478                 /* Extract email */
479                 if( bp != NULL ) {
480                         len = ( ep - bp );
481                         if( len > 0 ) {
482                                 addr = g_strndup( bp, len + 1 );
483                                 memmove( bp, ep, len );
484                                 *bp = ' ';
485                         }
486                 }       
487         }
488         return addr;
489 }
490
491 /*
492  * Parse address from header buffer creating address in cache.
493  * Enter: harvester Harvester object.
494  *        entry     Header object.
495  *        cache     Address cache to load.
496  *        hdrBuf    Pointer to header buffer.
497  */
498 static void addrharvest_parse_address(
499                 AddressHarvester *harvester, HeaderEntry *entry,
500                 AddressCache *cache, const gchar *hdrBuf )
501 {
502         gchar buffer[ ADDR_BUFFSIZE + 2 ];
503         const gchar *bp;
504         const gchar *ep;
505         gchar *atCh, *email, *p;
506         gint bufLen;
507
508         /* Search for an address */
509         while( atCh = addrharvest_find_at( hdrBuf ) ) {
510                 /* Find addres string */
511                 addrharvest_find_address( hdrBuf, atCh, &bp, &ep );
512
513                 /* Copy into buffer */
514                 bufLen = ( size_t ) ( ep - bp );
515                 if( bufLen > ADDR_BUFFSIZE ) {
516                         bufLen = ADDR_BUFFSIZE;
517                 }
518                 strncpy( buffer, bp, bufLen );
519                 buffer[ bufLen ] = '\0';
520                 buffer[ bufLen + 1 ] = '\0';
521                 buffer[ bufLen + 2 ] = '\0';
522
523                 /* Make whitespace */
524                 p = buffer;
525                 while( *p ) {
526                         if( *p == '\r' || *p == '\n' || *p == '\t' ) *p = ' ';
527                         p++;
528                 }
529
530                 /* Extract address from buffer */
531                 email = addrharvest_extract_address( buffer );
532                 if( email ) {
533                         /* Unescape characters */
534                         mgu_str_unescape( buffer );
535
536                         /* Remove noise characaters */
537                         addrharvest_del_email( buffer, REM_NAME_STRING );
538                         addrharvest_del_email( buffer, REM_NAME_STRING2 );
539
540                         /* Remove leading trailing quotes and spaces */
541                         mgu_str_ltc2space( buffer, '\"', '\"' );
542                         mgu_str_ltc2space( buffer, '\'', '\'' );
543                         mgu_str_ltc2space( buffer, '\"', '\"' );
544                         mgu_str_ltc2space( buffer, '(', ')' );
545                         g_strstrip( buffer );
546
547                         /* Insert into address book */
548                         addrharvest_insert_cache(
549                                 harvester, entry, cache, buffer, email );
550                         g_free( email );
551                 }
552                 hdrBuf = ep;
553         }
554 }
555
556 /*
557  * Read specified file into address book.
558  * Enter:  harvester Harvester object.
559  *         fileName  File to read.
560  *         cache     Address cache to load.
561  * Return: Status.
562  */
563 static gint addrharvest_readfile(
564                 AddressHarvester *harvester, const gchar *fileName,
565                 AddressCache *cache )
566 {
567         gint retVal;
568         FILE *msgFile;
569         gchar buf[ MSG_BUFFSIZE ], tmp[ MSG_BUFFSIZE ];
570         HeaderEntry *entry;
571
572         msgFile = fopen( fileName, "rb" );
573         if( ! msgFile ) {
574                 /* Cannot open file */
575                 retVal = MGU_OPEN_FILE;
576                 return retVal;
577         }
578
579         for( ;; ) {
580                 gint val;
581                 gchar *p;
582
583                 val = procheader_get_one_field(
584                         buf, sizeof(buf), msgFile, NULL );
585                 if( val == -1 ) {
586                         break;
587                 }
588                 conv_unmime_header( tmp, sizeof(tmp), buf, NULL );
589                 if(( p = strchr( tmp, ':' ) ) != NULL ) {
590                         const gchar *hdr;
591
592                         *p = '\0';
593                         hdr = p + 1;
594                         entry = addrharvest_find( harvester, tmp );
595                         if( entry && entry->selected ) {
596                                 addrharvest_parse_address(
597                                         harvester, entry, cache, hdr );
598                         }
599                 }
600         }
601
602         fclose( msgFile );
603         return MGU_SUCCESS;
604 }
605
606 /*
607  * ============================================================================
608  * Read all files in specified directory into address book.
609  * Enter:  harvester Harvester object.
610  *         cache     Address cache to load.
611  * Return: Status.
612  * ============================================================================
613  */
614 gint addrharvest_harvest( AddressHarvester *harvester, AddressCache *cache ) {
615         gint retVal;
616         DIR *dp;
617         struct dirent *d;
618         struct stat s;
619         gint num;
620
621         retVal = MGU_BAD_ARGS;
622         g_return_val_if_fail( harvester != NULL, retVal );
623         g_return_val_if_fail( cache != NULL, retVal );
624         g_return_val_if_fail( harvester->path != NULL, retVal );
625
626         /* Clear cache */
627         addrcache_clear( cache );
628         cache->dataRead = FALSE;
629
630         if( chdir( harvester->path ) < 0 ) {
631                 /* printf( "Error changing dir\n" ); */
632                 return retVal;
633         }
634
635         if( ( dp = opendir( harvester->path ) ) == NULL ) {
636                 /* printf( "Error opening dir\n" ); */
637                 return retVal;
638         }
639
640         while( ( d = readdir( dp ) ) != NULL ) {
641                 stat( d->d_name, &s );
642                 if( S_ISREG( s.st_mode ) ) {
643                         if( ( num = to_number( d->d_name ) ) >= 0 ) {
644                                 addrharvest_readfile(
645                                         harvester, d->d_name, cache );
646                         }
647                 }
648         }
649
650         closedir( dp );
651
652         /* Mark cache */
653         cache->modified = FALSE;
654         cache->dataRead = TRUE;
655
656         return retVal;
657 }
658
659 /*
660  * ============================================================================
661  * Test whether any headers have been selected for processing.
662  * Enter:  harvester Harvester object.
663  * Return: TRUE if a header was selected, FALSE if none were selected.
664  * ============================================================================
665  */
666 gboolean addrharvest_check_header( AddressHarvester *harvester ) {
667         gboolean retVal;
668         GList *node;
669
670         retVal = FALSE;
671         g_return_val_if_fail( harvester != NULL, retVal );
672
673         node = harvester->headerTable;
674         while( node ) {
675                 HeaderEntry *entry;
676
677                 entry = ( HeaderEntry * ) node->data;
678                 if( entry->selected ) return TRUE;
679                 node = g_list_next( node );
680         }
681         return retVal;
682 }
683
684 /*
685  * ============================================================================
686  * End of Source.
687  * ============================================================================
688  */
689
690