Included address harvester feature.
[claws.git] / src / addrharvest.c
1 /*
2  * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
3  * Copyright (C) 2002 Match Grun
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18  */
19
20 /*
21  * Functions for an E-Mail address harvester.
22  * Code still needs some work. Address parsing not strictly correct.
23  */
24
25 #include <sys/stat.h>
26 #include <dirent.h>
27 #include <glib.h>
28 #include <string.h>
29
30 #include "utils.h"
31 #include "mgutils.h"
32 #include "addrharvest.h"
33 #include "addritem.h"
34
35 /* Mail header names of interest */
36 static gchar *_headerFrom_     = HEADER_FROM;
37 static gchar *_headerReplyTo_  = HEADER_REPLY_TO;
38 static gchar *_headerSender_   = HEADER_SENDER;
39 static gchar *_headerErrorsTo_ = HEADER_ERRORS_TO;
40 static gchar *_headerCC_       = HEADER_CC;
41 static gchar *_headerTo_       = HEADER_TO;
42
43 static gchar *_emptyString_ = "";
44
45 #define MSG_BUFFSIZE    8192
46 #define DFL_FOLDER_SIZE 20
47
48 /*
49  * Header entry.
50  */
51 typedef struct _HeaderEntry HeaderEntry;
52 struct _HeaderEntry {
53         gchar      *header;
54         gboolean   selected;
55         ItemFolder *folder;
56         gint       count;
57 };
58
59 /*
60  * Build header table entry.
61  * Enter: harvester Harvester object.
62  *        name      Header name.
63  */
64 static void *addrharvest_build_entry(
65                 AddressHarvester* harvester, gchar *name )
66 {
67         HeaderEntry *entry;
68
69         entry = g_new0( HeaderEntry, 1 );
70         entry->header = name;
71         entry->selected = FALSE;
72         entry->folder = NULL;
73         entry->count = 0;
74         harvester->headerTable = g_list_append( harvester->headerTable, entry );
75 }
76
77 static void addrharvest_print_hdrentry( HeaderEntry *entry, FILE *stream ) {
78         fprintf( stream, "Header Entry\n" );
79         fprintf( stream, "    name : %s\n", entry->header );
80         fprintf( stream, "selected : %s\n", entry->selected ? "yes" : "no" );
81 }
82
83 /*
84  * Free key in table.
85  */
86 static gint addrharvest_free_table_vis( gpointer key, gpointer value, gpointer data ) {
87         g_free( key );
88         key = NULL;
89         value = NULL;
90         return TRUE;
91 }
92
93 /*
94  * Free lookup table.
95  */
96 static void addrharvest_free_table( AddressHarvester* harvester ) {
97         GList *node;
98         HeaderEntry *entry;
99
100         /* Free header list */
101         node = harvester->headerTable;
102         while( node ) {
103                 entry = ( HeaderEntry * ) node->data;
104                 entry->header = NULL;
105                 entry->selected = FALSE;
106                 entry->folder = NULL;
107                 entry->count = 0;
108                 g_free( entry );
109                 node = g_list_next( node );
110         }
111         g_list_free( harvester->headerTable );
112         harvester->headerTable = NULL;
113
114         /* Free duplicate table */
115         g_hash_table_freeze( harvester->dupTable );
116         g_hash_table_foreach_remove( harvester->dupTable, addrharvest_free_table_vis, NULL );
117         g_hash_table_thaw( harvester->dupTable );
118         g_hash_table_destroy( harvester->dupTable );
119         harvester->dupTable = NULL;
120 }
121
122 /*
123 * Create new object.
124 * Return: Harvester.
125 */
126 AddressHarvester *addrharvest_create( void ) {
127         AddressHarvester *harvester;
128
129         harvester = g_new0( AddressHarvester, 1 );
130         harvester->path = NULL;
131         harvester->bufptr = harvester->buffer;
132         harvester->dupTable = g_hash_table_new( g_str_hash, g_str_equal );
133         harvester->folderSize = DFL_FOLDER_SIZE;
134         harvester->retVal = MGU_SUCCESS;
135
136         /* Build header table */
137         harvester->headerTable = NULL;
138         addrharvest_build_entry( harvester, _headerFrom_ );
139         addrharvest_build_entry( harvester, _headerReplyTo_ );
140         addrharvest_build_entry( harvester, _headerSender_ );
141         addrharvest_build_entry( harvester, _headerErrorsTo_ );
142         addrharvest_build_entry( harvester, _headerCC_ );
143         addrharvest_build_entry( harvester, _headerTo_ );
144
145         return harvester;
146 }
147
148 /*
149 * Properties...
150 */
151 /*
152  * Specify path to folder that will be harvested.
153  * Entry: harvester Harvester object.
154  *        value     Full directory path.
155  */
156 void addrharvest_set_path( AddressHarvester* harvester, const gchar *value ) {
157         g_return_if_fail( harvester != NULL );
158         harvester->path = mgu_replace_string( harvester->path, value );
159         g_strstrip( harvester->path );
160 }
161
162 /*
163  * Specify maximum folder size.
164  * Entry: harvester Harvester object.
165  *        value     Folder size.
166  */
167 void addrharvest_set_folder_size( AddressHarvester* harvester, const gint value ) {
168         g_return_if_fail( harvester != NULL );
169         if( value > 0 ) {
170                 harvester->folderSize = value;
171         }
172 }
173
174 /*
175  * Search (case insensitive) for header entry with specified name.
176  * Enter: harvester Harvester.
177  *        name      Header name.
178  * Return: Header, or NULL if not found.
179  */
180 static HeaderEntry *addrharvest_find( 
181         AddressHarvester* harvester, const gchar *name ) {
182         HeaderEntry *retVal;
183         GList *node;
184
185         retVal = NULL;
186         node = harvester->headerTable;
187         while( node ) {
188                 HeaderEntry *entry;
189
190                 entry = node->data;
191                 if( g_strcasecmp( entry->header, name ) == 0 ) {
192                         retVal = entry;
193                         break;
194                 }
195                 node = g_list_next( node );
196         }
197         return retVal;
198 }
199
200 /*
201  * Set selection for specified heaader.
202  * Enter: harvester Harvester.
203  *        name      Header name.
204  *        value     Value to set.
205  */
206 void addrharvest_set_header(
207         AddressHarvester* harvester, const gchar *name, const gboolean value )
208 {
209         HeaderEntry *entry;
210
211         g_return_if_fail( harvester != NULL );
212         entry = addrharvest_find( harvester, name );
213         if( entry != NULL ) {
214                 entry->selected = value;
215         }
216 }
217
218 /*
219  * Get address count
220  * Enter: harvester Harvester.
221  *        name      Header name.
222  * Return: Address count, or -1 if header not found.
223  */
224 gint addrharvest_get_count(
225         AddressHarvester* harvester, const gchar *name )
226 {
227         HeaderEntry *entry;
228         gint count;
229
230         count = -1;
231         g_return_val_if_fail( harvester != NULL, count );
232         entry = addrharvest_find( harvester, name );
233         if( entry != NULL ) {
234                 count = entry->count;
235         }
236         return count;
237 }
238
239 /*
240 * Free up object by releasing internal memory.
241 * Enter: harvester Harvester.
242 */
243 void addrharvest_free( AddressHarvester *harvester ) {
244         g_return_if_fail( harvester != NULL );
245
246         /* Free internal stuff */
247         addrharvest_free_table( harvester );
248         g_free( harvester->path );
249
250         /* Clear pointers */
251         harvester->path = NULL;
252         harvester->retVal = MGU_SUCCESS;
253         harvester->headerTable = NULL;
254
255         harvester->folderSize = 0;
256
257         /* Now release object */
258         g_free( harvester );
259 }
260
261 /*
262 * Display object to specified stream.
263 * Enter: harvester Harvester.
264 *        stream    Output stream.
265 */
266 void addrharvest_print( AddressHarvester *harvester, FILE *stream ) {
267         GList *node;
268         HeaderEntry *entry;
269
270         g_return_if_fail( harvester != NULL );
271         fprintf( stream, "Address Harvester:\n" );
272         fprintf( stream, " file path: '%s'\n", harvester->path );
273         fprintf( stream, "max folder: %d'\n", harvester->folderSize );
274
275         node = harvester->headerTable;
276         while( node ) {
277                 entry = node->data;
278                 fprintf( stream, "   header: %s", entry->header );
279                 fprintf( stream, "\t: %s", entry->selected ? "yes" : "no" );
280                 fprintf( stream, "\t: %d\n", entry->count );
281                 node = g_list_next( node );
282         }
283         fprintf( stream, "  ret val: %d\n", harvester->retVal );
284 }
285
286 #ifdef STANDALONE
/*
 * Convert a string consisting only of decimal digits to a number.
 * Enter: nstr String to convert.
 * Return: Converted value, or -1 if the string is NULL, empty or
 * contains anything other than digits.
 */
gint to_number(const gchar *nstr) {
	const gchar *p;

	if (nstr == NULL || *nstr == '\0') return -1;
	for( p = nstr; *p != '\0'; p++ ) {
		/* isdigit() is only defined for unsigned char values and EOF */
		if (!isdigit((unsigned char) *p)) return -1;
	}
	return atoi(nstr);
}
294 #endif
295
/*
 * Replace a matching pair of leading and trailing characters (quotes) in
 * the input string with spaces. Only a non-blank character that appears
 * at both the start and the end of the string is replaced. ASCII control
 * characters (0-31 and 127) met while scanning inwards from either end
 * are also replaced with spaces; scanning stops at the first other
 * character. The string is modified in place and keeps its length.
 * Enter: str String to process.
 *        ch  Character to remove.
 */
static void addrutil_strip_char( gchar *str, gchar ch ) {
	gchar *as;
	gchar *ae;
	size_t len;
	unsigned char uc;

	/* Fewer than two characters can never hold a matching pair */
	len = strlen( str );
	if( len < 2 ) {
		return;
	}

	/* Search forwards for first non-space match */
	as = str;
	ae = str + len - 1;
	while( as < ae ) {
		if( *as != ' ' ) {
			if( *as == ch ) {
				/* Search backwards from end for match */
				while( ae > as ) {
					if( *ae != ' ' ) {
						if( *ae == ch ) {
							*as = ' ';
							*ae = ' ';
							return;
						}
						/*
						 * Test as unsigned so that 8-bit
						 * characters (negative when gchar is
						 * signed) are not taken for controls.
						 */
						uc = ( unsigned char ) *ae;
						if( uc >= 32 && uc != 127 ) {
							return;
						}
						*ae = ' ';
					}
					ae--;
				}
			}
			uc = ( unsigned char ) *as;
			if( uc >= 32 && uc != 127 ) {
				return;
			}
			*as = ' ';
		}
		as++;
	}
}
349
/*
 * Remove escaping backslash characters from input string, in place.
 * Each backslash is dropped and the character following it is kept
 * literally (so a doubled backslash yields one backslash). A lone
 * backslash at the end of the string is dropped.
 * Enter: str String to process.
 */
static void addrutil_unescape( gchar *str ) {
	const gchar *src;
	gchar *dst;

	/* Single pass; dst never runs ahead of src */
	src = str;
	dst = str;
	while( *src ) {
		if( *src == '\\' ) {
			src++;
			if( *src == '\0' ) {
				break;
			}
		}
		*dst++ = *src++;
	}
	/* Re-terminate: the string shrinks by one for every backslash removed */
	*dst = '\0';
}
367
368 /*
369  * Parse name from email address string.
370  * Enter: buf Start address of buffer to process (not modified).
371  *        atp Pointer to email at (@) character.
372  *        ap  Pointer to start of email address returned.
373  *        ep  Pointer to end of email address returned.
374  * Return: Parsed name or NULL if not present. This should be g_free'd
375  * when done.
376  */
377 static gchar *addrutil_parse_name(
378                 const gchar *buf, const gchar *atp, const gchar **ap,
379                 const gchar **ep )
380 {
381         gchar *name;
382         const gchar *pos;
383         const gchar *tmp;
384         const gchar *bp;
385         gint ilen;
386
387         name = NULL;
388         *ap = NULL;
389         *ep = NULL;
390
391         /* Find first non-separator char */
392         bp = buf;
393         while( TRUE ) {
394                 if( strchr( ",; \n\r", *bp ) == NULL ) break;
395                 bp++;
396         }
397
398         /* Search back for start of name */
399         tmp = atp;
400         pos = atp;
401         while( pos >= bp ) {
402                 tmp = pos;
403                 if( *pos == '<' ) {
404                         /* Found start of address/end of name part */
405                         ilen = -1 + ( size_t ) ( pos - bp );
406                         name = g_strndup( bp, ilen + 1 );
407                         *(name + ilen + 1) = '\0';
408
409                         /* Remove leading trailing quotes and spaces */
410                         addrutil_strip_char( name, '\"' );
411                         addrutil_strip_char( name, '\'' );
412                         addrutil_strip_char( name, '\"' );
413                         addrutil_unescape( name );
414                         g_strstrip( name );
415                         break;
416                 }
417                 pos--;
418         }
419         *ap = tmp;
420
421         /* Search forward for end of address */
422         pos = atp + 1;
423         while( TRUE ) {
424                 if( *pos == '>' ) {
425                         pos++;
426                         break;
427                 }
428                 if( strchr( ",; \'\n\r", *pos ) ) break;
429                 pos++;
430         }
431         *ep = pos;
432
433         return name;
434
435 }
436
437 /*
438  * Insert address into cache.
439  * Enter: harvester Harvester object.
440  *        entry     Header object.
441  *        cache     Address cache to load.
442  *        name      Name.
443  *        address   eMail address.
444  * Return: Person inserted.
445  */
446 static ItemPerson *addrharvest_insert_cache(
447                 AddressHarvester *harvester, HeaderEntry *entry,
448                 AddressCache *cache, const gchar *name,
449                 const gchar *address )
450 {
451         ItemPerson *person;
452         ItemFolder *folder;
453         gchar *folderName;
454         gboolean newFolder;
455         gint cnt;
456
457         newFolder = FALSE;
458         folder = entry->folder;
459         if( folder == NULL ) {
460                 newFolder = TRUE;       /* No folder yet */
461         }
462         if( entry->count % harvester->folderSize == 0 ) {
463                 newFolder = TRUE;       /* Folder is full */
464         }
465
466         if( newFolder ) {
467                 cnt = 1 + ( entry->count / harvester->folderSize );
468                 folderName = g_strdup_printf( "%s (%d)", entry->header, cnt );
469                 folder = addritem_create_item_folder();
470                 addritem_folder_set_name( folder, folderName );
471                 addritem_folder_set_remarks( folder, "" );
472                 addrcache_id_folder( cache, folder );
473                 addrcache_add_folder( cache, folder );
474                 entry->folder = folder;
475                 g_free( folderName );
476         }
477
478         person = addrcache_add_contact( cache, folder, name, address, "" );
479         entry->count++;
480         return person;
481 }
482
483 #define ATCHAR "@"
484
485 /*
486  * Parse address from header buffer creating address in cache.
487  * Enter: harvester Harvester object.
488  *        entry     Header object.
489  *        cache     Address cache to load.
490  *        hdrBuf    Pointer to header buffer.
491  */
492 static void addrharvest_parse_address(
493                 AddressHarvester *harvester, HeaderEntry *entry,
494                 AddressCache *cache, const gchar *hdrBuf )
495 {
496         gchar addr[ MSG_BUFFSIZE ];
497         const gchar *bp;
498         const gchar *ep;
499         gchar *atCh;
500         gchar *name;
501         gchar *value;
502         gchar *key;
503         gint addrLen;
504         ItemPerson *person;
505
506         /* printf( "hdrBuf    :%s:\n", hdrBuf ); */
507         /* Search for an address */
508         while( atCh = strcasestr( hdrBuf, ATCHAR ) ) {
509                 name = addrutil_parse_name( hdrBuf, atCh, &bp, &ep );
510                 addrLen = ( size_t ) ( ep - bp );
511                 strncpy( addr, bp, addrLen );
512                 addr[ addrLen ] = '\0';
513                 extract_address( addr );
514                 /* printf( "name/addr :%s:\t:%s:\n", addr, name ); */
515                 hdrBuf = ep;
516                 if( atCh == ep ) {
517                         hdrBuf++;
518                 }
519                 if( strlen( addr ) > 0 ) {
520                         if( name == NULL ) {
521                                 name = g_strdup( _emptyString_ );
522                         }
523                         g_strdown( addr );
524                         /* printf( "name/addr :%s:\t:%s:\n", addr, name ); */
525                         person = g_hash_table_lookup(
526                                         harvester->dupTable, addr );
527                         if( person ) {
528                                 /* Use longest name */
529                                 value = ADDRITEM_NAME(person);
530                                 if( strlen( name ) > strlen( value ) ) {
531                                         addritem_person_set_common_name(
532                                                 person, name );
533                                 }
534                         }
535                         else {
536                                 /* Insert entry */
537                                 key = g_strdup( addr );
538                                 person = addrharvest_insert_cache(
539                                         harvester, entry, cache, name, addr );
540                                 g_hash_table_insert(
541                                         harvester->dupTable, key, person );
542                         }
543                 }
544                 g_free( name );
545         }
546 }
547
548 /*
549  * Read specified file into address book.
550  * Enter:  harvester Harvester object.
551  *         fileName  File to read.
552  *         cache     Address cache to load.
553  * Return: Status.
554  */
555 static gint addrharvest_readfile(
556                 AddressHarvester *harvester, const gchar *fileName,
557                 AddressCache *cache )
558 {
559         gint retVal;
560         FILE *msgFile;
561         gchar buf[ MSG_BUFFSIZE ], tmp[ MSG_BUFFSIZE ];
562         HeaderEntry *entry;
563
564         msgFile = fopen( fileName, "r" );
565         if( ! msgFile ) {
566                 /* Cannot open file */
567                 retVal = MGU_OPEN_FILE;
568                 return retVal;
569         }
570
571         for( ;; ) {
572                 gint val;
573                 gchar *p;
574
575                 val = procheader_get_one_field( buf, sizeof(buf), msgFile, NULL );
576                 if( val == -1 ) {
577                         break;
578                 }
579                 conv_unmime_header( tmp, sizeof(tmp), buf, NULL );
580                 if(( p = strchr( tmp, ':' ) ) != NULL ) {
581                         const gchar *hdr;
582
583                         *p = '\0';
584                         hdr = p + 1;
585                         entry = addrharvest_find( harvester, tmp );
586                         if( entry && entry->selected ) {
587                                 addrharvest_parse_address(
588                                         harvester, entry, cache, hdr );
589                         }
590                 }
591         }
592
593         fclose( msgFile );
594         return MGU_SUCCESS;
595 }
596
597 #undef ATCHAR
598
599 /*
600  * ============================================================================
601  * Read all files in specified directory into address book.
602  * Enter:  harvester Harvester object.
603  *         cache     Address cache to load.
604  * Return: Status.
605  * ============================================================================
606  */
607 gint addrharvest_harvest( AddressHarvester *harvester, AddressCache *cache ) {
608         gint retVal;
609         DIR *dp;
610         struct dirent *d;
611         struct stat s;
612         gint num;
613
614         retVal = MGU_BAD_ARGS;
615         g_return_val_if_fail( harvester != NULL, retVal );
616         g_return_val_if_fail( cache != NULL, retVal );
617         g_return_val_if_fail( harvester->path != NULL, retVal );
618
619         /* Clear cache */
620         addrcache_clear( cache );
621         cache->dataRead = FALSE;
622
623         if( chdir( harvester->path ) < 0 ) {
624                 printf( "Error changing dir\n" );
625                 return retVal;
626         }
627
628         if( ( dp = opendir( harvester->path ) ) == NULL ) {
629                 printf( "Error opening dir\n" );
630                 return retVal;
631         }
632
633         while( ( d = readdir( dp ) ) != NULL ) {
634                 stat( d->d_name, &s );
635                 if( S_ISREG( s.st_mode ) ) {
636                         if( ( num = to_number( d->d_name ) ) >= 0 ) {
637                                 addrharvest_readfile( harvester, d->d_name, cache );
638                         }
639                 }
640         }
641
642         closedir( dp );
643
644         /* Mark cache */
645         cache->modified = FALSE;
646         cache->dataRead = TRUE;
647
648         return retVal;
649 }
650
651 /*
652  * ============================================================================
653  * Test whether any headers have been selected for processing.
654  * Enter:  harvester Harvester object.
655  * Return: TRUE if a header was selected, FALSE if none were selected.
656  * ============================================================================
657  */
658 gboolean addrharvest_check_header( AddressHarvester *harvester ) {
659         gboolean retVal;
660         GList *node;
661
662         retVal = FALSE;
663         g_return_val_if_fail( harvester != NULL, retVal );
664
665         node = harvester->headerTable;
666         while( node ) {
667                 HeaderEntry *entry;
668
669                 entry = ( HeaderEntry * ) node->data;
670                 if( entry->selected ) return TRUE;
671                 node = g_list_next( node );
672         }
673         return retVal;
674 }
675
676 /*
677  * ============================================================================
678  * End of Source.
679  * ============================================================================
680  */
681
682