From: Jose R. <jm...@us...> - 2002-08-29 13:45:46
Update of /cvsroot/swishe/swish-e/src
In directory usw-pr-cvs1:/tmp/cvs-serv10039

Modified Files:
	index.h swish.h compress.c index.c merge.c
Log Message:
Here is one more try at the merge issue. Now it should also work with -e.
Also some minor changes to compress.c: added the ability to pass the number
0 to the compress routines, and replaced the hard-coded 5 with
MAXINTCOMPSIZE. BTW, I will also make some other improvements to compress.c.

Index: index.h
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/index.h,v
retrieving revision 1.51
retrieving revision 1.52
diff -u -r1.51 -r1.52
--- index.h	7 Aug 2002 00:28:38 -0000	1.51
+++ index.h	29 Aug 2002 13:45:39 -0000	1.52
@@ -129,7 +129,8 @@
 
 void    do_index_file(SWISH * sw, FileProp * fprop);
 
-ENTRY  *addentry(SWISH *, char *, int, int, int, int);
+ENTRY  *getentry(SWISH * , char *);
+void    addentry(SWISH *, ENTRY *, int, int, int, int);
 
 void    addCommonProperties(SWISH * sw, FileProp * fprop, FileRec * fi, char *title, char *summary, int start);

Index: swish.h
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/swish.h,v
retrieving revision 1.156
retrieving revision 1.157
diff -u -r1.156 -r1.157
--- swish.h	22 Aug 2002 22:58:39 -0000	1.156
+++ swish.h	29 Aug 2002 13:45:40 -0000	1.157
@@ -574,6 +574,8 @@
 
     int     filenum;    // current filenumber to use
 
+    /* Used by merge.c */
+    int    *merge_file_num_map;
 } IndexFILE;

Index: compress.c
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/compress.c,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -r1.47 -r1.48
--- compress.c	16 Jul 2002 19:02:39 -0000	1.47
+++ compress.c	29 Aug 2002 13:45:40 -0000	1.48
@@ -43,8 +43,16 @@
 {
     int _i = 0, _r = num;
-    unsigned char _s[5];
+    unsigned char _s[MAXINTCOMPSIZE];
 
+    /* Trivial case: 0 */
+    if(!_r)
+    {
+        f_putc(0,fp);
+        return;
+    }
+
+    /* Any other case ... */
     while (_r)
     {
         _s[_i++] = _r & 127;
@@ -61,6 +69,14 @@
 {
     int _i = num;
 
+    /* Trivial case: 0 */
+    if(!_i)
+    {
+        *buffer-- = 0;
+        return 0;
+    }
+
+    /* Any other case ... */
     while (_i)
     {
         *buffer = _i & 127;
@@ -80,8 +96,16 @@
 {
     int _i = 0, _r = num;
-    unsigned char _s[5];
+    unsigned char _s[MAXINTCOMPSIZE];
 
+    /* Trivial case: 0 */
+    if(!_r)
+    {
+        *buffer++ = 0;
+        return buffer;
+    }
+
+    /* Any other case ... */
     while (_r)
     {
         _s[_i++] = _r & 127;
@@ -364,7 +388,7 @@
 
     /* check if the work buffer is long enough */
     /* just to avoid bufferoverruns */
-    /* In the worst case and integer will need 5 bytes */
+    /* In the worst case and integer will need MAXINTCOMPSIZE bytes */
     /* but fortunatelly this is very uncommon */
     /* 2002/01 JMRUIZ
@@ -619,7 +643,7 @@
     if(!idx->fp_loc_write[idx_swap_file] && !idx->fp_loc_read[idx_swap_file])
         return;
 
-    /* Check if the file is opened for write and close it */
+    /* Check if the file is opened for write and close it */
     if(idx->fp_loc_write[idx_swap_file])
     {
         /* Write a 0 to mark the end of locations */
@@ -638,7 +662,7 @@
     {
         /* File already opened for read -> reset pointer */
         fseek(idx->fp_loc_read[idx_swap_file],0,SEEK_SET);
-    }
+    }
 
     fp = idx->fp_loc_read[idx_swap_file];
 
     while((lenbuf = uncompress1(fp, idx->swap_getc)))
@@ -649,7 +673,7 @@
         idx->swap_read(buf, lenbuf, 1, fp);
         e = *(ENTRY **)buf;
         /* Store the locations in reverse order - Faster. They will be
-        ** sorted later */
+        ** sorted later */
         l = (LOCATION *) buf;
         l->next = e->allLocationList;
         e->allLocationList = l;
@@ -670,7 +694,7 @@
     }
     else
    {
-        /* Just advance file pointer */
+        /* Just advance file pointer */
         idx->swap_seek(fp,lenbuf - sizeof(ENTRY *),SEEK_CUR);
     }
 }
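For readers following the compress.c changes: the routines store an integer
as big-endian 7-bit chunks with the high bit acting as a continuation flag on
every byte but the last, so the old loop emitted nothing at all for 0. A
minimal standalone sketch of the scheme with the new zero guard (illustrative
vint_* names, not the swish-e functions themselves):

    #include <stdio.h>

    #define MAXINTCOMPSIZE 5    /* a 32-bit value needs at most five 7-bit chunks */

    /* Encode num into buf, most significant chunk first.
       Returns the number of bytes written. */
    static int vint_encode(unsigned int num, unsigned char *buf)
    {
        unsigned char s[MAXINTCOMPSIZE];
        int i = 0, n = 0;

        if (num == 0)               /* the new trivial case: a single zero byte */
        {
            buf[n++] = 0;
            return n;
        }
        while (num)                 /* collect 7-bit chunks, least significant first */
        {
            s[i++] = num & 127;
            num >>= 7;
        }
        while (i--)                 /* emit in reverse; continuation bit on all but the last */
            buf[n++] = (unsigned char)(s[i] | (i ? 128 : 0));
        return n;
    }

    /* Decode a value produced by vint_encode, advancing *p past it. */
    static unsigned int vint_decode(const unsigned char **p)
    {
        unsigned int num = 0;
        unsigned char c;

        do
        {
            c = *(*p)++;
            num = (num << 7) | (c & 127u);
        } while (c & 128);
        return num;
    }

    int main(void)
    {
        unsigned char buf[MAXINTCOMPSIZE];
        const unsigned char *p = buf;

        vint_encode(0, buf);                 /* encodes as the single byte 0x00 */
        printf("%u\n", vint_decode(&p));     /* prints 0 */
        return 0;
    }

Encoding 0 now costs exactly one byte, which is why callers such as the swap
file code can keep using a plain 0 as an end-of-locations marker.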
Index: index.c
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/index.c,v
retrieving revision 1.192
retrieving revision 1.193
diff -u -r1.192 -r1.193
--- index.c	20 Aug 2002 22:24:08 -0000	1.192
+++ index.c	29 Aug 2002 13:45:40 -0000	1.193
@@ -946,16 +946,63 @@
 }
 
+ENTRY *getentry(SWISH * sw, char *word)
+{
+    IndexFILE *indexf = sw->indexlist;
+    struct MOD_Index *idx = sw->Index;
+    int hashval;
+    ENTRY *e;
+
+    if (!idx->entryArray)
+    {
+        idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
+        idx->entryArray->numWords = 0;
+        idx->entryArray->elist = NULL;
+    }
+    /* Compute hash value of word */
+    hashval = verybighash(word);
+
+    /* Look for the word in the hash array */
+    for (e = idx->hashentries[hashval]; e; e = e->next)
+        if (strcmp(e->word, word) == 0)
+            break;
+
+    /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */
+    idx->hashentriesdirty[hashval] = 1;
+
+    /* Word found, return it */
+    if (e)
+        return e;
+
+    /* Word not found, so create a new word */
+    e = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
+    strcpy(e->word, word);
+    e->next = idx->hashentries[hashval];
+    idx->hashentries[hashval] = e;
+
+    /* Init values */
+    e->tfrequency = 0;
+    e->u1.last_filenum = 0;
+    e->currentlocation = NULL;
+    e->currentChunkLocationList = NULL;
+    e->allLocationList = NULL;
+
+    idx->entryArray->numWords++;
+    indexf->header.totalwords++;
+
+    return e;
+}
+
 /* Adds a word to the master index tree.
 */
-ENTRY *addentry(SWISH * sw, char *word, int filenum, int structure, int metaID, int position)
+void addentry(SWISH * sw, ENTRY *e, int filenum, int structure, int metaID, int position)
 {
     int found;
-    ENTRY *en, *efound;
     LOCATION *tp, *newtp, *prevtp;
-    int hashval;
     IndexFILE *indexf = sw->indexlist;
     struct MOD_Index *idx = sw->Index;
@@ -966,7 +1013,7 @@
     {
         struct metaEntry *m = getMetaNameByID(&indexf->header, metaID);
 
-        printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, word, position, structure);
+        printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, e->word, position, structure);
 
         if ( structure & IN_EMPHASIZED ) printf(" EM");
         if ( structure & IN_HEADER ) printf(" HEADING");
@@ -979,52 +1026,23 @@
         printf(" )\n");
     }
 
-    if (!idx->entryArray)
-    {
-        idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
-        idx->entryArray->numWords = 0;
-        idx->entryArray->elist = NULL;
-    }
-    /* Compute hash value of word */
-    hashval = verybighash(word);
-
-    /* Look for the word in the hash array */
-    for (efound = idx->hashentries[hashval]; efound; efound = efound->next)
-        if (strcmp(efound->word, word) == 0)
-            break;
-
-    /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */
-    idx->hashentriesdirty[hashval] = 1;
-
-    /* Word not found, so create a new word */
-    if (!efound)
+    /* Check for first time */
+    if(!e->tfrequency)
     {
-        en = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
-        strcpy(en->word, word);
-        en->tfrequency = 1;
-        en->u1.last_filenum = filenum;
-        en->next = idx->hashentries[hashval];
-        idx->hashentries[hashval] = en;
-
         /* create a location record */
         tp = (LOCATION *) new_location(idx);
         tp->filenum = filenum;
         tp->frequency = 1;
         tp->metaID = metaID;
         tp->posdata[0] = SET_POSDATA(position,structure);
 
         tp->next = NULL;
-        en->currentlocation = NULL;
-        en->currentChunkLocationList = tp;
-        en->allLocationList = NULL;
-        idx->entryArray->numWords++;
-        indexf->header.totalwords++;
+        e->currentChunkLocationList = tp;
+        e->tfrequency = 1;
+        e->u1.last_filenum = filenum;
 
-        return en; /* all done here */
+        return;
     }
 
@@ -1032,10 +1050,10 @@
     /* Note: filename not needed due to compress we are only looking at the current file */
     /* Oct 18, 2001 -- filename is needed since merge adds words in non-filenum order */
 
-    tp = efound->currentChunkLocationList;
+    tp = e->currentChunkLocationList;
     found = 0;
 
-    while (tp != efound->currentlocation)
+    while (tp != e->currentlocation)
     {
         if(tp->metaID == metaID && tp->filenum == filenum )
         {
@@ -1058,17 +1076,17 @@
         tp->posdata[0] = SET_POSDATA(position,structure);
 
         /* add the new LOCATION onto the array */
-        tp->next = efound->currentChunkLocationList;
-        efound->currentChunkLocationList = tp;
+        tp->next = e->currentChunkLocationList;
+        e->currentChunkLocationList = tp;
 
         /* Count number of different files that this word is used in */
-        if ( efound->u1.last_filenum != filenum )
+        if ( e->u1.last_filenum != filenum )
         {
-            efound->tfrequency++;
-            efound->u1.last_filenum = filenum;
+            e->tfrequency++;
+            e->u1.last_filenum = filenum;
         }
 
-        return efound; /* all done */
+        return; /* all done */
     }
 
@@ -1082,10 +1100,10 @@
 
     if(newtp != tp)
     {
-        if(efound->currentChunkLocationList == tp)
-            efound->currentChunkLocationList = newtp;
+        if(e->currentChunkLocationList == tp)
+            e->currentChunkLocationList = newtp;
         else
-            for(prevtp = efound->currentChunkLocationList;;prevtp = prevtp->next)
+            for(prevtp = e->currentChunkLocationList;;prevtp = prevtp->next)
             {
                 if(prevtp->next == tp)
                 {
@@ -1098,7 +1116,6 @@
 
     tp->posdata[tp->frequency++] = SET_POSDATA(position,structure);
 
-    return efound;
 }
 
@@ -1427,11 +1444,11 @@
     if(sw->Index->swap_locdata)
     {
         /* jmruiz - Be careful with this lines!!!! If we have a lot of words,
-        ** probably this code can be very slow and may be rethought.
+        ** probably this code can be very slow and may be rethought.
         ** Fortunately, only a few words must usually raise a IgnoreLimit option */
-        last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
-        unSwapLocData(sw, last_loc_swap, ep );
+        last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
+        unSwapLocData(sw, last_loc_swap, ep );
     }
 
     /* Run through location list to get positions */
@@ -1900,46 +1917,46 @@
     n = lastPercent = last_loc_swap = -1;
     for (i = 0; i < VERYBIGHASHSIZE; i++)
     {
-        /* If we are in economic mode -e restore locations */
-        if(sw->Index->swap_locdata)
-        {
-            if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap)
-            {
+        /* If we are in economic mode -e restore locations */
+        if(sw->Index->swap_locdata)
+        {
+            if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap)
+            {
                 /* Free not longer needed memory */
-                Mem_ZoneReset(sw->Index->totalLocZone);
-                last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
-                unSwapLocData(sw, last_loc_swap, NULL );
-            }
-        }
+                Mem_ZoneReset(sw->Index->totalLocZone);
+                last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
+                unSwapLocData(sw, last_loc_swap, NULL );
+            }
+        }
         if ((epi = sw->Index->hashentries[i]))
         {
             while (epi)
             {
                 /* If we are in economic mode -e we must sort locations by metaID, filenum */
-                if(sw->Index->swap_locdata)
-                {
+                if(sw->Index->swap_locdata)
+                {
                     sortSwapLocData(sw, epi);
-                }
+                }
                 if ( sw->verbose && totalwords > 10000 )   // just some random guess
-                {
+                {
                     n++;
                     percent = (n * 100)/totalwords;
                     if (percent - lastPercent >= DELTA )
-                    {
+                    {
                         printf("\r  Writing word data: %3d%%", percent );
                         fflush(stdout);
                         lastPercent = percent;
-                    }
-                }
+                    }
+                }
                 if (epi->u1.wordID > 0)    /* Not a stopword */
-                {
+                {
                     build_worddata(sw, epi, indexf);
                     write_worddata(sw, epi, indexf);
-                }
-                epi = epi->next;
-            }
-        }
-    }
+                }
+                epi = epi->next;
+            }
+        }
+    }
 
     if (sw->verbose)
         printf("\r  Writing word data: Complete\n" );
@@ -1976,7 +1993,7 @@
 
     totalwords = ep->numWords;
 
-    /* Write words */
+    /* Write words */
     DB_InitWriteWords(sw, indexf->DB);
 
     if (sw->verbose)
@@ -2136,7 +2153,7 @@
 
     /* Add the word for each nested metaname. */
     for (i = 0; i < numMetaNames; i++)
-        (void) addentry(sw, word, filenum, structure, metaID[i], *word_position);
+        (void) addentry(sw, getentry(sw,word), filenum, structure, metaID[i], *word_position);
 
     (*word_position)++;
 }
@@ -2499,7 +2516,7 @@
     {
         tmploc = (LOCATION **)coalesced;
         *tmploc = (LOCATION *)e;   /* Preserve e in buffer */
-        /* The cast is for avoiding the warning */
+        /* The cast is for avoiding the warning */
         SwapLocData(sw, e, coalesced, sz_coalesced);
         return;
     }
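The index.c refactor splits the old addentry() into a lookup half and an
append half: getentry() finds or creates the hash entry (a fresh one has
tfrequency == 0, which is how addentry() now detects the first location), and
addentry() only appends positions to the ENTRY it is handed. A self-contained
toy of that find-or-create pattern, with an invented hash and types rather
than swish-e's verybighash/ENTRY:

    #include <stdlib.h>
    #include <string.h>

    #define HASHSZ 1024

    typedef struct Entry {
        struct Entry *next;
        int tfrequency;              /* 0 until the first location is added */
        char word[1];                /* over-allocated to hold the whole string */
    } Entry;

    static Entry *hashtab[HASHSZ];

    static unsigned hashword(const char *w)
    {
        unsigned h = 5381;
        while (*w)
            h = h * 33 + (unsigned char)*w++;
        return h % HASHSZ;
    }

    /* Find the entry for word, creating an empty one on a miss. */
    static Entry *get_entry(const char *word)
    {
        unsigned h = hashword(word);
        Entry *e;

        for (e = hashtab[h]; e; e = e->next)       /* find... */
            if (strcmp(e->word, word) == 0)
                return e;

        e = malloc(sizeof(Entry) + strlen(word));  /* ...or create; word[1] supplies the NUL */
        strcpy(e->word, word);
        e->tfrequency = 0;                         /* marks "no locations yet" */
        e->next = hashtab[h];
        hashtab[h] = e;
        return e;
    }

    int main(void)
    {
        Entry *e = get_entry("swish");
        e->tfrequency = 1;                         /* as addentry() does on first use */
        return get_entry("swish") == e ? 0 : 1;    /* a second lookup finds the same entry */
    }

The indexing path then reads as one getentry() per word followed by one
addentry() per metaname, which is exactly the addentry(sw, getentry(sw,word),
...) call in the hunk above; merge.c instead holds on to the ENTRY and feeds
it thousands of positions.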
Index: merge.c
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/merge.c,v
retrieving revision 1.68
retrieving revision 1.69
diff -u -r1.68 -r1.69
--- merge.c	27 Aug 2002 18:16:59 -0000	1.68
+++ merge.c	29 Aug 2002 13:45:41 -0000	1.69
@@ -45,8 +45,8 @@
 static IndexFILE *get_next_file_in_order( SWISH *sw_input );
 static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output );
 static int *get_map( FILE *filenum_map, IndexFILE *cur_index );
-static void dump_index(SWISH * sw, IndexFILE * indexf, SWISH *sw_output, int *filenum_map );
-static ENTRY *write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, char *resultword, int metaID, int posdata );
+static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output );
+static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata );
 
 // #define DEBUG_MERGE
@@ -62,9 +62,24 @@
     IndexFILE *cur_index;
     FILE *filenum_map;
     char *tmpfilename;
-
-
-
+    struct MOD_Index *idx_output = sw_output->Index;
+    ENTRY *e;
+    int hash,
+        sz_worddata,
+        tmpval,
+        filenum,
+        metaID = 0,
+        frequency,
+        loc_count = 0,
+        word_count = 0;
+    long wordID;
+    unsigned long nextposmetaID = 0L;
+    unsigned char *worddata;
+    unsigned char *s;
+    unsigned char flag;
+    int local_posdata[MAX_STACK_POSITIONS];
+    int *posdata;
+    int i;
 
 /*******************************************************************************
 *  Get ready to merge the indexes.  For each index:
@@ -146,20 +161,144 @@
 *
 ****************************************************************************/
 
+    /* 08/2002 jmruiz
+    ** First of all, get all the words
+    */
     cur_index = sw_input->indexlist;
     while( cur_index )
     {
-        int *file_num_map = get_map( filenum_map, cur_index );
+        dump_index_words(sw_input, cur_index, sw_output);
+        /* Get filr_num_map for later proccess */
+        cur_index->merge_file_num_map = get_map( filenum_map, cur_index );
+        cur_index = cur_index->next;
+    }
+
+    /* At this point we have all the words. Now we have to get worddata
+    * and merge it
+    */
+    word_count = 0;
+    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
+    fflush(stdout);
+    /* walk the hash list to merge worddata */
+    for (hash = 0; hash < VERYBIGHASHSIZE; hash++)
+    {
+        if (idx_output->hashentriesdirty[hash])
+        {
+            idx_output->hashentriesdirty[hash] = 0;
+            for (e = idx_output->hashentries[hash]; e; e = e->next)
+            {
+                word_count++;
+                /* Search the word in all index and get worddata */
+                cur_index = sw_input->indexlist;
+                while( cur_index )
+                {
+                    DB_ReadWordHash(sw_input, e->word, &wordID, cur_index->DB);
+                    /* If word exits in the index */
+                    if(wordID)
+                    {
+
+                        DB_ReadWordData(sw_input, wordID, &worddata, &sz_worddata, cur_index->DB);
 
-        dump_index(sw_input, cur_index, sw_output, file_num_map );
+                        /* Now, parse word's data */
+                        s = worddata;
+                        tmpval = uncompress2(&s);     /* tfrequency */
+                        metaID = uncompress2(&s);     /* metaID */
+                        if (metaID)
+                        {
+                            nextposmetaID = UNPACKLONG2(s);
+                            s += sizeof(long);
+                        }
+
+                        filenum = 0;
+
+                        while(1)
+                        {   /* Read on all items */
+                            uncompress_location_values(&s,&flag,&tmpval,&frequency);
+                            filenum += tmpval;
+                            /* Use stack array when possible to avoid malloc/free overhead */
+                            if(frequency > MAX_STACK_POSITIONS)
+                                posdata = (int *) emalloc(frequency * sizeof(int));
+                            else
+                                posdata = local_posdata;
+
+                            /* Read the positions */
+                            uncompress_location_positions(&s,flag,frequency,posdata);
+
+                            /* now we have the word data */
+                            for (i = 0; i < frequency; i++, loc_count++)
+                                write_word_pos( sw_input, cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]);
+
+                            if(e->tfrequency)
+                            {
+                                /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time
+                                ** to time to help addentry.
+                                ** If we do not do this, addentry routine will have to run linked lists
+                                ** of positions with thousands of elements and makes the merge proccess
+                                ** very slow
+                                */
+                                if(!(loc_count % 100))
+                                    CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
+                            }
+
+                            if(posdata != local_posdata)
+                                efree(posdata);
+
+                            if ((s - worddata) == sz_worddata)
+                                break;   /* End of worddata */
+
+                            if ((unsigned long)(s - worddata) == nextposmetaID)
+                            {
+                                filenum = 0;
+                                metaID = uncompress2(&s);
+                                if (metaID)
+                                {
+                                    nextposmetaID = UNPACKLONG2(s);
+                                    s += sizeof(long);
+                                }
+                                else
+                                    nextposmetaID = 0L;
+                            }
+                        }
+
+                        if(e->tfrequency)
+                            CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
+
+                        efree(worddata);
+                    }
+                    cur_index = cur_index->next;
+                }
+                /* Let's coalesce locations for each word to save memory
+                ** This makes use of the -e feature
+                ** Because we are proccessing one word at a time we can
+                ** coalesce its data just once
+                */
+                coalesce_word_locations(sw_output,sw_output->indexlist,e);
+
+                if(!(word_count % 1000))
+                {
+                    /* Make zone available for reuse and save memory */
+                    Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
+                    sw_output->Index->freeLocMemChain = NULL;
+                    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
+                }
+            }
+        }
+    }
+
+    printf("Processing words in index '%s': %6d words\n", sw_output->indexlist->line, word_count);
+    fflush(stdout);
+
+    cur_index = sw_input->indexlist;
+    while( cur_index )
+    {
         /* free the maps */
-        efree( file_num_map );
+        efree( cur_index->merge_file_num_map );
         efree( cur_index->meta_map );
         cur_index->meta_map = NULL;
-
         cur_index = cur_index->next;
-
     }
@@ -729,25 +868,12 @@
 }
 
 /****************************************************************************
-*  Reads the index and calls write_word_pos
-*
-*  This should be a common, shared function in swish.  Would also be good
-*  written as an iterator function so it retuns a position structure.
-*  Currently, calls write_word_pos for each position.  Would it be better
-*  to call with a LOCATION structure?
-*
+*  Reads the index to get the all the words
 ****************************************************************************/
 
-static void dump_index(SWISH * sw, IndexFILE * indexf, SWISH *sw_output, int *filenum_map )
+static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output)
 {
-    int     i;
     int     j;
-    int     frequency = 0;
-    int     tmpval;
-    int     filenum;
-    int     local_posdata[MAX_STACK_POSITIONS];
-    int    *posdata;
-    int     sz_worddata;
     int     metaname = 0;
     int     word_count = 0;
     int     loc_count = 0;
@@ -755,19 +881,11 @@
     char   *resultword;
     long    wordID;
     unsigned long    nextposmetaname = 0L;
-    unsigned char   *worddata;
-    unsigned char   *s;
-    unsigned char    flag;
-    ENTRY  *e, *tmp;
-    int     hash;
-
 
     DB_InitReadWords(sw, indexf->DB);
 
-    printf("Processing words in index '%s': %3d words\r", indexf->line, word_count);
+    printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
     fflush(stdout);
 
     for(j=0;j<256;j++)
@@ -778,137 +896,19 @@
 
         while(wordID)
         {
-            e = NULL;
-            loc_count = 0;
-
-            /* Read Word's data */
-            DB_ReadWordData(sw, wordID, &worddata, &sz_worddata, indexf->DB);
-
-            /* parse and print word's data */
-            s = worddata;
-
-            tmpval = uncompress2(&s);     /* tfrequency */
-            metaname = uncompress2(&s);   /* metaID */
-
-            if (metaname)
-            {
-                nextposmetaname = UNPACKLONG2(s);
-                s += sizeof(long);
-            }
-
-            filenum = 0;
-
-            while(1)
-            {   /* Read on all items */
-                uncompress_location_values(&s,&flag,&tmpval,&frequency);
-                filenum += tmpval;
-                if(frequency > MAX_STACK_POSITIONS)
-                    posdata = (int *) emalloc(frequency * sizeof(int));
-                else
-                    posdata = local_posdata;
-
-                uncompress_location_positions(&s,flag,frequency,posdata);
-
-                /* now we have the word data */
-                for (i = 0; i < frequency; i++, loc_count++)
-                {
-                    tmp = write_word_pos( sw, indexf, sw_output, filenum_map, filenum, resultword, metaname, posdata[i]);
-                    /* get the pointer to entry for later compress */
-                    if(!e)
-                        e = tmp;
-                }
-
-                if(e)
-                {
-                    /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time
-                    ** to time to help addentry.
-                    ** If we do not do this, addentry routine will have to run linked lists
-                    ** of positions with thousands of elements and makes the merge proccess
-                    ** very slow
-                    */
-                    if(!(loc_count % 100))
-                    {
-                        hash = verybighash(e->word);
-                        if(sw_output->Index->hashentriesdirty[hash])
-                        {
-                            /* Reset the hashentriesdirty flag - Not needed by merge */
-                            sw_output->Index->hashentriesdirty[hash] = 0;
-                            CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
-                        }
-                    }
-                }
-
-                if(posdata != local_posdata)
-                    efree(posdata);
-
-                if ((s - worddata) == sz_worddata)
-                    break;   /* End of worddata */
-
-                if ((unsigned long)(s - worddata) == nextposmetaname)
-                {
-                    filenum = 0;
-                    metaname = uncompress2(&s);
-                    if (metaname)
-                    {
-                        nextposmetaname = UNPACKLONG2(s);
-                        s += sizeof(long);
-                    }
-                    else
-                        nextposmetaname = 0L;
-                }
-            }
-
-            if(e)
-            {
-                /* Reset the hashentriesdirty flag - Not needed by merge */
-                sw_output->Index->hashentriesdirty[verybighash(e->word)] = 0;
-                CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
-                word_count++;
-            }
-
-            /* Let's coalesce location from time to time to save memory */
-            /* FIX 08/2002 jmruiz
-            ** Unfortunately, we cannot coalesce because this routine
-            ** need that the worddata is sorted by filenum. In merge,
-            ** this is not true because new filenums are asigned on the
-            ** fly in write_word_pos routine. If done
-            **
-            ** BTW, if we cannot call coalesce_word_locations, -e is useless
-            ** Probably, this need some redesign ...
-            */
-            if(!(word_count % 1000))
-            {
-                printf("Processing words in index '%s': %6d words\r", indexf->line, word_count);
-                fflush(stdout);
-
-                //coalesce_all_word_locations(sw_output, sw_output->indexlist);
-
-                /* Make zone available for reuse */
-                //Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
-                //sw_output->Index->freeLocMemChain = NULL;
-
-            }
-
-            efree(worddata);
+            /* Add resultword to output */
+            getentry(sw_output, resultword);
 
             efree(resultword);
             DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
+            word_count++;
+            if(!word_count % 10000)
+                printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
         }
     }
 
-    printf("Processing words in index '%s': %6d words\n", indexf->line, word_count);
+    printf("Getting words in index '%s': %6d words\n", indexf->line, word_count);
 
     DB_EndReadWords(sw, indexf->DB);
-
-    /* Coalesce pending work*/
-    /* FIX 08/2002 jmruiz - See above
-    */
-    //coalesce_all_word_locations(sw_output, sw_output->indexlist);
-
-    /* Make zone available for reuse */
-    //Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
-    //sw_output->Index->freeLocMemChain = NULL;
 }
 
 /****************************************************************************
@@ -917,48 +917,47 @@
 *
 ****************************************************************************/
 
-static ENTRY *write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, char *resultword, int metaID, int posdata )
+static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata )
 {
     int     new_file;
     int     new_meta;
 
 #ifdef DEBUG_MERGE
     printf("\nindex %s '%s' Struct: %d Pos: %d",
-        indexf->line, resultword, structure, position );
+        indexf->line, e->word, structure, position );
 
     if ( !(new_file = file_num_map[ filenum ]) )
     {
         printf(" file: %d **File deleted!**\n", filenum);
-        return NULL;
+        return;
     }
 
     if ( !(new_meta = indexf->meta_map[ metaID ] ))
     {
         printf(" file: %d **Failed to map meta ID **\n", filenum);
-        return NULL;
+        return;
     }
 
     printf(" File: %d -> %d  Meta: %d -> %d\n", filenum, new_file, metaID, new_meta );
 
-    return addentry( sw_output, resultword, new_file, structure, metaID, position );
+    addentry( sw_output, e, new_file, structure, metaID, position );
+    return;
 
-    /* Compress the entries ? */
-    //compress_entries( sw_output );  // maybe after every 1000 words or so?
-    // * but does this make it so addentry can't lookup a word?
 #else
 
     if ( !(new_file = file_num_map[ filenum ]) )
-        return NULL;
+        return;
 
     if ( !(new_meta = indexf->meta_map[ metaID ] ))
-        return NULL;
+        return;
 
-    return addentry( sw_output, resultword, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );
+    addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );
+    return;
 
 #endif
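Stripped of the I/O, the new merge is two passes: collect the union of all
words into the output hash, then walk each word once, pulling its postings
from every input index and re-adding them through addentry() with remapped
file numbers (a 0 in the map marks a deleted file). A toy model of the second
pass, with all data and names invented for illustration:

    #include <stdio.h>

    #define NWORDS  3
    #define NIDX    2
    #define MAXPOST 8

    typedef struct {
        const char *word;
        int filenum[MAXPOST];
        int n;
    } Postings;

    /* Two input "indexes"; both list the same words, standing in for the
       pass-one union that getentry() builds in the output index. */
    static Postings input[NIDX][NWORDS] = {
        { {"apple", {1, 2}, 2}, {"pear", {2}, 1}, {"plum", {0},    0} },
        { {"apple", {1},    1}, {"pear", {0}, 0}, {"plum", {1, 3}, 2} },
    };

    /* file_num_map[i][old] = file number in the merged index; 0 = deleted */
    static int file_num_map[NIDX][4] = { {0, 1, 2, 0}, {0, 3, 0, 4} };

    int main(void)
    {
        int w, i, p;

        for (w = 0; w < NWORDS; w++)            /* one word at a time... */
        {
            printf("%s:", input[0][w].word);
            for (i = 0; i < NIDX; i++)          /* ...from every input index */
                for (p = 0; p < input[i][w].n; p++)
                {
                    int newfile = file_num_map[i][input[i][w].filenum[p]];
                    if (newfile)                /* skip postings from deleted files */
                        printf(" file %d", newfile);
                }
            printf("\n");                       /* real code coalesces the word here */
        }
        return 0;
    }

Because each word is finished before the next one starts,
coalesce_word_locations() can run once per word, which is what makes -e
usable again; the old per-index dump_index() order made that impossible, as
the removed FIX comment in dump_index() noted.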