From: Jose R. <jm...@us...> - 2002-08-29 13:45:46
Update of /cvsroot/swishe/swish-e/src
In directory usw-pr-cvs1:/tmp/cvs-serv10039

Modified Files:
	index.h swish.h compress.c index.c merge.c
Log Message:
Here is one more try at the merge issue. Now it should also work with -e.
Also some minor changes to compress.c: added the ability to pass the number
0 to the compress routines, and replaced the hard-coded 5 with
MAXINTCOMPSIZE. BTW, I will also make some other improvements to compress.c.

Index: index.h
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/index.h,v
retrieving revision 1.51
retrieving revision 1.52
diff -u -r1.51 -r1.52
--- index.h	7 Aug 2002 00:28:38 -0000	1.51
+++ index.h	29 Aug 2002 13:45:39 -0000	1.52
@@ -129,7 +129,8 @@
 
 void    do_index_file(SWISH * sw, FileProp * fprop);
 
-ENTRY  *addentry(SWISH *, char *, int, int, int, int);
+ENTRY  *getentry(SWISH * , char *);
+void    addentry(SWISH *, ENTRY *, int, int, int, int);
 
 void    addCommonProperties(SWISH * sw, FileProp * fprop, FileRec * fi, char *title, char *summary, int start);

Index: swish.h
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/swish.h,v
retrieving revision 1.156
retrieving revision 1.157
diff -u -r1.156 -r1.157
--- swish.h	22 Aug 2002 22:58:39 -0000	1.156
+++ swish.h	29 Aug 2002 13:45:40 -0000	1.157
@@ -574,6 +574,8 @@
 
     int     filenum;    // current filenumber to use
 
+    /* Used by merge.c */
+    int    *merge_file_num_map;
 } IndexFILE;

Index: compress.c
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/compress.c,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -r1.47 -r1.48
--- compress.c	16 Jul 2002 19:02:39 -0000	1.47
+++ compress.c	29 Aug 2002 13:45:40 -0000	1.48
@@ -43,8 +43,16 @@
 {
     int _i = 0, _r = num;
-    unsigned char _s[5];
+    unsigned char _s[MAXINTCOMPSIZE];
 
+    /* Trivial case: 0 */
+    if(!_r)
+    {
+        f_putc(0,fp);
+        return;
+    }
+
+    /* Any other case ... */
     while (_r)
     {
         _s[_i++] = _r & 127;
@@ -61,6 +69,14 @@
 {
     int _i = num;
 
+    /* Trivial case: 0 */
+    if(!_i)
+    {
+        *buffer-- = 0;
+        return 0;
+    }
+
+    /* Any other case ... */
     while (_i)
     {
         *buffer = _i & 127;
@@ -80,8 +96,16 @@
 {
     int _i = 0, _r = num;
-    unsigned char _s[5];
+    unsigned char _s[MAXINTCOMPSIZE];
 
+    /* Trivial case: 0 */
+    if(!_r)
+    {
+        *buffer++ = 0;
+        return buffer;
+    }
+
+    /* Any other case ... */
     while (_r)
     {
         _s[_i++] = _r & 127;
@@ -364,7 +388,7 @@
 
     /* check if the work buffer is long enough */
     /* just to avoid bufferoverruns */
-    /* In the worst case and integer will need 5 bytes */
+    /* In the worst case and integer will need MAXINTCOMPSIZE bytes */
     /* but fortunatelly this is very uncommon */
     /* 2002/01 JMRUIZ
@@ -619,7 +643,7 @@
     if(!idx->fp_loc_write[idx_swap_file] && !idx->fp_loc_read[idx_swap_file])
         return;
 
-    /* Check if the file is opened for write and close it */
+    /* Check if the file is opened for write and close it */
     if(idx->fp_loc_write[idx_swap_file])
     {
         /* Write a 0 to mark the end of locations */
@@ -638,7 +662,7 @@
     {
         /* File already opened for read -> reset pointer */
         fseek(idx->fp_loc_read[idx_swap_file],0,SEEK_SET);
-    }
+    }
 
     fp = idx->fp_loc_read[idx_swap_file];
 
     while((lenbuf = uncompress1(fp, idx->swap_getc)))
@@ -649,7 +673,7 @@
         idx->swap_read(buf, lenbuf, 1, fp);
         e = *(ENTRY **)buf;
         /* Store the locations in reverse order - Faster. They will be
-        ** sorted later */
+        ** sorted later */
         l = (LOCATION *) buf;
         l->next = e->allLocationList;
         e->allLocationList = l;
@@ -670,7 +694,7 @@
     }
     else
    {
-        /* Just advance file pointer */
+        /* Just advance file pointer */
         idx->swap_seek(fp,lenbuf - sizeof(ENTRY *),SEEK_CUR);
     }
 }
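For readers following the compress.c changes: the routines store an integer
as big-endian 7-bit chunks with the high bit acting as a continuation flag on
every byte but the last, so the old loop emitted nothing at all for 0. A
minimal standalone sketch of the scheme with the new zero guard (illustrative
vint_* names, not the swish-e functions themselves):

    #include <stdio.h>

    #define MAXINTCOMPSIZE 5    /* a 32-bit value needs at most five 7-bit chunks */

    /* Encode num into buf, most significant chunk first.
       Returns the number of bytes written. */
    static int vint_encode(unsigned int num, unsigned char *buf)
    {
        unsigned char s[MAXINTCOMPSIZE];
        int i = 0, n = 0;

        if (num == 0)               /* the new trivial case: a single zero byte */
        {
            buf[n++] = 0;
            return n;
        }
        while (num)                 /* collect 7-bit chunks, least significant first */
        {
            s[i++] = num & 127;
            num >>= 7;
        }
        while (i--)                 /* emit in reverse; continuation bit on all but the last */
            buf[n++] = (unsigned char)(s[i] | (i ? 128 : 0));
        return n;
    }

    /* Decode a value produced by vint_encode, advancing *p past it. */
    static unsigned int vint_decode(const unsigned char **p)
    {
        unsigned int num = 0;
        unsigned char c;

        do
        {
            c = *(*p)++;
            num = (num << 7) | (c & 127u);
        } while (c & 128);
        return num;
    }

    int main(void)
    {
        unsigned char buf[MAXINTCOMPSIZE];
        const unsigned char *p = buf;

        vint_encode(0, buf);                 /* encodes as the single byte 0x00 */
        printf("%u\n", vint_decode(&p));     /* prints 0 */
        return 0;
    }

Encoding 0 now costs exactly one byte, which is why callers such as the swap
file code can keep using a plain 0 as an end-of-locations marker.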
Index: index.c
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/index.c,v
retrieving revision 1.192
retrieving revision 1.193
diff -u -r1.192 -r1.193
--- index.c	20 Aug 2002 22:24:08 -0000	1.192
+++ index.c	29 Aug 2002 13:45:40 -0000	1.193
@@ -946,16 +946,63 @@
 }
 
+ENTRY *getentry(SWISH * sw, char *word)
+{
+    IndexFILE *indexf = sw->indexlist;
+    struct MOD_Index *idx = sw->Index;
+    int hashval;
+    ENTRY *e;
+
+    if (!idx->entryArray)
+    {
+        idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
+        idx->entryArray->numWords = 0;
+        idx->entryArray->elist = NULL;
+    }
+    /* Compute hash value of word */
+    hashval = verybighash(word);
+
+    /* Look for the word in the hash array */
+    for (e = idx->hashentries[hashval]; e; e = e->next)
+        if (strcmp(e->word, word) == 0)
+            break;
+
+    /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */
+    idx->hashentriesdirty[hashval] = 1;
+
+    /* Word found, return it */
+    if (e)
+        return e;
+
+    /* Word not found, so create a new word */
+    e = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
+    strcpy(e->word, word);
+    e->next = idx->hashentries[hashval];
+    idx->hashentries[hashval] = e;
+
+    /* Init values */
+    e->tfrequency = 0;
+    e->u1.last_filenum = 0;
+    e->currentlocation = NULL;
+    e->currentChunkLocationList = NULL;
+    e->allLocationList = NULL;
+
+    idx->entryArray->numWords++;
+    indexf->header.totalwords++;
+
+    return e;
+}
+
 /* Adds a word to the master index tree.
 */
-ENTRY *addentry(SWISH * sw, char *word, int filenum, int structure, int metaID, int position)
+void addentry(SWISH * sw, ENTRY *e, int filenum, int structure, int metaID, int position)
 {
     int found;
-    ENTRY *en, *efound;
     LOCATION *tp, *newtp, *prevtp;
-    int hashval;
     IndexFILE *indexf = sw->indexlist;
     struct MOD_Index *idx = sw->Index;
@@ -966,7 +1013,7 @@
     {
         struct metaEntry *m = getMetaNameByID(&indexf->header, metaID);
 
-        printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, word, position, structure);
+        printf(" Adding:[%d:%s(%d)] '%s' Pos:%d Stuct:0x%0X (", filenum, m ? m->metaName : "PROP_UNKNOWN", metaID, e->word, position, structure);
 
         if ( structure & IN_EMPHASIZED ) printf(" EM");
         if ( structure & IN_HEADER ) printf(" HEADING");
@@ -979,52 +1026,23 @@
         printf(" )\n");
     }
 
-    if (!idx->entryArray)
-    {
-        idx->entryArray = (ENTRYARRAY *) emalloc(sizeof(ENTRYARRAY));
-        idx->entryArray->numWords = 0;
-        idx->entryArray->elist = NULL;
-    }
-    /* Compute hash value of word */
-    hashval = verybighash(word);
-
-    /* Look for the word in the hash array */
-    for (efound = idx->hashentries[hashval]; efound; efound = efound->next)
-        if (strcmp(efound->word, word) == 0)
-            break;
-
-    /* flag hash entry used this file, so that the locations can be "compressed" in do_index_file */
-    idx->hashentriesdirty[hashval] = 1;
-
-    /* Word not found, so create a new word */
-    if (!efound)
+    /* Check for first time */
+    if(!e->tfrequency)
     {
-        en = (ENTRY *) Mem_ZoneAlloc(idx->entryZone, sizeof(ENTRY) + strlen(word));
-        strcpy(en->word, word);
-        en->tfrequency = 1;
-        en->u1.last_filenum = filenum;
-        en->next = idx->hashentries[hashval];
-        idx->hashentries[hashval] = en;
-
         /* create a location record */
         tp = (LOCATION *) new_location(idx);
         tp->filenum = filenum;
         tp->frequency = 1;
         tp->metaID = metaID;
         tp->posdata[0] = SET_POSDATA(position,structure);
 
         tp->next = NULL;
-        en->currentlocation = NULL;
-        en->currentChunkLocationList = tp;
-        en->allLocationList = NULL;
-        idx->entryArray->numWords++;
-        indexf->header.totalwords++;
+        e->currentChunkLocationList = tp;
+        e->tfrequency = 1;
+        e->u1.last_filenum = filenum;
 
-        return en; /* all done here */
+        return;
     }
 
@@ -1032,10 +1050,10 @@
     /* Note: filename not needed due to compress we are only looking at the current file */
     /* Oct 18, 2001 -- filename is needed since merge adds words in non-filenum order */
 
-    tp = efound->currentChunkLocationList;
+    tp = e->currentChunkLocationList;
     found = 0;
 
-    while (tp != efound->currentlocation)
+    while (tp != e->currentlocation)
     {
         if(tp->metaID == metaID && tp->filenum == filenum )
         {
@@ -1058,17 +1076,17 @@
         tp->posdata[0] = SET_POSDATA(position,structure);
 
         /* add the new LOCATION onto the array */
-        tp->next = efound->currentChunkLocationList;
-        efound->currentChunkLocationList = tp;
+        tp->next = e->currentChunkLocationList;
+        e->currentChunkLocationList = tp;
 
         /* Count number of different files that this word is used in */
-        if ( efound->u1.last_filenum != filenum )
+        if ( e->u1.last_filenum != filenum )
         {
-            efound->tfrequency++;
-            efound->u1.last_filenum = filenum;
+            e->tfrequency++;
+            e->u1.last_filenum = filenum;
         }
 
-        return efound; /* all done */
+        return; /* all done */
     }
 
@@ -1082,10 +1100,10 @@
 
     if(newtp != tp)
     {
-        if(efound->currentChunkLocationList == tp)
-            efound->currentChunkLocationList = newtp;
+        if(e->currentChunkLocationList == tp)
+            e->currentChunkLocationList = newtp;
         else
-            for(prevtp = efound->currentChunkLocationList;;prevtp = prevtp->next)
+            for(prevtp = e->currentChunkLocationList;;prevtp = prevtp->next)
             {
                 if(prevtp->next == tp)
                 {
@@ -1098,7 +1116,6 @@
 
     tp->posdata[tp->frequency++] = SET_POSDATA(position,structure);
 
-    return efound;
 }
 
@@ -1427,11 +1444,11 @@
     if(sw->Index->swap_locdata)
     {
         /* jmruiz - Be careful with this lines!!!! If we have a lot of words,
-        ** probably this code can be very slow and may be rethought.
+        ** probably this code can be very slow and may be rethought.
         ** Fortunately, only a few words must usually raise a IgnoreLimit option */
-        last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
-        unSwapLocData(sw, last_loc_swap, ep );
+        last_loc_swap = (verybighash(ep->word) * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
+        unSwapLocData(sw, last_loc_swap, ep );
     }
 
     /* Run through location list to get positions */
@@ -1900,46 +1917,46 @@
     n = lastPercent = last_loc_swap = -1;
     for (i = 0; i < VERYBIGHASHSIZE; i++)
     {
-        /* If we are in economic mode -e restore locations */
-        if(sw->Index->swap_locdata)
-        {
-            if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap)
-            {
+        /* If we are in economic mode -e restore locations */
+        if(sw->Index->swap_locdata)
+        {
+            if (((i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1)) != last_loc_swap)
+            {
                 /* Free not longer needed memory */
-                Mem_ZoneReset(sw->Index->totalLocZone);
-                last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
-                unSwapLocData(sw, last_loc_swap, NULL );
-            }
-        }
+                Mem_ZoneReset(sw->Index->totalLocZone);
+                last_loc_swap = (i * (MAX_LOC_SWAP_FILES - 1)) / (VERYBIGHASHSIZE - 1);
+                unSwapLocData(sw, last_loc_swap, NULL );
+            }
+        }
         if ((epi = sw->Index->hashentries[i]))
         {
             while (epi)
             {
                 /* If we are in economic mode -e we must sort locations by metaID, filenum */
-                if(sw->Index->swap_locdata)
-                {
+                if(sw->Index->swap_locdata)
+                {
                     sortSwapLocData(sw, epi);
-                }
+                }
                 if ( sw->verbose && totalwords > 10000 )   // just some random guess
-                {
+                {
                     n++;
                     percent = (n * 100)/totalwords;
                     if (percent - lastPercent >= DELTA )
-                    {
+                    {
                         printf("\r  Writing word data: %3d%%", percent );
                         fflush(stdout);
                         lastPercent = percent;
-                    }
-                }
+                    }
+                }
                 if (epi->u1.wordID > 0)    /* Not a stopword */
-                {
+                {
                     build_worddata(sw, epi, indexf);
                     write_worddata(sw, epi, indexf);
-                }
-                epi = epi->next;
-            }
-        }
-    }
+                }
+                epi = epi->next;
+            }
+        }
+    }
 
     if (sw->verbose)
         printf("\r  Writing word data: Complete\n" );
@@ -1976,7 +1993,7 @@
 
     totalwords = ep->numWords;
 
-    /* Write words */
+    /* Write words */
     DB_InitWriteWords(sw, indexf->DB);
 
     if (sw->verbose)
@@ -2136,7 +2153,7 @@
 
     /* Add the word for each nested metaname. */
     for (i = 0; i < numMetaNames; i++)
-        (void) addentry(sw, word, filenum, structure, metaID[i], *word_position);
+        (void) addentry(sw, getentry(sw,word), filenum, structure, metaID[i], *word_position);
 
     (*word_position)++;
 }
@@ -2499,7 +2516,7 @@
     {
         tmploc = (LOCATION **)coalesced;
         *tmploc = (LOCATION *)e;   /* Preserve e in buffer */
-        /* The cast is for avoiding the warning */
+        /* The cast is for avoiding the warning */
         SwapLocData(sw, e, coalesced, sz_coalesced);
         return;
     }
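The index.c refactor splits the old addentry() into a lookup half and an
append half: getentry() finds or creates the hash entry (a fresh one has
tfrequency == 0, which is how addentry() now detects the first location), and
addentry() only appends positions to the ENTRY it is handed. A self-contained
toy of that find-or-create pattern, with an invented hash and types rather
than swish-e's verybighash/ENTRY:

    #include <stdlib.h>
    #include <string.h>

    #define HASHSZ 1024

    typedef struct Entry {
        struct Entry *next;
        int tfrequency;              /* 0 until the first location is added */
        char word[1];                /* over-allocated to hold the whole string */
    } Entry;

    static Entry *hashtab[HASHSZ];

    static unsigned hashword(const char *w)
    {
        unsigned h = 5381;
        while (*w)
            h = h * 33 + (unsigned char)*w++;
        return h % HASHSZ;
    }

    /* Find the entry for word, creating an empty one on a miss. */
    static Entry *get_entry(const char *word)
    {
        unsigned h = hashword(word);
        Entry *e;

        for (e = hashtab[h]; e; e = e->next)       /* find... */
            if (strcmp(e->word, word) == 0)
                return e;

        e = malloc(sizeof(Entry) + strlen(word));  /* ...or create; word[1] supplies the NUL */
        strcpy(e->word, word);
        e->tfrequency = 0;                         /* marks "no locations yet" */
        e->next = hashtab[h];
        hashtab[h] = e;
        return e;
    }

    int main(void)
    {
        Entry *e = get_entry("swish");
        e->tfrequency = 1;                         /* as addentry() does on first use */
        return get_entry("swish") == e ? 0 : 1;    /* a second lookup finds the same entry */
    }

The indexing path then reads as one getentry() per word followed by one
addentry() per metaname, which is exactly the addentry(sw, getentry(sw,word),
...) call in the hunk above; merge.c instead holds on to the ENTRY and feeds
it thousands of positions.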
Index: merge.c
===================================================================
RCS file: /cvsroot/swishe/swish-e/src/merge.c,v
retrieving revision 1.68
retrieving revision 1.69
diff -u -r1.68 -r1.69
--- merge.c	27 Aug 2002 18:16:59 -0000	1.68
+++ merge.c	29 Aug 2002 13:45:41 -0000	1.69
@@ -45,8 +45,8 @@
 static IndexFILE *get_next_file_in_order( SWISH *sw_input );
 static void add_file( FILE *filenum_map, IndexFILE *cur_index, SWISH *sw_input, SWISH *sw_output );
 static int *get_map( FILE *filenum_map, IndexFILE *cur_index );
-static void dump_index(SWISH * sw, IndexFILE * indexf, SWISH *sw_output, int *filenum_map );
-static ENTRY *write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, char *resultword, int metaID, int posdata );
+static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output );
+static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata );
 
 // #define DEBUG_MERGE
@@ -62,9 +62,24 @@
     IndexFILE *cur_index;
     FILE *filenum_map;
     char *tmpfilename;
-
-
-
+    struct MOD_Index *idx_output = sw_output->Index;
+    ENTRY *e;
+    int hash,
+        sz_worddata,
+        tmpval,
+        filenum,
+        metaID = 0,
+        frequency,
+        loc_count = 0,
+        word_count = 0;
+    long wordID;
+    unsigned long nextposmetaID = 0L;
+    unsigned char *worddata;
+    unsigned char *s;
+    unsigned char flag;
+    int local_posdata[MAX_STACK_POSITIONS];
+    int *posdata;
+    int i;
 
 /*******************************************************************************
 *  Get ready to merge the indexes.  For each index:
@@ -146,20 +161,144 @@
 *
 ****************************************************************************/
 
+    /* 08/2002 jmruiz
+    ** First of all, get all the words
+    */
     cur_index = sw_input->indexlist;
     while( cur_index )
     {
-        int *file_num_map = get_map( filenum_map, cur_index );
+        dump_index_words(sw_input, cur_index, sw_output);
+        /* Get filr_num_map for later proccess */
+        cur_index->merge_file_num_map = get_map( filenum_map, cur_index );
+        cur_index = cur_index->next;
+    }
+
+    /* At this point we have all the words. Now we have to get worddata
+    * and merge it
+    */
+    word_count = 0;
+    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
+    fflush(stdout);
+    /* walk the hash list to merge worddata */
+    for (hash = 0; hash < VERYBIGHASHSIZE; hash++)
+    {
+        if (idx_output->hashentriesdirty[hash])
+        {
+            idx_output->hashentriesdirty[hash] = 0;
+            for (e = idx_output->hashentries[hash]; e; e = e->next)
+            {
+                word_count++;
+                /* Search the word in all index and get worddata */
+                cur_index = sw_input->indexlist;
+                while( cur_index )
+                {
+                    DB_ReadWordHash(sw_input, e->word, &wordID, cur_index->DB);
+                    /* If word exits in the index */
+                    if(wordID)
+                    {
+
+                        DB_ReadWordData(sw_input, wordID, &worddata, &sz_worddata, cur_index->DB);
 
-        dump_index(sw_input, cur_index, sw_output, file_num_map );
+                        /* Now, parse word's data */
+                        s = worddata;
+                        tmpval = uncompress2(&s);     /* tfrequency */
+                        metaID = uncompress2(&s);     /* metaID */
+                        if (metaID)
+                        {
+                            nextposmetaID = UNPACKLONG2(s);
+                            s += sizeof(long);
+                        }
+
+                        filenum = 0;
+
+                        while(1)
+                        {   /* Read on all items */
+                            uncompress_location_values(&s,&flag,&tmpval,&frequency);
+                            filenum += tmpval;
+                            /* Use stack array when possible to avoid malloc/free overhead */
+                            if(frequency > MAX_STACK_POSITIONS)
+                                posdata = (int *) emalloc(frequency * sizeof(int));
+                            else
+                                posdata = local_posdata;
+
+                            /* Read the positions */
+                            uncompress_location_positions(&s,flag,frequency,posdata);
+
+                            /* now we have the word data */
+                            for (i = 0; i < frequency; i++, loc_count++)
+                                write_word_pos( sw_input, cur_index, sw_output, cur_index->merge_file_num_map, filenum, e, metaID, posdata[i]);
+
+                            if(e->tfrequency)
+                            {
+                                /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time
+                                ** to time to help addentry.
+                                ** If we do not do this, addentry routine will have to run linked lists
+                                ** of positions with thousands of elements and makes the merge proccess
+                                ** very slow
+                                */
+                                if(!(loc_count % 100))
+                                    CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
+                            }
+
+                            if(posdata != local_posdata)
+                                efree(posdata);
+
+                            if ((s - worddata) == sz_worddata)
+                                break;   /* End of worddata */
+
+                            if ((unsigned long)(s - worddata) == nextposmetaID)
+                            {
+                                filenum = 0;
+                                metaID = uncompress2(&s);
+                                if (metaID)
+                                {
+                                    nextposmetaID = UNPACKLONG2(s);
+                                    s += sizeof(long);
+                                }
+                                else
+                                    nextposmetaID = 0L;
+                            }
+                        }
+
+                        if(e->tfrequency)
+                            CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
+
+                        efree(worddata);
+                    }
+                    cur_index = cur_index->next;
+                }
+                /* Let's coalesce locations for each word to save memory
+                ** This makes use of the -e feature
+                ** Because we are proccessing one word at a time we can
+                ** coalesce its data just once
+                */
+                coalesce_word_locations(sw_output,sw_output->indexlist,e);
+
+                if(!(word_count % 1000))
+                {
+                    /* Make zone available for reuse and save memory */
+                    Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
+                    sw_output->Index->freeLocMemChain = NULL;
+                    printf("Processing words in index '%s': %6d words\r", sw_output->indexlist->line, word_count);
+                }
+            }
+        }
+    }
+
+    printf("Processing words in index '%s': %6d words\n", sw_output->indexlist->line, word_count);
+    fflush(stdout);
+
+    cur_index = sw_input->indexlist;
+    while( cur_index )
+    {
         /* free the maps */
-        efree( file_num_map );
+        efree( cur_index->merge_file_num_map );
         efree( cur_index->meta_map );
         cur_index->meta_map = NULL;
-
         cur_index = cur_index->next;
-
     }
@@ -729,25 +868,12 @@
 }
 
 /****************************************************************************
-*  Reads the index and calls write_word_pos
-*
-*  This should be a common, shared function in swish.  Would also be good
-*  written as an iterator function so it retuns a position structure.
-*  Currently, calls write_word_pos for each position.  Would it be better
-*  to call with a LOCATION structure?
-*
+*  Reads the index to get the all the words
 ****************************************************************************/
 
-static void dump_index(SWISH * sw, IndexFILE * indexf, SWISH *sw_output, int *filenum_map )
+static void dump_index_words(SWISH * sw, IndexFILE * indexf, SWISH *sw_output)
 {
-    int     i;
     int     j;
-    int     frequency = 0;
-    int     tmpval;
-    int     filenum;
-    int     local_posdata[MAX_STACK_POSITIONS];
-    int    *posdata;
-    int     sz_worddata;
     int     metaname = 0;
     int     word_count = 0;
     int     loc_count = 0;
@@ -755,19 +881,11 @@
     char   *resultword;
     long    wordID;
     unsigned long    nextposmetaname = 0L;
-    unsigned char   *worddata;
-    unsigned char   *s;
-    unsigned char    flag;
-    ENTRY  *e, *tmp;
-    int     hash;
-
 
     DB_InitReadWords(sw, indexf->DB);
 
-    printf("Processing words in index '%s': %3d words\r", indexf->line, word_count);
+    printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
     fflush(stdout);
 
     for(j=0;j<256;j++)
@@ -778,137 +896,19 @@
 
         while(wordID)
         {
-            e = NULL;
-            loc_count = 0;
-
-            /* Read Word's data */
-            DB_ReadWordData(sw, wordID, &worddata, &sz_worddata, indexf->DB);
-
-            /* parse and print word's data */
-            s = worddata;
-
-            tmpval = uncompress2(&s);     /* tfrequency */
-            metaname = uncompress2(&s);   /* metaID */
-
-            if (metaname)
-            {
-                nextposmetaname = UNPACKLONG2(s);
-                s += sizeof(long);
-            }
-
-            filenum = 0;
-
-            while(1)
-            {   /* Read on all items */
-                uncompress_location_values(&s,&flag,&tmpval,&frequency);
-                filenum += tmpval;
-                if(frequency > MAX_STACK_POSITIONS)
-                    posdata = (int *) emalloc(frequency * sizeof(int));
-                else
-                    posdata = local_posdata;
-
-                uncompress_location_positions(&s,flag,frequency,posdata);
-
-                /* now we have the word data */
-                for (i = 0; i < frequency; i++, loc_count++)
-                {
-                    tmp = write_word_pos( sw, indexf, sw_output, filenum_map, filenum, resultword, metaname, posdata[i]);
-                    /* get the pointer to entry for later compress */
-                    if(!e)
-                        e = tmp;
-                }
-
-                if(e)
-                {
-                    /* 08/2002 jmruiz - We will call CompressCurrentLocEntry from time
-                    ** to time to help addentry.
-                    ** If we do not do this, addentry routine will have to run linked lists
-                    ** of positions with thousands of elements and makes the merge proccess
-                    ** very slow
-                    */
-                    if(!(loc_count % 100))
-                    {
-                        hash = verybighash(e->word);
-                        if(sw_output->Index->hashentriesdirty[hash])
-                        {
-                            /* Reset the hashentriesdirty flag - Not needed by merge */
-                            sw_output->Index->hashentriesdirty[hash] = 0;
-                            CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
-                        }
-                    }
-                }
-
-                if(posdata != local_posdata)
-                    efree(posdata);
-
-                if ((s - worddata) == sz_worddata)
-                    break;   /* End of worddata */
-
-                if ((unsigned long)(s - worddata) == nextposmetaname)
-                {
-                    filenum = 0;
-                    metaname = uncompress2(&s);
-                    if (metaname)
-                    {
-                        nextposmetaname = UNPACKLONG2(s);
-                        s += sizeof(long);
-                    }
-                    else
-                        nextposmetaname = 0L;
-                }
-            }
-
-            if(e)
-            {
-                /* Reset the hashentriesdirty flag - Not needed by merge */
-                sw_output->Index->hashentriesdirty[verybighash(e->word)] = 0;
-                CompressCurrentLocEntry(sw_output, sw_output->indexlist, e);
-                word_count++;
-            }
-
-            /* Let's coalesce location from time to time to save memory */
-            /* FIX 08/2002 jmruiz
-            ** Unfortunately, we cannot coalesce because this routine
-            ** need that the worddata is sorted by filenum. In merge,
-            ** this is not true because new filenums are asigned on the
-            ** fly in write_word_pos routine. If done
-            **
-            ** BTW, if we cannot call coalesce_word_locations, -e is useless
-            ** Probably, this need some redesign ...
-            */
-            if(!(word_count % 1000))
-            {
-                printf("Processing words in index '%s': %6d words\r", indexf->line, word_count);
-                fflush(stdout);
-
-                //coalesce_all_word_locations(sw_output, sw_output->indexlist);
-
-                /* Make zone available for reuse */
-                //Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
-                //sw_output->Index->freeLocMemChain = NULL;
-
-            }
-
-            efree(worddata);
+            /* Add resultword to output */
+            getentry(sw_output, resultword);
 
             efree(resultword);
             DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
+            word_count++;
+            if(!word_count % 10000)
+                printf("Getting words in index '%s': %3d words\r", indexf->line, word_count);
         }
     }
 
-    printf("Processing words in index '%s': %6d words\n", indexf->line, word_count);
+    printf("Getting words in index '%s': %6d words\n", indexf->line, word_count);
 
     DB_EndReadWords(sw, indexf->DB);
-
-    /* Coalesce pending work*/
-    /* FIX 08/2002 jmruiz - See above
-    */
-    //coalesce_all_word_locations(sw_output, sw_output->indexlist);
-
-    /* Make zone available for reuse */
-    //Mem_ZoneReset(sw_output->Index->currentChunkLocZone);
-    //sw_output->Index->freeLocMemChain = NULL;
 }
 
 /****************************************************************************
@@ -917,48 +917,47 @@
 *
 ****************************************************************************/
 
-static ENTRY *write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, char *resultword, int metaID, int posdata )
+static void write_word_pos( SWISH *sw_input, IndexFILE *indexf, SWISH *sw_output, int *file_num_map, int filenum, ENTRY *e, int metaID, int posdata )
 {
     int     new_file;
     int     new_meta;
 
 #ifdef DEBUG_MERGE
     printf("\nindex %s '%s' Struct: %d Pos: %d",
-        indexf->line, resultword, structure, position );
+        indexf->line, e->word, structure, position );
 
     if ( !(new_file = file_num_map[ filenum ]) )
     {
         printf(" file: %d **File deleted!**\n", filenum);
-        return NULL;
+        return;
     }
 
     if ( !(new_meta = indexf->meta_map[ metaID ] ))
     {
         printf(" file: %d **Failed to map meta ID **\n", filenum);
-        return NULL;
+        return;
     }
 
     printf(" File: %d -> %d  Meta: %d -> %d\n", filenum, new_file, metaID, new_meta );
 
-    return addentry( sw_output, resultword, new_file, structure, metaID, position );
+    addentry( sw_output, e, new_file, structure, metaID, position );
+    return;
 
-    /* Compress the entries ? */
-    //compress_entries( sw_output );  // maybe after every 1000 words or so?
-    // * but does this make it so addentry can't lookup a word?
 #else
 
     if ( !(new_file = file_num_map[ filenum ]) )
-        return NULL;
+        return;
 
     if ( !(new_meta = indexf->meta_map[ metaID ] ))
-        return NULL;
+        return;
 
-    return addentry( sw_output, resultword, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );
+    addentry( sw_output, e, new_file, GET_STRUCTURE(posdata), metaID, GET_POSITION(posdata) );
+    return;
 
 #endif
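Stripped of the I/O, the new merge is two passes: collect the union of all
words into the output hash, then walk each word once, pulling its postings
from every input index and re-adding them through addentry() with remapped
file numbers (a 0 in the map marks a deleted file). A toy model of the second
pass, with all data and names invented for illustration:

    #include <stdio.h>

    #define NWORDS  3
    #define NIDX    2
    #define MAXPOST 8

    typedef struct {
        const char *word;
        int filenum[MAXPOST];
        int n;
    } Postings;

    /* Two input "indexes"; both list the same words, standing in for the
       pass-one union that getentry() builds in the output index. */
    static Postings input[NIDX][NWORDS] = {
        { {"apple", {1, 2}, 2}, {"pear", {2}, 1}, {"plum", {0},    0} },
        { {"apple", {1},    1}, {"pear", {0}, 0}, {"plum", {1, 3}, 2} },
    };

    /* file_num_map[i][old] = file number in the merged index; 0 = deleted */
    static int file_num_map[NIDX][4] = { {0, 1, 2, 0}, {0, 3, 0, 4} };

    int main(void)
    {
        int w, i, p;

        for (w = 0; w < NWORDS; w++)            /* one word at a time... */
        {
            printf("%s:", input[0][w].word);
            for (i = 0; i < NIDX; i++)          /* ...from every input index */
                for (p = 0; p < input[i][w].n; p++)
                {
                    int newfile = file_num_map[i][input[i][w].filenum[p]];
                    if (newfile)                /* skip postings from deleted files */
                        printf(" file %d", newfile);
                }
            printf("\n");                       /* real code coalesces the word here */
        }
        return 0;
    }

Because each word is finished before the next one starts,
coalesce_word_locations() can run once per word, which is what makes -e
usable again; the old per-index dump_index() order made that impossible, as
the removed FIX comment in dump_index() noted.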