/* * CDE - Common Desktop Environment * * Copyright (c) 1993-2012, The Open Group. All rights reserved. * * These libraries and programs are free software; you can * redistribute them and/or modify them under the terms of the GNU * Lesser General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) * any later version. * * These libraries and programs are distributed in the hope that * they will be useful, but WITHOUT ANY WARRANTY; without even the * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public * License along with these libraries and programs; if not, write * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth * Floor, Boston, MA 02110-1301 USA */ /* * COMPONENT_NAME: austext * * FUNCTIONS: descend_tree * displayable * fill_data1 * load_into_bintree * main * print_exit_code * print_usage_msg * put_addrs_2_dtbs_addr_file * segregate_dicname * traverse_tree * user_args_processor * write_2_dtbs_addr_file * write_new_word_2_dtbs * write_to_file * * ORIGINS: 27 * * * (C) COPYRIGHT International Business Machines Corp. 1992,1996 * All Rights Reserved * Licensed Materials - Property of IBM * US Government Users Restricted Rights - Use, duplication or * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ /************************ DTSRINDEX.C ******************************* * $XConsortium: dtsrindex.c /main/10 1996/09/23 21:02:54 cde-ibm $ * CDE version of borodin.c * Formerly dtsrindex.c was cborodin.c. * * INPUT FORMAT: * Text file in FZK format. * Each record contains 4 formatted 'lines' (text strings ending in \n): * 1. fzkey (not used in this program). * 2. abstract (not used in this program). * 3. unique database key for the record. 
Used to find the database * address of the record which is the reference for the inverted index. * 4. The record's date (not used in this program). * * The rest of the record is unformatted text (not necessarily organized * into 'lines'). It is read a character at a time and parsed into * individual words by the parser function for the database's language. * Each record ends with a delimiter string specified by command line arg. * * $Log$ * Revision 2.8 1996/04/10 19:50:38 miker * Deleted dangerous and unnecessary -a option. * * Revision 2.7 1996/03/25 18:54:15 miker * Changed FILENAME_MAX to _POSIX_PATH_MAX. * * Revision 2.6 1996/02/01 18:25:44 miker * AusText 2.1.11, DtSearch 0.3. Pass 1 changed to accommodate * new single-character reading parser/stemmers. * * Revision 2.5 1995/12/29 17:16:04 miker * Bug fix: Opened wrong msg catalog. * * Revision 2.4 1995/12/27 21:18:40 miker * Msg bug: 'percent done' was negative number. * * Revision 2.3 1995/12/01 16:15:44 miker * Deleted unnecessary log2 var, conflict with Solaris function. * Added -r command line arg. * * Revision 2.2 1995/10/26 15:26:53 miker * Added prolog. * * Revision 2.1 1995/09/22 19:29:53 miker * Freeze DtSearch 0.1, AusText 2.1.8 * * Revision 1.3 1995/09/05 21:08:54 miker * Fixed bug: appeared as if 1 and 2 char 'words' were being indexed. * Added DEBUG_P switch. * * Revision 1.2 1995/09/01 22:17:02 miker * Fixed solaris segfault: too many args to printf in print_usage(). * * Revision 1.1 1995/08/31 20:51:08 miker * Initial revision of dtsrindex.c, copied from cborodin.c. * * Log: cborodin.c,v * Revision 1.18 1995/05/30 18:58:54 miker * Correct bug introduced by previous fix (2.1.5c). * * Revision 1.17 1995/05/18 22:54:08 miker * 2.1.5b cborodin bug. Segfault due to overflowing bitvector * after many deletions and no mrclean. 
*/ #include "SearchP.h" #include #include #include #include #include #include #include #include #include #include #include "vista.h" extern void find_keyword (char *cur_word, int vista_num); extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num); extern void write_wordstr (struct or_hwordrec * glob_word, int vista_num); extern void fill_data1 (char *ch); #define PROGNAME "DTSRINDEX" #define BATCH_SIZE 10000L #define WORDS_PER_DOT 500 #define RECS_PER_DOT 20 #define INBUFSZ 1024 /* default input text header line size */ #define MS_misc 1 #define MS_cborodin 14 /******************* BIT VECTORS *****************/ DB_ADDR *word_addrs_ii; /* fread buf for d99 (= tot # dbas) */ DtSrINT32 *dbas_word_count; char *dbas_bits_batch; DB_ADDR *record_addr_word; DtSrINT32 num_addrs_for_word; DtSrINT32 or_reccount; DtSrINT32 bit_vector_size; /*-------------------------- GLOBALS ----------------------------*/ /* batch_size also used by fileman.c for allocating unused holes * in order to no go past end of 'record_addr_word' array. 
*/ extern DtSrINT32 batch_size; char buf[1024]; static int cache_size = CACHE_SIZE; static int check_existing_addrs = TRUE; long count_word_ii = 0L; long dbkey_seqno = 0L; DBLK dblk; DBREC dbrec; static int debugging = 0; #define DEBUG_I 0x01 /* P1 tree insertions */ #define DEBUG_P 0x10 /* P1 parser/stemmer */ #define DEBUG_T 0x02 /* P2 tree dump (words) */ #define DEBUG_N 0x04 /* P2 NEW words, vista */ #define DEBUG_O 0x08 /* P2 OLD words, vista) */ #define DEBUG_t 0x20 /* P2 tree dump (dbas) */ #define DEBUG_n 0x40 /* P2 NEW d99 for new words */ #define DEBUG_o 0x80 /* P2 OLD d99 updates for old words */ static unsigned long default_hashsize; char dicname [10]; char dicpath [_POSIX_PATH_MAX]; static int dotcount = 0; char dtbs_addr_file [_POSIX_PATH_MAX]; FILE *dtbs_addr_fp; long dtbs_size_records = 0L; static long duplicate_recids = 0L; struct stat fstat_input; FILE_HEADER fl_hdr; static char fname_input [_POSIX_PATH_MAX]; struct or_hwordrec got_word; static FILE *instream; char *inbuf; int inbuf_overflowed = FALSE; size_t inbufsz = INBUFSZ; int is_pmr; static DtSrINT32 or_maxdba = 0; static char msg_374[] = "\n%s Out of Memory!\n" " Split the incoming file into several " "smaller files and try again.\n"; static char msg_776[] = "\n%s Write Failure d99 file: %s\n"; char new_dtbs_file = FALSE; long num_of_diff_words = 0L; int normal_retncode = 0; static PARG parg; int parsep_char = END_RETAIN_PAGE; char rec_type; unsigned long record_count = 0UL; int record_lines; static int recs_per_dot = RECS_PER_DOT; static unsigned long seconds_left; extern int shutdown_now; static DtSrINT32 or_recslots; char *sprintbuffer = NULL; char *temp = NULL; extern int debugging_teskey; time_t timestart = 0; time_t totalstart = 0; static int words_per_dot = WORDS_PER_DOT; /************************************************/ /* */ /* DBALIST */ /* */ /************************************************/ typedef struct dba_str { DB_ADDR dba; DtSrINT32 w_c; struct dba_str *next_dba; } 
DBALIST; /************************************************/ /* */ /* TREENODE */ /* */ /************************************************/ typedef struct _treen_ { char *word; /* ptr to word in stop list */ struct _treen_ *llink; /* left link in binary tree */ struct _treen_ *rlink; /* ptr to right link in binary tree */ DBALIST *dba_list; } TREENODE; static TREENODE *root_node = NULL; static TREENODE *top_of_stack; static TREENODE *stack; static TREENODE *pres; static TREENODE *prev; static TREENODE *next; static TREENODE *avail_node; /************************************************/ /* */ /* displayable */ /* */ /************************************************/ /* Returns static string same as passed string except nonprintable * and nonascii chars replaced by '^' for display. */ static char *displayable (char *passed_string) { static char *buf = NULL; static size_t buflen = 0; size_t passed_len = strlen (passed_string); char *targ, *src; if (buflen < passed_len) { if (buf) free (buf); buflen = passed_len; buf = austext_malloc (buflen + 4, PROGNAME"158", NULL); } targ = buf; for (src = passed_string; *src != 0; src++) { if (*src >= 32 && *src < 127) *targ++ = *src; else *targ++ = '^'; } *targ = 0; return buf; } /* displayable() */ /************************************************/ /* */ /* print_exit_code */ /* */ /************************************************/ /* Called from inside DtSearchExit() at (*austext_exit_last)() */ static void print_exit_code (int exit_code) { if(dotcount) { putchar ('\n'); dotcount = 0; } /* Put total seconds into totalstart */ if (totalstart > 0) totalstart = time (NULL) - totalstart; printf (catgets (dtsearch_catd, MS_cborodin, 206, "%s: Exit Code = %d, Total elapsed time %ldm %lds.\n"), aa_argv0, exit_code, totalstart / 60L, totalstart % 60L); return; } /* print_exit_code() */ /****************************************/ /* */ /* write_to_file() */ /* */ /****************************************/ /* This is the 'visit node' point 
for the tree traversal * functions of Pass 2 (traverse_tree() and descend_tree()). * * Each tree node = word or stem + linked list of dbas. * When called, each dba list member just contains the number * of times the token appears in that document. This function * chains through the list, builds a statistical 'weight' * for each doc/word pair, and stores it as a reformatted 'dba' * in array 'record_addr_word[]', in 'host' byte swap order. * The count of the current number of addrs * in the array is stored in 'num_addrs_for_word'. * Fill_data1() is then called to update or write a new * vista record and d99 data for the token. * * The weight stored for each doc-word instance is 1 byte. * It's the ratio of log of number of times given word occurs in doc, * divided by log of total count of all words in doc, * scaled to range 0 to 255. * Fundamentally it's a word count of that word in the doc, * but adjusted as follows: * 1) Large occurrences in small documents weigh more than * the same number of occurrences in large documents. * 2) Taking the log skews the ratio to be more linear, * ie take advantage of higher ranges of the 'weight'. * For example a word that occurs in 10% of the document, * will have a weight of .5 (50%). * 3) The scaling changes the ratio, a float between 0. and .9999, * to an integer between 0 and 255. */ void write_to_file (TREENODE * output_node) { DBALIST *print_dba; DB_ADDR mydba; /* 'record_addr_word[]' was permanently allocated * with a size = max batch size so it can hold * all the addrs for a single word node in the tree. * In effect it will replace the dba linked list. * Note: word_addrs_ii (io buffer for d99 file) != record_addr_word[]. */ if (debugging & (DEBUG_T | DEBUG_t)) { /* Print out tree node */ printf (" node '%s' %c%c%c", displayable(output_node->word), (output_node->llink)? 'L' : '.', (output_node->rlink)? 'R' : '.', (debugging & DEBUG_t)? 
'\n' : ' '); } num_addrs_for_word = 0; /* DtSrINT32 */ print_dba = output_node->dba_list; while (print_dba != NULL) { mydba = print_dba->dba; if (debugging & DEBUG_t) printf (" dba #%ld: node adr=%ld cnt=%ld", (long)num_addrs_for_word, (long)mydba, (long)print_dba->w_c); record_addr_word [num_addrs_for_word] = mydba << 8; /* rec# in hi 3 bytes */ record_addr_word [num_addrs_for_word] += (log ((double) (print_dba->w_c) + 0.5) / log ((double) (dbas_word_count[mydba] + 1))) * 256; if (debugging & DEBUG_t) printf (" -> x%lx (%ld:%ld)\n", (long)record_addr_word [num_addrs_for_word], (long)record_addr_word [num_addrs_for_word] >> 8, (long)record_addr_word [num_addrs_for_word] & 0xffL); print_dba = print_dba->next_dba; num_addrs_for_word++; if (num_addrs_for_word >= batch_size) { printf (catgets (dtsearch_catd, MS_cborodin, 280, "\n%s num_addrs_for_word (%ld) >= batchsz (%ld).\n"), PROGNAME"280", (long)num_addrs_for_word, (long)batch_size); DtSearchExit (91); } } if ((debugging & DEBUG_T) && !(debugging & DEBUG_t)) printf (" dbacnt=%ld\n", (long)num_addrs_for_word); fill_data1 (output_node->word); return; } /* write_to_file() */ /****************************************/ /* */ /* descend_tree() */ /* */ /****************************************/ /* Coroutine of traverse_tree(), Pass 2 Robson tree traversal. * The write_to_file() function is the 'preorder visit' point. */ void descend_tree (void) { int not_done = TRUE; while (not_done) { if ((pres->llink == NULL) && (pres->rlink == NULL)) { write_to_file (pres); avail_node = pres; return; } if (pres->llink != NULL) { next = pres->llink; pres->llink = prev; prev = pres; pres = next; } else { write_to_file (pres); next = pres->rlink; pres->rlink = prev; prev = pres; pres = next; } } return; } /* descend_tree() */ /********************************/ /* */ /* traverse_tree */ /* */ /********************************/ /* This is the actual Pass 2 function, a tree traversal * of Pass 1's word-dba binary tree. 
* The algorithm is based on the J. M. ROBSON link inversion traversal * algorithm for binary trees. Ref. Thomas A. STANDISH pp. 77-78. * The write_to_file() function is the 'preorder visit' point. */ void traverse_tree (void) { int not_done = TRUE; int descend = TRUE; /* Dheck for the empty tree */ if (root_node == NULL) { printf (catgets (dtsearch_catd, MS_cborodin, 288, "%s Abort. There are no words in the input file %s.\n"), PROGNAME"288", fname_input); DtSearchExit (34); } /* Initialize the variables */ pres = root_node; prev = pres; top_of_stack = NULL; stack = NULL; while (not_done) { if (descend) { descend_tree (); } if (pres == root_node) { return; } if (prev->rlink == NULL) { write_to_file (prev); next = prev->llink; prev->llink = pres; pres = prev; prev = next; descend = FALSE; } else { if (prev->llink == NULL) { next = prev->rlink; prev->rlink = pres; pres = prev; prev = next; descend = FALSE; } else { if (prev == top_of_stack) { next = stack; top_of_stack = stack->rlink; stack = stack->llink; next->llink = NULL; next->rlink = NULL; next = prev->llink; prev->llink = prev->rlink; prev->rlink = pres; pres = prev; prev = next; descend = FALSE; } else { write_to_file (prev); avail_node->llink = stack; avail_node->rlink = top_of_stack; stack = avail_node; top_of_stack = prev; next = prev->rlink; prev->rlink = pres; pres = next; descend = TRUE; } } } } } /* traverse_tree() */ /********************************************************/ /* */ /* print_usage_msg */ /* */ /********************************************************/ static void print_usage_msg (void) { printf (catgets (dtsearch_catd, MS_cborodin, 17, "\n" "USAGE: %s -d [options] \n" " Listed default file name extensions can be overridden.\n" " -d 1 - 8 character database name, include optional path prefix.\n" " -t End of text document delimiter string. 
Default '\\f\\n'.\n" " -r Change Pass 1 records-per-dot from %d to .\n" " -b Change max batch size from %ld to .\n" " -c Change database paging cache from %ld 1K pages to 1K pages.\n" " >= 16 by powers of 2. Initially try only small changes.\n" " -i Change (i)nput buffer size from default %d to .\n" " -h Change duplicate record id hash table size from %ld to .\n" " -h0 means there are no duplicates, do not check for them.\n" " Input [path]file name. Default extension %s.\n"), aa_argv0, (int) RECS_PER_DOT, (long) BATCH_SIZE, (long) CACHE_SIZE, (int) INBUFSZ, default_hashsize, EXT_FZKEY); return; } /* print_usage_msg() */ /********************************************************/ /* */ /* segregate_dicname */ /* */ /********************************************************/ /* Separates dictionary name from pathname and loads * them into the globals 'dicname' and 'dicpath'. * Returns TRUE if dicname is valid, else returns FALSE. */ static int segregate_dicname (char *string) { char mybuf[_POSIX_PATH_MAX]; char *ptr; int i; strncpy (mybuf, string, sizeof (mybuf)); mybuf[sizeof (mybuf) - 1] = 0; /* * Set 'ptr' to just the 8 char dictionary name by moving * it backwards until first non-alphanumeric character * (such as a ":" in the dos drive id or a slash between directories), * or to the beginning of string. 
*/ for (ptr = mybuf + strlen (mybuf) - 1; ptr >= mybuf; ptr--) if (!isalnum (*ptr)) { ptr++; break; } if (ptr < mybuf) ptr = mybuf; /* test for valid dictionary name */ i = strlen (ptr); if (i < 1 || i > 8) return FALSE; strcpy (dicname, ptr); *ptr = 0; strncpy (dicpath, mybuf, sizeof (dicpath)); dicpath[sizeof (dicpath) - 1] = 0; return TRUE; } /* segregate_dicname() */ /********************************************************/ /* */ /* USER_ARGS_PROCESSOR */ /* */ /********************************************************/ /* handles command line arguments for 'main' */ void user_args_processor (int argc, char **argv) { char *argptr; char *targ, *src; int i; if (argc <= 1) { print_usage_msg (); DtSearchExit (2); } /* Initialize some variables prior to parsing command line */ dicname[0] = 0; dicpath[0] = 0; /* Each pass grabs new parm of "-xxx" format */ while (--argc > 0 && (*++argv)[0] == '-') { argptr = argv[0]; switch (argptr[1]) { case 't': /* ETX delimiter string */ /* Replace any "\n" string with real linefeed */ targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4); src = argptr + 2; while (*src) { if (src[0] == '\\' && src[1] == 'n') { *targ++ = '\n'; src += 2; } else *targ++ = *src++; } *targ = 0; break; case 'r': if ((recs_per_dot = atoi (argptr + 2)) <= 0) { printf (catgets (dtsearch_catd, MS_cborodin, 577, "%s Invalid arg '%s'. 
Using default -r%d.\n"), PROGNAME"577", argptr, RECS_PER_DOT); recs_per_dot = RECS_PER_DOT; } break; case 'h': duprec_hashsize = atol (argptr + 2); if (duprec_hashsize == 0UL) printf (catgets (dtsearch_catd, MS_cborodin, 539, "%s Duplicate record id checking disabled.\n"), PROGNAME"539"); break; case 'b': batch_size = atol (argptr + 2); if (batch_size <= 0L) { printf (catgets (dtsearch_catd, MS_cborodin, 595, "%s Invalid batch size argument '%s'.\n"), PROGNAME"595", argptr); goto BADPARM; } break; case 'c': cache_size = atoi (argptr + 2); if (cache_size < 16) { /* minimum size is 16 */ if (cache_size > 0) cache_size = 16; /* on error reset size to default */ else cache_size = CACHE_SIZE; CACHE_ADJUSTED: printf (catgets (dtsearch_catd, MS_cborodin, 600, "%sCache size readjusted to %d.\n"), PROGNAME "600 ", cache_size); break; } /* If necessary, round up to nearest power of 2 */ for (i = 4; i < 12; i++) if (1 << i >= cache_size) break; i = 1 << i; if (i != cache_size) { cache_size = i; goto CACHE_ADJUSTED; } break; case 'D': /* unadvertised debugging feature */ for (i = 2; argptr[i] != 0; i++) { switch (argptr[i]) { case 'I': debugging |= DEBUG_I; break; case 'P': debugging |= DEBUG_P; /******* debugging_teskey = TRUE; ******/ break; case 'N': debugging |= DEBUG_N; break; case 'n': debugging |= DEBUG_n; break; case 'O': debugging |= DEBUG_O; break; case 'o': debugging |= DEBUG_o; break; case 'T': debugging |= DEBUG_T; break; case 't': debugging |= DEBUG_t; break; default: goto BADPARM; } } break; case 'd': /* May include both dicname and dicpath */ if (!segregate_dicname (argptr + 2)) { printf (catgets (dtsearch_catd, MS_cborodin, 550, "%s '%s' is invalid path/database name.\n"), PROGNAME"550", argptr); goto BADPARM; } break; case 'i': /* (I)nput buffer size */ if ((inbufsz = atol (argptr + 2)) <= 0) { printf (catgets (dtsearch_catd, MS_cborodin, 558, "%s Invalid input buffer size '%s'.\n"), PROGNAME"558", argptr); goto BADPARM; } break; default: printf (catgets 
(dtsearch_catd, MS_cborodin, 567, "%s Unknown command line argument '%s'.\n"), PROGNAME"567", argptr); BADPARM: print_usage_msg (); DtSearchExit (2); /* abort */ } /* endswitch */ } /* endwhile for cmd line '-'processing */ /* Validate input file name */ if (argc-- <= 0) { printf (catgets (dtsearch_catd, MS_cborodin, 580, "%s Missing required input file name.\n"), PROGNAME"580"); goto BADPARM; } /* Don't incr argv yet--save input file name */ else append_ext (fname_input, _POSIX_PATH_MAX, argv[0], EXT_FZKEY); /* Check for missing database name */ if (dicname[0] == 0) { printf (catgets (dtsearch_catd, MS_cborodin, 589, "%s No database name specified (-d argument).\a\n"), PROGNAME"589"); goto BADPARM; } strcpy (dblk.name, dicname); dblk.path = dicpath; return; } /* user_args_processor() */ /****************************************/ /* */ /* put_addrs_2_dtbs_addr_file */ /* */ /****************************************/ /* Suboutine of write_2_dtbs_addr_file() from Pass 2. * That function has used a bit vector to determine * the total change in old d99 addrs for preexisting words, * and prepared for writing an array of old dbas that * are not in the current words tree node (globally named * word_addrs_ii [num_addrs]). * The addrs that ARE in the Pass 1 node fzk file were previously * prepared in a similar array of dbas, globally named * record_addr_word [num_addrs_for_word] but passed here as * 'addrs_array' and 'nitems'. * Both arrays will be byte swapped from 'host' to * 'network' order in this function. * This function does the actual fwrite of both arrays to the d99. * If the number of new addrs can fit in the available free slots, * it rewrites to original offset, otherwise appends to end of d99. 
*/
static void put_addrs_2_dtbs_addr_file (
		DB_ADDR		*addrs_array,
		DtSrINT32	nitems)
{
    FREE_SPACE_STR  *new_slot;
    FREE_SPACE_STR  old_slot;
    DtSrINT32       idx;
    DtSrINT32       written;
    DtSrINT32       old_count;

    if (nitems >= batch_size) {
	printf ( catgets(dtsearch_catd, MS_cborodin, 6,
	    "put_addrs_2_dtbs_addr_file() nitems=%d, batchsz=%ld\n") ,
	    (int)nitems, (long)batch_size);
	DtSearchExit (58);
    }

    /* 'old_count' = surviving old addrs, as counted before
     * the new ones are added to the word record.
     */
    old_count = got_word.or_hwaddrs;
    got_word.or_hwaddrs += nitems;
    /** somehow, this can exceed total
     **** num addrs in database by 1 (!?) ******/
    /* (...only if prev 'overlay/compression' didn't delete all) */

#ifdef BYTE_SWAP
    /* Put both arrays in 'network' byte order */
    for (idx = 0; idx < nitems; idx++)
	HTONL (addrs_array[idx]);
    for (idx = 0; idx < old_count; idx++)
	HTONL (word_addrs_ii[idx]);
#endif

    if (nitems <= got_word.or_hwfree) {
	/* The new addresses fit into the word's free holes:
	 * reuse existing slot.
	 * The remaining free holes should already have foxes. (?)
	 */
	fseek (dtbs_addr_fp, got_word.or_hwoffset, SEEK_SET);
	written = fwrite (addrs_array, sizeof(DB_ADDR),
	    (size_t)nitems, dtbs_addr_fp);
	if (written != nitems) {
	    printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
		PROGNAME"798", strerror(errno));
	    DtSearchExit (87);
	}

	/* Copy the old addresses immediately after the new ones */
	written = fwrite (word_addrs_ii, sizeof(DB_ADDR),
	    (size_t)old_count, dtbs_addr_fp);
	if (written != old_count) {
	    printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
		PROGNAME"889", strerror(errno));
	    DtSearchExit (89);
	}
	got_word.or_hwfree -= nitems;
	return;
    }

    /*
     * Number of new addresses greater than number of free holes:
     * find new free slot that is big enough to hold the data.
     * Discard old slot, find new one.  The lookup is done
     * before the old slot is handed back to the free list.
     */
    old_slot.hole_size = old_count + got_word.or_hwfree;
    old_slot.offset = got_word.or_hwoffset;
    new_slot = find_free_space (got_word.or_hwaddrs, &fl_hdr);
    add_free_space (&old_slot, &fl_hdr);

    if (new_slot != NULL) {
	fseek (dtbs_addr_fp, new_slot->offset, SEEK_SET);
	got_word.or_hwoffset = new_slot->offset;
	got_word.or_hwfree = new_slot->hole_size - got_word.or_hwaddrs;
    }
    else {
	/* no suitable hole: append at end of d99 */
	fseek (dtbs_addr_fp, 0L, SEEK_END);
	got_word.or_hwoffset = ftell (dtbs_addr_fp);
	got_word.or_hwfree = 0;
    }

    /*----- Write new database addresses to a file -----*/
    written = fwrite (addrs_array, sizeof(DB_ADDR),
	(size_t)nitems, dtbs_addr_fp);
    if (written != nitems) {
	DtSearchExit (98);
    }

    /* Copy the old addresses immediately after the new ones */
    written = fwrite (word_addrs_ii, sizeof(DB_ADDR),
	(size_t)old_count, dtbs_addr_fp);
    if (written != old_count) {
	printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
	    PROGNAME"776", strerror(errno));
	DtSearchExit (76);
    }

    /* Write foxes to the free holes, if any, no byte swap.
     * addrs_array has already been written, so it is reused
     * as the source buffer.
     */
    for (idx = 0; idx < got_word.or_hwfree; idx++)
	addrs_array [idx] = 0xFFFFFFFF;
    written = fwrite (addrs_array, sizeof(DB_ADDR),
	(size_t)got_word.or_hwfree, dtbs_addr_fp);
    if (written != got_word.or_hwfree) {
	printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
	    PROGNAME"786", strerror(errno));
	DtSearchExit (86);
    }
}  /* put_addrs_2_dtbs_addr_file() */

/****************************************/
/*                                      */
/*        write_2_dtbs_addr_file        */
/*                                      */
/****************************************/
/* Subroutine of fill_data1() from Pass 2.
 * Updates OLD (preexisting) word's d99 file.
 *
 * The vista word rec has already been read into global 'got_word'.
 * record_addr_word [num_addrs_for_word] is the array of dba's
 * for docs from this batch that contain the current word (built by
 * fill_data1 from the dba_list for the word's Pass 1 binary tree node,
 * and still in 'host' byte swap order).
 * This function freads all the old addresses for that word from
 * the d99 file. It then deletes(!) d99 addrs that
 * are in the word's Pass 1 tree node. It then calls
 * put_addrs_2_dtbs_addr_file() to fwrite out the
 * dba's in the tree, which are either brand new,
 * or are 'updating' the deleted addrs.
 * Then it writes the modified old addrs.
 * Then rewrites vista word rec with new data.
 *
 * The bit vector dbas_bits_batch contains a 1 bit
 * for every dba for every doc in the fzk file.
 * got_word structure:
 *   .or_hwordkey - the word. (always in a 'huge' word buffer).
 *   .or_hwoffset - offset in a d99 inverted index file for
 *                  a given word. the first address starts
 *                  at this position.
 *   .or_hwaddrs  - total number of addresses for a given word.
* .or_hwfree - number of free slots in a database * addresses file for a given word. */ void write_2_dtbs_addr_file (void) { DtSrINT32 num_addrs_ii; DtSrINT32 num_reads; DtSrINT32 i_start, k, cur_ind; DtSrINT32 num_delete_addrs = 0; char addrs_removed = FALSE; DtSrINT32 i; DtSrINT32 cur_byte; char bit_addrs; DB_ADDR temp1; if (debugging & DEBUG_O) printf (" old vis '%s' ofs=%ld adr=%ld fre=%ld\n", displayable(got_word.or_hwordkey), (long) got_word.or_hwoffset, (long) got_word.or_hwaddrs, (long) got_word.or_hwfree); num_addrs_ii = got_word.or_hwaddrs; if (num_addrs_ii > or_reccount) { printf (catgets (dtsearch_catd, MS_cborodin, 713, "\n%s Word '%s' occurs in %ld records,\n" " but there are only %ld records in database!\n" " (This may be a good candidate for the stoplist).\n"), PROGNAME"713", (long) got_word.or_hwordkey, (long) num_addrs_ii, (long) or_reccount); DtSearchExit (68); } if (fseek (dtbs_addr_fp, (long) got_word.or_hwoffset, SEEK_SET) != 0) { printf (catgets (dtsearch_catd, MS_cborodin, 875, "\n%s Could not fseek d99 file to offset %ld.\n"), PROGNAME"875", got_word.or_hwoffset); DtSearchExit (98); } num_reads = fread (word_addrs_ii, sizeof(DB_ADDR), (size_t)num_addrs_ii, dtbs_addr_fp); if (num_reads != num_addrs_ii) { printf (catgets (dtsearch_catd, MS_cborodin, 848, "\n%s Could not fread %ld bytes (%ld dba's) of d99 file\n" " at offset %ld. Number of dba's read (return code) = %ld.\n"), PROGNAME"848", sizeof(DB_ADDR) * num_addrs_ii, (long)num_addrs_ii, (long)got_word.or_hwoffset, (long)num_reads); DtSearchExit (98); } #ifdef BYTE_SWAP for (i = 0; i < num_addrs_ii; i++) NTOHL (word_addrs_ii[i]); /* Now both addr arrays are in 'host' byte swap order */ #endif /* If there are only new docs, * this switch will prevent the checking for updates. 
*/ if (check_existing_addrs) { i_start = 0; /* Loop on every preexisting dba for word as read from d99 */ for (i = 0; i < num_addrs_ii; i++) { if (debugging & DEBUG_o) printf (" old d99 %ld: x%lx(%ld:%ld)", (long) i, (long) word_addrs_ii[i], (long) word_addrs_ii[i] >> 8, (long) word_addrs_ii[i] & 0xffL); /* Get 'record number' by shifting hi 3 bytes 1 byte (8 bits) * to right over stat wt byte. D99 rec#'s start at 1, * so subtract 1 to start at 0 for bit vector. */ temp1 = (*(word_addrs_ii + i) >> 8) - 1; /* = rec#, base 0 */ cur_byte = temp1 >> 3; /* get matching byte# in bit vector */ if (cur_byte >= bit_vector_size) { printf ( catgets(dtsearch_catd, MS_cborodin, 9, "\n%s Corrupted d99 file for word '%s',\n" " database address %ld @ file position %ld => bitvector[%ld]," " but max bitvector allocation = %ld.\n") , PROGNAME"727", displayable(got_word.or_hwordkey), (long)temp1, (long)i, (long)cur_byte, (long)bit_vector_size); DtSearchExit (69); } bit_addrs = 0; bit_addrs |= 1 << (temp1 % 8); /* bit mask */ /* * If this dba, which is on the current word's old d99 * addrs list, is also a doc in the fzk file (dbas_bits_batch), * delete it from the d99 list by writing subsequent dba's * over it. Boy this recursive nested loop has gotta be slow. * Faster algorithm? Add 'good' addrs to the end of * record_addr_word[]. No nested overlay loop, only one write! 
*/ if (bit_addrs & (*(dbas_bits_batch + cur_byte))) { addrs_removed = TRUE; num_delete_addrs++; if (i_start == 0) { cur_ind = i; i_start = i + 1; } else { if (i_start < i) { /* compress: move good addrs over * space of deleted ones */ for (k = i_start; k < i; k++) { word_addrs_ii[cur_ind] = word_addrs_ii[k]; cur_ind++; } } i_start = i + 1; } } /* end if where dba is on both fzk list and curr d99 */ } /* end loop on every d99 addr for this word */ if (addrs_removed) { /* final overlay compression */ if (i_start < i) { /* compress: move good addrs over * space of deleted ones */ for (k = i_start; k < i; k++) { word_addrs_ii[cur_ind] = word_addrs_ii[k]; cur_ind++; } } } } /* end if (check_existing_addrs) */ got_word.or_hwaddrs -= num_delete_addrs; got_word.or_hwfree += num_delete_addrs; /* The old dba array word_addrs_ii[] is now 'compressed', * it contains only addrs not in fzk file. * And the vista rec 'got_word' now matches it. * And record_addr_word[] still contains * the new/updated addrs from the fzk file. * Now Efim calls a func to write them both back out to d99 file. */ put_addrs_2_dtbs_addr_file (record_addr_word, num_addrs_for_word); write_wordstr (&got_word, 0); /* update vista WORD rec */ return; } /* write_2_dtbs_addr_file() */ /********************************/ /* */ /* write_new_word_2_dtbs */ /* */ /********************************/ /* Subroutine of fill_data1() in Pass 2 for a NEW word. * Writes d99 data, and updates (empty) got_word vista record. * record_addr_word [num_addrs_for_word] is the array of addrs * for docs from this batch that contain the current word (built by * fill_data1 from the dba_list for the word's Pass 1 binary tree node). * It will be byte swapped from 'host' to 'network' order in this function. 
*/ void write_new_word_2_dtbs (void) { FREE_SPACE_STR *free_slot; DtSrINT32 num_writes; int ret_fseek; DtSrINT32 int32; if (debugging & (DEBUG_n | DEBUG_N)) printf (" new word '%s', adrs=%ld,", got_word.or_hwordkey, (long)num_addrs_for_word); free_slot = find_free_space (num_addrs_for_word, &fl_hdr); if (free_slot == NULL) { /* append addrs to end of d99 file */ ret_fseek = fseek (dtbs_addr_fp, 0L, SEEK_END); got_word.or_hwoffset = ftell (dtbs_addr_fp); got_word.or_hwfree = 0; if (debugging & (DEBUG_n | DEBUG_N)) printf ("APPEND ofs=%ld, fre=0\n", (long int) got_word.or_hwoffset); } else { ret_fseek = fseek (dtbs_addr_fp, (long)free_slot->offset, SEEK_SET); got_word.or_hwoffset = free_slot->offset; got_word.or_hwfree = free_slot->hole_size - num_addrs_for_word; if (debugging & (DEBUG_n | DEBUG_N)) printf (" REUSE slot ofs=%ld, fre=%ld\n", (long int) got_word.or_hwoffset, (long int) got_word.or_hwfree); } /***** Write new database addresses to d99 file *********/ if (debugging & DEBUG_n) { for (int32 = 0; int32 < num_addrs_for_word; int32++) { printf (" dba #%ld: x%lx(%ld:%ld)\n", (long)int32, (long)record_addr_word[int32], (long)record_addr_word[int32] >> 8, (long)record_addr_word[int32] & 0xffL); } } #ifdef BYTE_SWAP /* Put addr array in 'network' byte order */ for (int32 = 0; int32 < num_addrs_for_word; int32++) HTONL (record_addr_word[int32]); #endif num_writes = fwrite (record_addr_word, sizeof(DB_ADDR), (size_t)num_addrs_for_word, dtbs_addr_fp); if (num_writes != num_addrs_for_word) DtSearchExit (97); got_word.or_hwaddrs = num_addrs_for_word; if (got_word.or_hwfree != 0) { /* Fill unused free holes with foxes for debugging. * Note that byte swap is unnecessary for foxes. * Note that record_addr_word is now available for this action. 
*/ for (int32 = 0; int32 < got_word.or_hwfree; int32++) *(record_addr_word + int32) = 0xFFFFFFFF; num_writes = fwrite (record_addr_word, sizeof(DB_ADDR), (size_t)got_word.or_hwfree, dtbs_addr_fp); if (num_writes != got_word.or_hwfree) { printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776), PROGNAME"960", strerror(errno)); DtSearchExit (96); } } /* Save changed word_info structure back to the vista database! */ write_wordstr (&got_word, 0); return; } /* write_new_word_2_dtbs() */ /************************/ /* */ /* fill_data1 */ /* */ /************************/ /* Called from write_to_file() in Pass 2. * Write_to_file() is 'visit node' function of tree traversal. * It has converted dbalist in each word node in tree to * array of dbas (record_addr_word [num_addrs_for_word]) * with correct statistical weighting, still in 'host' byte swap order. * This function seeks word key in database. If word is new, * it calls functions to write new vista rec and d99 data. * If word is old it calls functions to read word rec and update d99. */ void fill_data1 (char *node_word) { char miker[1024]; strcpy (miker, node_word); count_word_ii++; if (shutdown_now) { printf (catgets (dtsearch_catd, MS_cborodin, 164, "\n%s Abort due to signal %d. Database %s\n" " probably corrupted. Restore backup database.\n"), PROGNAME"164", shutdown_now, dicname); DtSearchExit (10); } /* print occasional progress dots and msgs */ if (!(count_word_ii % words_per_dot)) { putchar ('.'); dotcount++; if (!(dotcount % 10)) putchar (' '); if (dotcount >= 50) { dotcount = 0; seconds_left = (unsigned long) (((float) num_of_diff_words / (float) count_word_ii - 1.) * (float) (time (NULL) - timestart)); printf (catgets (dtsearch_catd, MS_cborodin, 849, "\n%s: Word #%ld, %.0f%% done. 
Est %lum %02lus " "to completion.\n"), aa_argv0, count_word_ii, (float) count_word_ii / (float) num_of_diff_words * 100.0, /***(count_word_ii * 100L) / num_of_diff_words,***/ seconds_left / 60L, seconds_left % 60L); } else fflush (stdout); } /* endif for progress dots and msgs */ strncpy (got_word.or_hwordkey, node_word, DtSrMAXWIDTH_HWORD); got_word.or_hwordkey[DtSrMAXWIDTH_HWORD - 1] = 0; find_keyword (miker, 0); /* vista KEYFIND for word rec */ if (db_status == S_NOTFOUND) { /* this is a NEW word */ got_word.or_hwoffset = 0; got_word.or_hwfree = 0; got_word.or_hwaddrs = 0; fillnew_wordrec (&got_word, 0); /* write (empty) vista word rec */ if (db_status != S_OKAY) vista_abort (PROGNAME"981"); write_new_word_2_dtbs(); /* write NEW word's d99 entries * and update vista word rec */ return; } /* update previously existing word */ read_wordstr (&got_word, 0); /* read OLD word rec into got_word */ if (db_status == S_OKAY) write_2_dtbs_addr_file(); /* update OLD word's d99 entries * and update vista word rec */ return; } /* fill_data1() */ /************************************************/ /* */ /* load_into_bintree */ /* */ /************************************************/ /* Pass 1 function. * Loads parsed word token or stem token into * inverted index binary tree along with passed dba. * Token is allowed to be empty, ie first byte is \0. * Derived from Efim's original 'teskey_parse()' * and bin_tree() functions. * Variables static for speeeeeeed. */ static void load_into_bintree ( char *parser_token, int token_is_stem, DB_ADDR dba) { static DtSrINT16 or_maxwordsz; static char *cptr; static int i; static TREENODE **this_link; static TREENODE *newnode; static DBALIST *newdba; static char *tokbuf = NULL; if (*parser_token == 0) { if (debugging & DEBUG_I) printf (" bintr= dba=%ld\n", (long)dba); return; } /* Copy token to a buffer. * Stems have a special prefix character * to distinguish them from words. * Also increment total dba word count. 
*/ if (tokbuf == NULL) { or_maxwordsz = dblk.dbrec.or_maxwordsz; tokbuf = austext_malloc ((size_t) or_maxwordsz + 4, PROGNAME"1152", NULL); } if (token_is_stem) { tokbuf[0] = STEM_CH; strncpy (tokbuf + 1, parser_token, (size_t)or_maxwordsz); dbas_word_count[dba]++; } else strncpy (tokbuf, parser_token, (size_t)or_maxwordsz); tokbuf [or_maxwordsz] = 0; if (debugging & DEBUG_I) printf (" bintr='%s' dba=%ld ", displayable(tokbuf), (long)dba); /* TREE TRAVERSAL. Search binary tree to find either * insertion point or identical preexisting token. */ for (this_link = &root_node; *this_link != NULL; ) { i = strcmp (tokbuf, (*this_link)->word); /* If identical word/stem token already exists... */ if (i == 0) { /* If token appears more than once in current * document (dba already exists at top of dba list), * just increment the word count in the list. */ if ((*this_link)->dba_list->dba == dba) (*this_link)->dba_list->w_c++; /* If this is first appearance of token for this doc * (dba is not at start of token's dba list), * insert dba at start of token's dba list. */ else { if ((newdba = malloc (sizeof(DBALIST))) == NULL) { printf (catgets (dtsearch_catd, MS_cborodin, 374, msg_374), PROGNAME"1150"); DtSearchExit (26); } newdba->dba = dba; newdba->w_c = 1; newdba->next_dba = (*this_link)->dba_list; (*this_link)->dba_list = newdba; } if (debugging & DEBUG_I) printf (" Old %ld=%ld\n", (long)((*this_link)->dba_list->dba), (long)((*this_link)->dba_list->w_c)); return; /* done with token */ } /* endif where token was found in binary tree */ /* Increment link ptr by descending to correct subtree */ if (i < 0) { this_link = &(*this_link)->llink; if (debugging & DEBUG_I) putchar ('L'); } else { this_link = &(*this_link)->rlink; if (debugging & DEBUG_I) putchar ('R'); } } /* end tree traversal */ /* Tree traversal never found a preexisting token node. * Create a new node and insert it at the point * indicated by link ptr. 
*/ newnode = austext_malloc (sizeof(TREENODE) + strlen(tokbuf) + 4, PROGNAME"1234", NULL); newnode->llink = NULL; newnode->rlink = NULL; newnode->word = (char *) (newnode + 1); /* use mem at end of node */ strcpy (newnode->word, tokbuf); newdba = austext_malloc (sizeof(DBALIST), PROGNAME"1235", NULL); newnode->dba_list = newdba; newdba->dba = dba; newdba->w_c = 1; newdba->next_dba = NULL; *this_link = newnode; num_of_diff_words++; if (debugging & DEBUG_I) printf (" New %ld=%ld\n", (long)((*this_link)->dba_list->dba), (long)((*this_link)->dba_list->w_c)); return; } /* load_into_bintree() */ /**********************************************/ /* */ /* MAIN */ /* */ /**********************************************/ main (int argc, char **argv) { int i; long word_offset; /* <-- PARG.offsetp */ long bytes_in; /* ftell() */ DtSrINT32 dba_offset; int got_ETX; char *cptr, *src; char temp_buf[40]; char db_key [DtSrMAX_DB_KEYSIZE + 2]; int oops = FALSE; DtSrINT32 cur_byte; struct tm *tmptr; DB_ADDR dba, temp_dba; time_t elapsed; size_t mallocsz; char *parsebufp, *stembufp; /******************* INITIALIZE ******************/ setlocale (LC_ALL, ""); dtsearch_catd = catopen (FNAME_DTSRCAT, 0); aa_argv0 = strdup (argv[0]); time (&elapsed); tmptr = localtime (&elapsed); strftime (buf, sizeof(buf), catgets (dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"), tmptr); printf (catgets (dtsearch_catd, MS_cborodin, 1, "%s. 
Run %s.\n"), aa_argv0, buf); austext_exit_last = print_exit_code; batch_size = BATCH_SIZE; init_user_interrupt (); default_hashsize = duprec_hashsize; memset (&dblk, 0, sizeof(DBLK)); memset (&parg, 0, sizeof(PARG)); parg.dblk = &dblk; parg.etxdelim = ETXDELIM; /* default, can be changed */ parg.offsetp = &word_offset; parg.flags |= PA_INDEXING; /* do compounding, if parser can */ /* Read user specified command line arguments */ user_args_processor (argc, argv); /* Finish init now that we know final values */ inbuf = austext_malloc (inbufsz + 16, PROGNAME"1349", NULL); temp = austext_malloc (inbufsz + 16, PROGNAME"1285", NULL); sprintbuffer = austext_malloc (inbufsz + _POSIX_PATH_MAX + 16, PROGNAME"1286", NULL); record_addr_word = austext_malloc ((sizeof(DB_ADDR) * batch_size) + 16, PROGNAME "1133", NULL); /* Save dicname and path in dblk. Save full name of d99 file. */ strcpy (dblk.name, dicname); dblk.path = dicpath; strcpy (dtbs_addr_file, dicpath); strcat (dtbs_addr_file, dicname); strcat (dtbs_addr_file, EXT_DTBS); /* Open the database */ if (!austext_dopen (dicname, dicpath, NULL, cache_size, &dbrec)) { fprintf (aa_stderr, "%s\n", DtSearchGetMessages()); DtSearchExit (3); } memcpy (&dblk.dbrec, &dbrec, sizeof(DBREC)); /* Load database's parser, stemmer, and linguistic files into dblk. 
*/ if (!load_language (&dblk, NULL)) { puts (DtSearchGetMessages()); printf (catgets (dtsearch_catd, MS_cborodin, 1097, "%s Aborting due to errors in loading language files.\n"), PROGNAME"1097"); DtSearchExit(3); } RECFRST (PROGNAME "1067", OR_OBJREC, 0); CRGET (PROGNAME "1069", &dba, 0); /* byte swap already done in vista */ or_reccount = dbrec.or_reccount; /* DtSrINT32 */ or_recslots = dbrec.or_recslots; /* promoted to DtSrINT32 */ or_maxdba = dbrec.or_maxdba; /* DtSrINT32 lim of dbas_word_count */ bit_vector_size = ((or_maxdba / or_recslots + 1) >> 3) + 1; /* DtSrINT32 */ dba_offset = or_recslots - (dba & 0x00FFFFFF); /* DtSrINT32 */ if (debugging) printf (PROGNAME"1286 " "realnumrec=%ld recslots=%ld bitvecsz=%ld" " dbaoffset=%d maxdba=%ld\n", (long)or_reccount, (long)or_recslots, (long)bit_vector_size, (int)dba_offset, (long)or_maxdba); /* Allocate memory space for the arrays. * dbas_bits_batch = 'bit vector', one bit for every possible rec#. * the 1 bits = only the dba's that are in this fzk batch. * word_addrs_ii = fread buffer for d99 file. * dbas_word_count = summing bkts for word count statistics. */ dbas_bits_batch = (char *) austext_malloc ((size_t)bit_vector_size + 48, PROGNAME "1150", NULL); word_addrs_ii = (DB_ADDR *) austext_malloc ( sizeof (DB_ADDR) * (or_reccount + 1) + 48, PROGNAME "1152", NULL); mallocsz = sizeof(DtSrINT32) * (or_maxdba + 1) + 48; dbas_word_count = (DtSrINT32 *) austext_malloc (mallocsz, PROGNAME "1154", NULL); memset (dbas_bits_batch, 0, (size_t)bit_vector_size + 48); memset (dbas_word_count, 0, mallocsz); root_node = NULL; /* Open the d99 file that contains database addresses. * If the file doesn't exist, it means the database * for keyword search is empty - open a new file. 
*/ if ((dtbs_addr_fp = fopen (dtbs_addr_file, "r+b")) == NULL) { dtbs_addr_fp = fopen (dtbs_addr_file, "w+b"); check_existing_addrs = FALSE; new_dtbs_file = TRUE; if (dtbs_addr_fp == NULL) { /* msg 1068 used multiple places */ printf (catgets (dtsearch_catd, MS_cborodin, 1068, "%s Can't open new inverted index file '%s': %s\n"), PROGNAME"1068", dtbs_addr_file, strerror(errno)); DtSearchExit (13); } /* write New Header Information to a file */ init_header (dtbs_addr_fp, &fl_hdr); } else { /* read Header Information from d99 file */ if (!fread_d99_header (&fl_hdr, dtbs_addr_fp)) { /* msg 1068 used multiple places */ printf (catgets (dtsearch_catd, MS_cborodin, 1068, "%s Can't read header data for '%s': %s\n"), PROGNAME"1422", dtbs_addr_file, strerror(errno)); DtSearchExit (13); } } /* open input .fzk file */ src = getcwd (sprintbuffer, _POSIX_PATH_MAX); if (!src && debugging) printf (PROGNAME"1336 Can't getcwd: %s.\n", strerror(errno)); if (!src) src = getenv ("PWD"); printf (catgets (dtsearch_catd, MS_misc, 24, "%s: current working directory = '%s', .fzk file = '%s'\n"), aa_argv0, (src) ? src : catgets (dtsearch_catd, MS_misc, 6, ""), fname_input); if ((instream = fopen (fname_input, "rt")) == NULL) { BAD_INPUT_FILE: printf (catgets (dtsearch_catd, MS_cborodin, 1083, "%s Can't read input file '%s': %s\n"), PROGNAME"1083", fname_input, strerror(errno)); DtSearchExit (14); } if (fstat (fileno (instream), &fstat_input) == -1) goto BAD_INPUT_FILE; parg.ftext = instream; /* for readchar_ftext(), discard_to_ETX() */ time (&totalstart); /* for total elapsed time */ timestart = totalstart; /* for Pass 1 elapsed time */ /*------------ PASS 1: ------------ * Main Read Loop. For each text record in input file, * parse and stem words, store them into binary tree * inverted index in memory. * The first few lines are database administrative values. * They are presumed ascii and read with fgets() as * 'lines' terminated with \n. 
The text of the document * itself is presumed to be in the appropriate database * 'language', so it is *not* presumed to be lines * terminated with \n. The document text is read by * the language's parser() a 'word' at a time, which * ultimately means a byte at a time. */ printf (catgets (dtsearch_catd, MS_cborodin, 1108, "%s: Beginning Pass 1, reading records from '%s'.\n" " Each dot = %d records.\n"), aa_argv0, fname_input, recs_per_dot); dotcount = 0; while (!feof(instream)) { /* 1. Read and discard the FZKEY line. * 2. Read and discard the ABSTRACT line. * 3. Read the UNIQUE KEY for the record. * Do some record initialization steps here. * 4. Read and discard the DATE line. * 5. Let the parser read and parse rest of record, ie doc text... */ /*----- READ LINE #1, fzkey -----*/ if (fgets (inbuf, inbufsz, instream) == NULL) break; inbuf [inbufsz] = 0; /* just to be sure */ if (shutdown_now) { printf (catgets (dtsearch_catd, MS_cborodin, 164, "\n%s: %s Abort due to signal %d. Database %s\n" " possibly corrupted. 
Restore backup database.\n"), aa_argv0, PROGNAME"1299", shutdown_now, dicname); DtSearchExit (11); } /* Silently skip null records just like dtsrload */ if (strcmp (inbuf, parg.etxdelim) == 0) continue; record_count++; /*----- READ LINE #2, abstract -----*/ if (fgets (inbuf, inbufsz, instream) == NULL) { INVALID_FZK_FORMAT: printf (catgets (dtsearch_catd, MS_cborodin, 1129, "%s: %s Invalid .fzk file format.\n"), fname_input, PROGNAME"1129"); DtSearchExit (22); } inbuf[inbufsz] = 0; /* just to be sure */ /*--- READ LINE #3, unique database key ---*/ if (fgets (inbuf, inbufsz, instream) == NULL) goto INVALID_FZK_FORMAT; inbuf[inbufsz] = 0; /* just to be sure */ if ((cptr = strtok (inbuf, " \t\n")) == NULL) goto INVALID_FZK_FORMAT; /* If necessary, discard long keys exactly like cravel */ if (strlen (cptr) >= DtSrMAX_DB_KEYSIZE) { printf (catgets (dtsearch_catd, MS_cborodin, 659, "\n%s: %s Discarding record, key too long:\n '%s'.\n"), aa_argv0, PROGNAME"659", cptr); discard_to_ETX (&parg); continue; } strcpy (db_key, cptr); /* Skip duplicate record ids in same order as dtsrload */ i = is_duprec (db_key); if (i == 2) { /* out of memory */ printf (catgets (dtsearch_catd, MS_cborodin, 374, msg_374), PROGNAME"1317"); DtSearchExit (57); } else if (i == 1) { /* duplicate record id */ duplicate_recids++; if (dotcount > 0) putchar ('\n'); printf (catgets (dtsearch_catd, MS_cborodin, 1402, "%s: Discarded duplicate rec #%lu '%s'.\n"), aa_argv0, record_count, db_key); discard_to_ETX (&parg); continue; } /****** FFFFFFFFFFFFFFFFFFFFF **********/ /* Convert database address (slot #) to 'record number', * what dba would have been if all records took up * only one slot and there were no dbrec at top of file. * Record numbers on d99, like dba's, start at #1, * but rec numbers here (in bit vector) start at #0. 
*/ KEYFIND (PROGNAME "222", OR_OBJKEY, (char *) db_key, 0); if (db_status != S_OKAY) { normal_retncode = 1; /* = 'warning' */ if (dotcount > 0) putchar ('\n'); printf (catgets (dtsearch_catd, MS_cborodin, 1168, "%s: %s Discarded '%s', key not in database.\n"), aa_argv0, PROGNAME"1168", displayable(db_key)); discard_to_ETX (&parg); continue; } CRGET (PROGNAME "224", &temp_dba, 0); /* vista already byte swapped */ temp_dba &= 0x00FFFFFF; /* = slot# */ dba = (temp_dba + dba_offset) / or_recslots; /* = rec#, base 1 */ /* * Don't change this 'dba'!--eventually it goes * into d99 in this exact format! It will also * be used as an index into dbas_word_count[] in * load_into_bintree() so do a sanity check * to make sure that it hasn't exceeded the size * of that array. (The count increments have been * reported as as 'uninitialized memory reads' * by a debugger). This might happen for example * if user failed to run dtsrload before dtsrindex? */ if (dba < 1 || dba > or_maxdba) { printf ( catgets(dtsearch_catd, MS_cborodin, 21, "\n%s '%s' record overflows word counter array.\n" "Record number %ld > maxdba %ld, dba=%ld, " "recslots=%ld, offs=%d.\n") , PROGNAME"1526", displayable(db_key), (long)dba, (long)or_maxdba, (long)temp_dba, (long)or_recslots, (int)dba_offset); DtSearchExit (68); } temp_dba = dba - 1; /* = rec# starting at 0 */ cur_byte = temp_dba >> 3; /* bits to bytes: div by 8 */ if (cur_byte >= bit_vector_size) { printf ( catgets(dtsearch_catd, MS_cborodin, 22, "\n%s '%s' record in database (dba=%ld)\n" " overflows bitvector allocation (%ld >= %ld).\n") , PROGNAME"1475", displayable(db_key), (long)dba, (long)cur_byte, (long)bit_vector_size); DtSearchExit (69); } dbas_bits_batch[cur_byte] |= 1 << (temp_dba % 8); /* Print occasional progress dots and msgs */ if (!(record_count % recs_per_dot)) { putchar ('.'); dotcount++; if (!(dotcount % 10)) putchar (' '); if (dotcount >= 50) { dotcount = 0; bytes_in = ftell (instream); seconds_left = (unsigned long) (((float) 
fstat_input.st_size / (float) bytes_in - 1.) * (float) (time (NULL) - timestart)); printf (catgets (dtsearch_catd, MS_cborodin, 1190, "\n%s: Rec #%lu, %.0f%% done. " "Est %lum %02lus to end Pass 1.\n"), aa_argv0, record_count, (float) bytes_in / (float) fstat_input.st_size * 100.0, seconds_left / 60UL, seconds_left % 60UL); } fflush (stdout); } /*----- READ LINE #4, date -----*/ if (fgets (inbuf, inbufsz, instream) == NULL) goto INVALID_FZK_FORMAT; inbuf[inbufsz] = 0; /* just to be sure */ /* PARSE LOOP FOR CURRENT TEXT BLOCK. * We must be in the middle of a record ('lines' #5 and beyond). * From here to ETX, which is either the record delimiter string * or the end of file, read the file a 'word' at a time * using the parse() function for the language specified * for the database. * Load_into_bintree() stores each token into * inverted index binary tree. * Note: dba here MUST still be rec#, base 1. * It's stored as is by load_into_bintree(), * and will be moved as is into d99 file in Pass 2. 
*/ if (debugging & DEBUG_P) printf ("\nRecord #%lu '%s'\n" "Offset Word---- Stem----\n", record_count, db_key); for ( cptr = dblk.parser (&parg); cptr; cptr = dblk.parser (NULL)) { if (debugging & DEBUG_P) { printf ("%6ld %s %n", word_offset, cptr, &i); if (!(debugging & DEBUG_I)) while (i++ < 30) putchar (' '); } load_into_bintree (cptr, FALSE, dba); cptr = dblk.stemmer (cptr, &dblk); if (debugging & DEBUG_P) { printf ("%s\n", cptr); fflush (stdout); } load_into_bintree (cptr, TRUE, dba); } } /* end of PASS 1 Main read loop */ elapsed = time(NULL) - timestart; if (dotcount > 0) { putchar ('\n'); dotcount = 0; } if (duplicate_recids > 0L) { normal_retncode = 1; /* 'warning' */ sprintf (buf, catgets (dtsearch_catd, MS_cborodin, 40, "Ignored %ld duplicate records"), duplicate_recids); } else strcpy (buf, catgets (dtsearch_catd, MS_cborodin, 41, "No duplicate records found")); printf (catgets (dtsearch_catd, MS_cborodin, 1225, "%s: Pass 1 completed in %lum %lus, read %lu records.\n" " %s, parsed %lu words.\n"), aa_argv0, elapsed / 60L, elapsed % 60L, record_count, buf, num_of_diff_words); if (record_count > batch_size) { printf (catgets (dtsearch_catd, MS_cborodin, 33, "\n%s Number of incoming records exceeded %d.\n" " This will usually result in 'Out of Paging Space' " "error in Pass 2\n" " and corruption of database. Either split the incoming file to\n" " reduce record count or use the -b option, and rerun.\n"), PROGNAME"33", (int)batch_size); DtSearchExit (33); } /*----------------- PASS 2: ----------------- * Traverse completed binary tree and write it to d99 file. 
*/ printf (catgets (dtsearch_catd, MS_cborodin, 1233, "%s: Beginning Pass 2: batch index traversal and database update.\n" " Each dot = %d words.\n"), aa_argv0, words_per_dot); dotcount = 0; time (×tart); traverse_tree (); /* actual Pass 2 */ if (dotcount) { putchar ('\n'); dotcount = 0; } /* Write header information to the d99 file */ if (!fwrite_d99_header (&fl_hdr, dtbs_addr_fp)) { printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776), PROGNAME"1723", strerror(errno)); DtSearchExit (13); } d_close (); fclose (dtbs_addr_fp); elapsed = time (NULL) - timestart; printf (catgets (dtsearch_catd, MS_cborodin, 1246, "%s: Pass 2 completed in %lum %lus, updated %lu words.\n"), aa_argv0, elapsed / 60L, elapsed % 60L, count_word_ii); if (normal_retncode == 1) printf (catgets (dtsearch_catd, MS_cborodin, 2, "%s: Warnings were detected.\n"), aa_argv0); DtSearchExit (normal_retncode); } /* main() */ /*************************** DTSRINDEX.C ****************************/