1811 lines
57 KiB
C
1811 lines
57 KiB
C
/*
|
|
* CDE - Common Desktop Environment
|
|
*
|
|
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
|
|
*
|
|
* These libraries and programs are free software; you can
|
|
* redistribute them and/or modify them under the terms of the GNU
|
|
* Lesser General Public License as published by the Free Software
|
|
* Foundation; either version 2 of the License, or (at your option)
|
|
* any later version.
|
|
*
|
|
* These libraries and programs are distributed in the hope that
|
|
* they will be useful, but WITHOUT ANY WARRANTY; without even the
|
|
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
* PURPOSE. See the GNU Lesser General Public License for more
|
|
* details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with these libraries and programs; if not, write
|
|
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
|
|
* Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
/*
|
|
* COMPONENT_NAME: austext
|
|
*
|
|
* FUNCTIONS: descend_tree
|
|
* displayable
|
|
* fill_data1
|
|
* load_into_bintree
|
|
* main
|
|
* print_exit_code
|
|
* print_usage_msg
|
|
* put_addrs_2_dtbs_addr_file
|
|
* segregate_dicname
|
|
* traverse_tree
|
|
* user_args_processor
|
|
* write_2_dtbs_addr_file
|
|
* write_new_word_2_dtbs
|
|
* write_to_file
|
|
*
|
|
* ORIGINS: 27
|
|
*
|
|
*
|
|
* (C) COPYRIGHT International Business Machines Corp. 1992,1996
|
|
* All Rights Reserved
|
|
* Licensed Materials - Property of IBM
|
|
* US Government Users Restricted Rights - Use, duplication or
|
|
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
|
|
*/
|
|
/************************ DTSRINDEX.C *******************************
|
|
* $XConsortium: dtsrindex.c /main/10 1996/09/23 21:02:54 cde-ibm $
|
|
* CDE version of borodin.c
|
|
* Formerly dtsrindex.c was cborodin.c.
|
|
*
|
|
* INPUT FORMAT:
|
|
* Text file in FZK format.
|
|
* Each record contains 4 formatted 'lines' (text strings ending in \n):
|
|
* 1. fzkey (not used in this program).
|
|
* 2. abstract (not used in this program).
|
|
* 3. unique database key for the record. Used to find the database
|
|
* address of the record which is the reference for the inverted index.
|
|
* 4. The record's date (not used in this program).
|
|
*
|
|
* The rest of the record is unformatted text (not necessarily organized
|
|
* into 'lines'). It is read a character at a time and parsed into
|
|
* individual words by the parser function for the database's language.
|
|
* Each record ends with a delimiter string specified by command line arg.
|
|
*
|
|
* $Log$
|
|
* Revision 2.8 1996/04/10 19:50:38 miker
|
|
* Deleted dangerous and unnecessary -a option.
|
|
*
|
|
* Revision 2.7 1996/03/25 18:54:15 miker
|
|
* Changed FILENAME_MAX to _POSIX_PATH_MAX.
|
|
*
|
|
* Revision 2.6 1996/02/01 18:25:44 miker
|
|
* AusText 2.1.11, DtSearch 0.3. Pass 1 changed to accommodate
|
|
* new single-character reading parser/stemmers.
|
|
*
|
|
* Revision 2.5 1995/12/29 17:16:04 miker
|
|
* Bug fix: Opened wrong msg catalog.
|
|
*
|
|
* Revision 2.4 1995/12/27 21:18:40 miker
|
|
* Msg bug: 'percent done' was negative number.
|
|
*
|
|
* Revision 2.3 1995/12/01 16:15:44 miker
|
|
* Deleted unnecessary log2 var, conflict with Solaris function.
|
|
* Added -r command line arg.
|
|
*
|
|
* Revision 2.2 1995/10/26 15:26:53 miker
|
|
* Added prolog.
|
|
*
|
|
* Revision 2.1 1995/09/22 19:29:53 miker
|
|
* Freeze DtSearch 0.1, AusText 2.1.8
|
|
*
|
|
* Revision 1.3 1995/09/05 21:08:54 miker
|
|
* Fixed bug: appeared as if 1 and 2 char 'words' were being indexed.
|
|
* Added DEBUG_P switch.
|
|
*
|
|
* Revision 1.2 1995/09/01 22:17:02 miker
|
|
* Fixed solaris segfault: too many args to printf in print_usage().
|
|
*
|
|
* Revision 1.1 1995/08/31 20:51:08 miker
|
|
* Initial revision of dtsrindex.c, copied from cborodin.c.
|
|
*
|
|
* Log: cborodin.c,v
|
|
* Revision 1.18 1995/05/30 18:58:54 miker
|
|
* Correct bug introduced by previous fix (2.1.5c).
|
|
*
|
|
* Revision 1.17 1995/05/18 22:54:08 miker
|
|
* 2.1.5b cborodin bug. Segfault due to overflowing bitvector
|
|
* after many deletions and no mrclean.
|
|
*/
|
|
#include "SearchP.h"
|
|
#include <limits.h>
|
|
#include <stdlib.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <time.h>
|
|
#include <errno.h>
|
|
#include <math.h>
|
|
#include <sys/stat.h>
|
|
#include <locale.h>
|
|
#include "vista.h"
|
|
|
|
extern void find_keyword (char *cur_word, int vista_num);
|
|
extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num);
|
|
extern void write_wordstr (struct or_hwordrec * glob_word, int vista_num);
|
|
extern void fill_data1 (char *ch);
|
|
|
|
#define PROGNAME "DTSRINDEX"
|
|
|
|
#define BATCH_SIZE 10000L
|
|
#define WORDS_PER_DOT 500
|
|
#define RECS_PER_DOT 20
|
|
#define INBUFSZ 1024 /* default input text header line size */
|
|
#define MS_misc 1
|
|
#define MS_cborodin 14
|
|
|
|
/******************* BIT VECTORS *****************/
|
|
DB_ADDR *word_addrs_ii; /* fread buf for d99 (= tot # dbas) */
|
|
DtSrINT32 *dbas_word_count;
|
|
char *dbas_bits_batch;
|
|
DB_ADDR *record_addr_word;
|
|
DtSrINT32 num_addrs_for_word;
|
|
DtSrINT32 or_reccount;
|
|
DtSrINT32 bit_vector_size;
|
|
|
|
/*-------------------------- GLOBALS ----------------------------*/
|
|
/* batch_size also used by fileman.c for allocating unused holes
|
|
* in order to no go past end of 'record_addr_word' array.
|
|
*/
|
|
extern DtSrINT32 batch_size;
|
|
|
|
char buf[1024];
|
|
static int cache_size = CACHE_SIZE;
|
|
static int check_existing_addrs = TRUE;
|
|
long count_word_ii = 0L;
|
|
long dbkey_seqno = 0L;
|
|
DBLK dblk;
|
|
DBREC dbrec;
|
|
static int debugging = 0;
|
|
#define DEBUG_I 0x01 /* P1 tree insertions */
|
|
#define DEBUG_P 0x10 /* P1 parser/stemmer */
|
|
#define DEBUG_T 0x02 /* P2 tree dump (words) */
|
|
#define DEBUG_N 0x04 /* P2 NEW words, vista */
|
|
#define DEBUG_O 0x08 /* P2 OLD words, vista) */
|
|
#define DEBUG_t 0x20 /* P2 tree dump (dbas) */
|
|
#define DEBUG_n 0x40 /* P2 NEW d99 for new words */
|
|
#define DEBUG_o 0x80 /* P2 OLD d99 updates for old words */
|
|
static unsigned long
|
|
default_hashsize;
|
|
char dicname [10];
|
|
char dicpath [_POSIX_PATH_MAX];
|
|
static int dotcount = 0;
|
|
char dtbs_addr_file [_POSIX_PATH_MAX];
|
|
FILE *dtbs_addr_fp;
|
|
long dtbs_size_records = 0L;
|
|
static long duplicate_recids = 0L;
|
|
struct stat fstat_input;
|
|
FILE_HEADER fl_hdr;
|
|
static char fname_input [_POSIX_PATH_MAX];
|
|
struct or_hwordrec
|
|
got_word;
|
|
static FILE *instream;
|
|
char *inbuf;
|
|
int inbuf_overflowed = FALSE;
|
|
size_t inbufsz = INBUFSZ;
|
|
int is_pmr;
|
|
static DtSrINT32
|
|
or_maxdba = 0;
|
|
static char msg_374[] = "\n%s Out of Memory!\n"
|
|
" Split the incoming file into several "
|
|
"smaller files and try again.\n";
|
|
static char msg_776[] = "\n%s Write Failure d99 file: %s\n";
|
|
char new_dtbs_file = FALSE;
|
|
long num_of_diff_words = 0L;
|
|
int normal_retncode = 0;
|
|
static PARG parg;
|
|
int parsep_char = END_RETAIN_PAGE;
|
|
char rec_type;
|
|
unsigned long record_count = 0UL;
|
|
int record_lines;
|
|
static int recs_per_dot = RECS_PER_DOT;
|
|
static unsigned long
|
|
seconds_left;
|
|
extern int shutdown_now;
|
|
static DtSrINT32
|
|
or_recslots;
|
|
char *sprintbuffer = NULL;
|
|
char *temp = NULL;
|
|
extern int debugging_teskey;
|
|
time_t timestart = 0;
|
|
time_t totalstart = 0;
|
|
static int words_per_dot = WORDS_PER_DOT;
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* DBALIST */
|
|
/* */
|
|
/************************************************/
|
|
typedef struct dba_str {
|
|
DB_ADDR dba;
|
|
DtSrINT32 w_c;
|
|
struct dba_str *next_dba;
|
|
} DBALIST;
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* TREENODE */
|
|
/* */
|
|
/************************************************/
|
|
typedef struct _treen_ {
|
|
char *word; /* ptr to word in stop list */
|
|
struct _treen_ *llink; /* left link in binary tree */
|
|
struct _treen_ *rlink; /* ptr to right link in binary tree */
|
|
DBALIST *dba_list;
|
|
} TREENODE;
|
|
|
|
static TREENODE *root_node = NULL;
|
|
static TREENODE *top_of_stack;
|
|
static TREENODE *stack;
|
|
static TREENODE *pres;
|
|
static TREENODE *prev;
|
|
static TREENODE *next;
|
|
static TREENODE *avail_node;
|
|
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* displayable */
|
|
/* */
|
|
/************************************************/
|
|
/* Returns static string same as passed string except nonprintable
|
|
* and nonascii chars replaced by '^' for display.
|
|
*/
|
|
static char *displayable (char *passed_string)
|
|
{
|
|
static char *buf = NULL;
|
|
static size_t buflen = 0;
|
|
size_t passed_len = strlen (passed_string);
|
|
char *targ, *src;
|
|
if (buflen < passed_len) {
|
|
if (buf)
|
|
free (buf);
|
|
buflen = passed_len;
|
|
buf = austext_malloc (buflen + 4, PROGNAME"158", NULL);
|
|
}
|
|
targ = buf;
|
|
for (src = passed_string; *src != 0; src++) {
|
|
if (*src >= 32 && *src < 127)
|
|
*targ++ = *src;
|
|
else
|
|
*targ++ = '^';
|
|
}
|
|
*targ = 0;
|
|
return buf;
|
|
} /* displayable() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* print_exit_code */
|
|
/* */
|
|
/************************************************/
|
|
/* Called from inside DtSearchExit() at (*austext_exit_last)() */
|
|
static void print_exit_code (int exit_code)
|
|
{
|
|
if(dotcount) {
|
|
putchar ('\n');
|
|
dotcount = 0;
|
|
}
|
|
/* Put total seconds into totalstart */
|
|
if (totalstart > 0)
|
|
totalstart = time (NULL) - totalstart;
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 206,
|
|
"%s: Exit Code = %d, Total elapsed time %ldm %lds.\n"),
|
|
aa_argv0, exit_code, totalstart / 60L, totalstart % 60L);
|
|
return;
|
|
} /* print_exit_code() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* write_to_file() */
|
|
/* */
|
|
/****************************************/
|
|
/* This is the 'visit node' point for the tree traversal
|
|
* functions of Pass 2 (traverse_tree() and descend_tree()).
|
|
*
|
|
* Each tree node = word or stem + linked list of dbas.
|
|
* When called, each dba list member just contains the number
|
|
* of times the token appears in that document. This function
|
|
* chains through the list, builds a statistical 'weight'
|
|
* for each doc/word pair, and stores it as a reformatted 'dba'
|
|
* in array 'record_addr_word[]', in 'host' byte swap order.
|
|
* The count of the current number of addrs
|
|
* in the array is stored in 'num_addrs_for_word'.
|
|
* Fill_data1() is then called to update or write a new
|
|
* vista record and d99 data for the token.
|
|
*
|
|
* The weight stored for each doc-word instance is 1 byte.
|
|
* It's the ratio of log of number of times given word occurs in doc,
|
|
* divided by log of total count of all words in doc,
|
|
* scaled to range 0 to 255.
|
|
* Fundamentally it's a word count of that word in the doc,
|
|
* but adjusted as follows:
|
|
* 1) Large occurrences in small documents weigh more than
|
|
* the same number of occurrences in large documents.
|
|
* 2) Taking the log skews the ratio to be more linear,
|
|
* ie take advantage of higher ranges of the 'weight'.
|
|
* For example a word that occurs in 10% of the document,
|
|
* will have a weight of .5 (50%).
|
|
* 3) The scaling changes the ratio, a float between 0. and .9999,
|
|
* to an integer between 0 and 255.
|
|
*/
|
|
void write_to_file (TREENODE * output_node)
|
|
{
|
|
DBALIST *print_dba;
|
|
DB_ADDR mydba;
|
|
|
|
/* 'record_addr_word[]' was permanently allocated
|
|
* with a size = max batch size so it can hold
|
|
* all the addrs for a single word node in the tree.
|
|
* In effect it will replace the dba linked list.
|
|
* Note: word_addrs_ii (io buffer for d99 file) != record_addr_word[].
|
|
*/
|
|
|
|
if (debugging & (DEBUG_T | DEBUG_t)) { /* Print out tree node */
|
|
printf (" node '%s' %c%c%c",
|
|
displayable(output_node->word),
|
|
(output_node->llink)? 'L' : '.',
|
|
(output_node->rlink)? 'R' : '.',
|
|
(debugging & DEBUG_t)? '\n' : ' ');
|
|
}
|
|
|
|
num_addrs_for_word = 0; /* DtSrINT32 */
|
|
print_dba = output_node->dba_list;
|
|
while (print_dba != NULL) {
|
|
|
|
mydba = print_dba->dba;
|
|
if (debugging & DEBUG_t)
|
|
printf (" dba #%ld: node adr=%ld cnt=%ld",
|
|
(long)num_addrs_for_word, (long)mydba, (long)print_dba->w_c);
|
|
|
|
record_addr_word [num_addrs_for_word] =
|
|
mydba << 8; /* rec# in hi 3 bytes */
|
|
record_addr_word [num_addrs_for_word] +=
|
|
(log ((double) (print_dba->w_c) + 0.5) /
|
|
log ((double) (dbas_word_count[mydba] + 1))) * 256;
|
|
|
|
if (debugging & DEBUG_t)
|
|
printf (" -> x%lx (%ld:%ld)\n",
|
|
(long)record_addr_word [num_addrs_for_word],
|
|
(long)record_addr_word [num_addrs_for_word] >> 8,
|
|
(long)record_addr_word [num_addrs_for_word] & 0xffL);
|
|
|
|
print_dba = print_dba->next_dba;
|
|
num_addrs_for_word++;
|
|
if (num_addrs_for_word >= batch_size) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 280,
|
|
"\n%s num_addrs_for_word (%ld) >= batchsz (%ld).\n"),
|
|
PROGNAME"280", (long)num_addrs_for_word, (long)batch_size);
|
|
DtSearchExit (91);
|
|
}
|
|
}
|
|
if ((debugging & DEBUG_T) && !(debugging & DEBUG_t))
|
|
printf (" dbacnt=%ld\n", (long)num_addrs_for_word);
|
|
|
|
fill_data1 (output_node->word);
|
|
|
|
return;
|
|
} /* write_to_file() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* descend_tree() */
|
|
/* */
|
|
/****************************************/
|
|
/* Coroutine of traverse_tree(), Pass 2 Robson tree traversal.
|
|
* The write_to_file() function is the 'preorder visit' point.
|
|
*/
|
|
void descend_tree (void)
|
|
{
|
|
int not_done = TRUE;
|
|
|
|
while (not_done) {
|
|
if ((pres->llink == NULL) && (pres->rlink == NULL)) {
|
|
write_to_file (pres);
|
|
avail_node = pres;
|
|
return;
|
|
}
|
|
if (pres->llink != NULL) {
|
|
next = pres->llink;
|
|
pres->llink = prev;
|
|
prev = pres;
|
|
pres = next;
|
|
}
|
|
else {
|
|
write_to_file (pres);
|
|
next = pres->rlink;
|
|
pres->rlink = prev;
|
|
prev = pres;
|
|
pres = next;
|
|
}
|
|
}
|
|
return;
|
|
} /* descend_tree() */
|
|
|
|
|
|
/********************************/
|
|
/* */
|
|
/* traverse_tree */
|
|
/* */
|
|
/********************************/
|
|
/* This is the actual Pass 2 function, a tree traversal
|
|
* of Pass 1's word-dba binary tree.
|
|
* The algorithm is based on the J. M. ROBSON link inversion traversal
|
|
* algorithm for binary trees. Ref. Thomas A. STANDISH pp. 77-78.
|
|
* The write_to_file() function is the 'preorder visit' point.
|
|
*/
|
|
void traverse_tree (void)
|
|
{
|
|
int not_done = TRUE;
|
|
int descend = TRUE;
|
|
|
|
/* Dheck for the empty tree */
|
|
if (root_node == NULL) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 288,
|
|
"%s Abort. There are no words in the input file %s.\n"),
|
|
PROGNAME"288", fname_input);
|
|
DtSearchExit (34);
|
|
}
|
|
/* Initialize the variables */
|
|
pres = root_node;
|
|
prev = pres;
|
|
top_of_stack = NULL;
|
|
stack = NULL;
|
|
|
|
while (not_done) {
|
|
if (descend) {
|
|
descend_tree ();
|
|
}
|
|
if (pres == root_node) {
|
|
return;
|
|
}
|
|
if (prev->rlink == NULL) {
|
|
write_to_file (prev);
|
|
next = prev->llink;
|
|
prev->llink = pres;
|
|
pres = prev;
|
|
prev = next;
|
|
descend = FALSE;
|
|
}
|
|
else {
|
|
if (prev->llink == NULL) {
|
|
next = prev->rlink;
|
|
prev->rlink = pres;
|
|
pres = prev;
|
|
prev = next;
|
|
descend = FALSE;
|
|
}
|
|
else {
|
|
if (prev == top_of_stack) {
|
|
next = stack;
|
|
top_of_stack = stack->rlink;
|
|
stack = stack->llink;
|
|
next->llink = NULL;
|
|
next->rlink = NULL;
|
|
next = prev->llink;
|
|
prev->llink = prev->rlink;
|
|
prev->rlink = pres;
|
|
pres = prev;
|
|
prev = next;
|
|
descend = FALSE;
|
|
}
|
|
else {
|
|
write_to_file (prev);
|
|
avail_node->llink = stack;
|
|
avail_node->rlink = top_of_stack;
|
|
stack = avail_node;
|
|
top_of_stack = prev;
|
|
next = prev->rlink;
|
|
prev->rlink = pres;
|
|
pres = next;
|
|
descend = TRUE;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} /* traverse_tree() */
|
|
|
|
|
|
|
|
/********************************************************/
|
|
/* */
|
|
/* print_usage_msg */
|
|
/* */
|
|
/********************************************************/
|
|
static void print_usage_msg (void)
|
|
{
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 17,
|
|
"\n"
|
|
"USAGE: %s -d<dbname> [options] <infile>\n"
|
|
" Listed default file name extensions can be overridden.\n"
|
|
" -d<dbname> 1 - 8 character database name, include optional path prefix.\n"
|
|
" -t<etxstr> End of text document delimiter string. Default '\\f\\n'.\n"
|
|
" -r<N> Change Pass 1 records-per-dot from %d to <N>.\n"
|
|
" -b<N> Change max batch size from %ld to <N>.\n"
|
|
" -c<N> Change database paging cache from %ld 1K pages to <N> 1K pages.\n"
|
|
" <N> >= 16 by powers of 2. Initially try only small changes.\n"
|
|
" -i<N> Change (i)nput buffer size from default %d to <N>.\n"
|
|
" -h<N> Change duplicate record id hash table size from %ld to <N>.\n"
|
|
" -h0 means there are no duplicates, do not check for them.\n"
|
|
" <infile> Input [path]file name. Default extension %s.\n"),
|
|
aa_argv0,
|
|
(int) RECS_PER_DOT,
|
|
(long) BATCH_SIZE, (long) CACHE_SIZE,
|
|
(int) INBUFSZ, default_hashsize, EXT_FZKEY);
|
|
return;
|
|
} /* print_usage_msg() */
|
|
|
|
|
|
/********************************************************/
|
|
/* */
|
|
/* segregate_dicname */
|
|
/* */
|
|
/********************************************************/
|
|
/* Separates dictionary name from pathname and loads
|
|
* them into the globals 'dicname' and 'dicpath'.
|
|
* Returns TRUE if dicname is valid, else returns FALSE.
|
|
*/
|
|
static int segregate_dicname (char *string)
|
|
{
|
|
char mybuf[_POSIX_PATH_MAX];
|
|
char *ptr;
|
|
int i;
|
|
|
|
strncpy (mybuf, string, sizeof (mybuf));
|
|
mybuf[sizeof (mybuf) - 1] = 0;
|
|
|
|
/*
|
|
* Set 'ptr' to just the 8 char dictionary name by moving
|
|
* it backwards until first non-alphanumeric character
|
|
* (such as a ":" in the dos drive id or a slash between directories),
|
|
* or to the beginning of string.
|
|
*/
|
|
for (ptr = mybuf + strlen (mybuf) - 1; ptr >= mybuf; ptr--)
|
|
if (!isalnum (*ptr)) {
|
|
ptr++;
|
|
break;
|
|
}
|
|
if (ptr < mybuf)
|
|
ptr = mybuf;
|
|
|
|
/* test for valid dictionary name */
|
|
i = strlen (ptr);
|
|
if (i < 1 || i > 8)
|
|
return FALSE;
|
|
|
|
strcpy (dicname, ptr);
|
|
*ptr = 0;
|
|
strncpy (dicpath, mybuf, sizeof (dicpath));
|
|
dicpath[sizeof (dicpath) - 1] = 0;
|
|
return TRUE;
|
|
} /* segregate_dicname() */
|
|
|
|
|
|
/********************************************************/
|
|
/* */
|
|
/* USER_ARGS_PROCESSOR */
|
|
/* */
|
|
/********************************************************/
|
|
/* handles command line arguments for 'main' */
|
|
void user_args_processor (int argc, char **argv)
|
|
{
|
|
char *argptr;
|
|
char *targ, *src;
|
|
int i;
|
|
|
|
if (argc <= 1) {
|
|
print_usage_msg ();
|
|
DtSearchExit (2);
|
|
}
|
|
/* Initialize some variables prior to parsing command line */
|
|
dicname[0] = 0;
|
|
dicpath[0] = 0;
|
|
|
|
/* Each pass grabs new parm of "-xxx" format */
|
|
while (--argc > 0 && (*++argv)[0] == '-') {
|
|
argptr = argv[0];
|
|
switch (argptr[1]) {
|
|
|
|
case 't': /* ETX delimiter string */
|
|
/* Replace any "\n" string with real linefeed */
|
|
targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
|
|
src = argptr + 2;
|
|
while (*src) {
|
|
if (src[0] == '\\' && src[1] == 'n') {
|
|
*targ++ = '\n';
|
|
src += 2;
|
|
}
|
|
else
|
|
*targ++ = *src++;
|
|
}
|
|
*targ = 0;
|
|
break;
|
|
|
|
case 'r':
|
|
if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 577,
|
|
"%s Invalid arg '%s'. Using default -r%d.\n"),
|
|
PROGNAME"577", argptr, RECS_PER_DOT);
|
|
recs_per_dot = RECS_PER_DOT;
|
|
}
|
|
break;
|
|
|
|
case 'h':
|
|
duprec_hashsize = atol (argptr + 2);
|
|
if (duprec_hashsize == 0UL)
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 539,
|
|
"%s Duplicate record id checking disabled.\n"),
|
|
PROGNAME"539");
|
|
break;
|
|
|
|
case 'b':
|
|
batch_size = atol (argptr + 2);
|
|
if (batch_size <= 0L) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 595,
|
|
"%s Invalid batch size argument '%s'.\n"),
|
|
PROGNAME"595", argptr);
|
|
goto BADPARM;
|
|
}
|
|
break;
|
|
|
|
case 'c':
|
|
cache_size = atoi (argptr + 2);
|
|
if (cache_size < 16) {
|
|
/* minimum size is 16 */
|
|
if (cache_size > 0)
|
|
cache_size = 16;
|
|
/* on error reset size to default */
|
|
else
|
|
cache_size = CACHE_SIZE;
|
|
CACHE_ADJUSTED:
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 600,
|
|
"%sCache size readjusted to %d.\n"),
|
|
PROGNAME "600 ", cache_size);
|
|
break;
|
|
}
|
|
/* If necessary, round up to nearest power of 2 */
|
|
for (i = 4; i < 12; i++)
|
|
if (1 << i >= cache_size)
|
|
break;
|
|
i = 1 << i;
|
|
if (i != cache_size) {
|
|
cache_size = i;
|
|
goto CACHE_ADJUSTED;
|
|
}
|
|
break;
|
|
|
|
case 'D': /* unadvertised debugging feature */
|
|
for (i = 2; argptr[i] != 0; i++) {
|
|
switch (argptr[i]) {
|
|
case 'I': debugging |= DEBUG_I; break;
|
|
case 'P': debugging |= DEBUG_P;
|
|
/******* debugging_teskey = TRUE; ******/
|
|
break;
|
|
case 'N': debugging |= DEBUG_N; break;
|
|
case 'n': debugging |= DEBUG_n; break;
|
|
case 'O': debugging |= DEBUG_O; break;
|
|
case 'o': debugging |= DEBUG_o; break;
|
|
case 'T': debugging |= DEBUG_T; break;
|
|
case 't': debugging |= DEBUG_t; break;
|
|
default: goto BADPARM;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case 'd':
|
|
/* May include both dicname and dicpath */
|
|
if (!segregate_dicname (argptr + 2)) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 550,
|
|
"%s '%s' is invalid path/database name.\n"),
|
|
PROGNAME"550", argptr);
|
|
goto BADPARM;
|
|
}
|
|
break;
|
|
|
|
case 'i': /* (I)nput buffer size */
|
|
if ((inbufsz = atol (argptr + 2)) <= 0) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 558,
|
|
"%s Invalid input buffer size '%s'.\n"),
|
|
PROGNAME"558", argptr);
|
|
goto BADPARM;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 567,
|
|
"%s Unknown command line argument '%s'.\n"),
|
|
PROGNAME"567", argptr);
|
|
BADPARM:
|
|
print_usage_msg ();
|
|
DtSearchExit (2); /* abort */
|
|
|
|
} /* endswitch */
|
|
} /* endwhile for cmd line '-'processing */
|
|
|
|
/* Validate input file name */
|
|
if (argc-- <= 0) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 580,
|
|
"%s Missing required input file name.\n"),
|
|
PROGNAME"580");
|
|
goto BADPARM;
|
|
}
|
|
/* Don't incr argv yet--save input file name */
|
|
else
|
|
append_ext (fname_input, _POSIX_PATH_MAX, argv[0], EXT_FZKEY);
|
|
|
|
/* Check for missing database name */
|
|
if (dicname[0] == 0) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 589,
|
|
"%s No database name specified (-d argument).\a\n"),
|
|
PROGNAME"589");
|
|
goto BADPARM;
|
|
}
|
|
strcpy (dblk.name, dicname);
|
|
dblk.path = dicpath;
|
|
return;
|
|
} /* user_args_processor() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* put_addrs_2_dtbs_addr_file */
|
|
/* */
|
|
/****************************************/
|
|
/* Suboutine of write_2_dtbs_addr_file() from Pass 2.
|
|
* That function has used a bit vector to determine
|
|
* the total change in old d99 addrs for preexisting words,
|
|
* and prepared for writing an array of old dbas that
|
|
* are not in the current words tree node (globally named
|
|
* word_addrs_ii [num_addrs]).
|
|
* The addrs that ARE in the Pass 1 node fzk file were previously
|
|
* prepared in a similar array of dbas, globally named
|
|
* record_addr_word [num_addrs_for_word] but passed here as
|
|
* 'addrs_array' and 'nitems'.
|
|
* Both arrays will be byte swapped from 'host' to
|
|
* 'network' order in this function.
|
|
* This function does the actual fwrite of both arrays to the d99.
|
|
* If the number of new addrs can fit in the available free slots,
|
|
* it rewrites to original offset, otherwise appends to end of d99.
|
|
*/
|
|
static void put_addrs_2_dtbs_addr_file (
|
|
DB_ADDR *addrs_array,
|
|
DtSrINT32 nitems)
|
|
{
|
|
FREE_SPACE_STR *free_slot;
|
|
FREE_SPACE_STR del_rec;
|
|
DtSrINT32 int32;
|
|
DtSrINT32 num_writes;
|
|
DtSrINT32 num_addrs;
|
|
|
|
if (nitems >= batch_size) {
|
|
printf ( catgets(dtsearch_catd, MS_cborodin, 6,
|
|
"put_addrs_2_dtbs_addr_file() nitems=%d, batchsz=%ld\n") ,
|
|
(int)nitems, (long)batch_size);
|
|
DtSearchExit (58);
|
|
}
|
|
|
|
num_addrs = got_word.or_hwaddrs;
|
|
got_word.or_hwaddrs += nitems; /** somehow, this can exceed total
|
|
**** num addrs in database by 1 (!?) ******/
|
|
/* (...only if prev 'overlay/compression' didn't delete all) */
|
|
|
|
#ifdef BYTE_SWAP
|
|
/* Put both arrays in 'network' byte order */
|
|
for (int32 = 0; int32 < nitems; int32++)
|
|
HTONL (addrs_array[int32]);
|
|
for (int32 = 0; int32 < num_addrs; int32++)
|
|
HTONL (word_addrs_ii[int32]);
|
|
#endif
|
|
|
|
/*
|
|
* If number of new addresses greater than number of free holes,
|
|
* find new free slot that is big enough to hold the data .
|
|
*/
|
|
if (nitems > got_word.or_hwfree) {
|
|
/* Discard old slot, find new one. */
|
|
del_rec.hole_size = num_addrs + got_word.or_hwfree;
|
|
del_rec.offset = got_word.or_hwoffset;
|
|
free_slot = find_free_space (got_word.or_hwaddrs, &fl_hdr);
|
|
add_free_space (&del_rec, &fl_hdr);
|
|
if (free_slot == NULL) {
|
|
fseek (dtbs_addr_fp, 0L, SEEK_END);
|
|
got_word.or_hwoffset = ftell (dtbs_addr_fp);
|
|
got_word.or_hwfree = 0;
|
|
}
|
|
else {
|
|
fseek (dtbs_addr_fp, free_slot->offset, SEEK_SET);
|
|
got_word.or_hwoffset = free_slot->offset;
|
|
got_word.or_hwfree = free_slot->hole_size -
|
|
got_word.or_hwaddrs;
|
|
}
|
|
/*----- Write new database addresses to a file -----*/
|
|
num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
|
|
(size_t)nitems, dtbs_addr_fp);
|
|
if (num_writes != nitems) {
|
|
DtSearchExit (98);
|
|
}
|
|
|
|
/* Copy the old addresses immediately after the new ones */
|
|
num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR), (size_t)num_addrs,
|
|
dtbs_addr_fp);
|
|
if (num_writes != num_addrs) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
|
|
PROGNAME"776", strerror(errno));
|
|
DtSearchExit (76);
|
|
}
|
|
|
|
/* Write foxes to the free holes, if any, no byte swap */
|
|
for (int32 = 0; int32 < got_word.or_hwfree; int32++)
|
|
addrs_array [int32] = 0xFFFFFFFF;
|
|
num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
|
|
(size_t)got_word.or_hwfree, dtbs_addr_fp);
|
|
if (num_writes != got_word.or_hwfree) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
|
|
PROGNAME"786", strerror(errno));
|
|
DtSearchExit (86);
|
|
}
|
|
} /* end if (nitems > got_word.or_hwfree), had to get bigger slot */
|
|
|
|
/* Else can reuse existing slot.
|
|
* Write the new addresses into free holes.
|
|
* The remaining free holes should already have foxes. (?)
|
|
*/
|
|
else {
|
|
fseek (dtbs_addr_fp, got_word.or_hwoffset, SEEK_SET);
|
|
num_writes = fwrite (addrs_array, sizeof(DB_ADDR),
|
|
(size_t)nitems, dtbs_addr_fp);
|
|
if (num_writes != nitems) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
|
|
PROGNAME"798", strerror(errno));
|
|
DtSearchExit (87);
|
|
}
|
|
/* Copy the old addresses immediately after the new ones */
|
|
num_writes = fwrite (word_addrs_ii, sizeof(DB_ADDR),
|
|
(size_t)num_addrs, dtbs_addr_fp);
|
|
if (num_writes != num_addrs) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
|
|
PROGNAME"889", strerror(errno));
|
|
DtSearchExit (89);
|
|
}
|
|
got_word.or_hwfree -= nitems;
|
|
}
|
|
} /* put_addrs_2_dtbs_addr_file() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* write_2_dtbs_addr_file */
|
|
/* */
|
|
/****************************************/
|
|
/* Subroutine of fill_data1() from Pass 2.
|
|
* Updates OLD (preexisting) word's d99 file.
|
|
*
|
|
* The vista word rec has already been read into global 'got_word'.
|
|
* record_addr_word [num_addrs_for_word] is the array of dba's
|
|
* for docs from this batch that contain the current word (built by
|
|
* fill_data1 from the dba_list for the word's Pass 1 binary tree node,
|
|
* and still in 'host' byte swap order).
|
|
* This function freads all the old addresses for that word from
|
|
* the d99 file. It then deletes(!) d99 addrs that
|
|
* are in the word's Pass 1 tree node. It then calls
|
|
* put_addrs_2_dtbs_addr_file() to fwrite out the
|
|
* dba's in the tree, which are either brand new,
|
|
* or are 'updating' the deleted addrs.
|
|
* Then it writes the modified old addrs.
|
|
* Then rewrites vista word rec with new data.
|
|
*
|
|
* The bit vector dbas_bits_batch contains a 1 bit
|
|
* for every dba for every doc in the fzk file.
|
|
* got_word structure:
|
|
* .or_hwordkey - the word. (always in a 'huge' word buffer).
|
|
* .or_hwoffset - offset in a d99 inverted index file for
|
|
* a given word. the first address starts
|
|
* at this position.
|
|
* .or_hwaddrs - total number of addresses for a given word.
|
|
* .or_hwfree - number of free slots in a database
|
|
* addresses file for a given word.
|
|
*/
|
|
void write_2_dtbs_addr_file (void)
|
|
{
|
|
DtSrINT32 num_addrs_ii;
|
|
DtSrINT32 num_reads;
|
|
DtSrINT32 i_start, k, cur_ind;
|
|
DtSrINT32 num_delete_addrs = 0;
|
|
char addrs_removed = FALSE;
|
|
DtSrINT32 i;
|
|
DtSrINT32 cur_byte;
|
|
char bit_addrs;
|
|
DB_ADDR temp1;
|
|
|
|
if (debugging & DEBUG_O)
|
|
printf (" old vis '%s' ofs=%ld adr=%ld fre=%ld\n",
|
|
displayable(got_word.or_hwordkey),
|
|
(long) got_word.or_hwoffset,
|
|
(long) got_word.or_hwaddrs,
|
|
(long) got_word.or_hwfree);
|
|
|
|
num_addrs_ii = got_word.or_hwaddrs;
|
|
if (num_addrs_ii > or_reccount) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 713,
|
|
"\n%s Word '%s' occurs in %ld records,\n"
|
|
" but there are only %ld records in database!\n"
|
|
" (This may be a good candidate for the stoplist).\n"),
|
|
PROGNAME"713",
|
|
(long) got_word.or_hwordkey,
|
|
(long) num_addrs_ii,
|
|
(long) or_reccount);
|
|
DtSearchExit (68);
|
|
}
|
|
|
|
if (fseek (dtbs_addr_fp, (long) got_word.or_hwoffset, SEEK_SET) != 0)
|
|
{
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 875,
|
|
"\n%s Could not fseek d99 file to offset %ld.\n"),
|
|
PROGNAME"875", got_word.or_hwoffset);
|
|
DtSearchExit (98);
|
|
}
|
|
num_reads = fread (word_addrs_ii, sizeof(DB_ADDR),
|
|
(size_t)num_addrs_ii, dtbs_addr_fp);
|
|
if (num_reads != num_addrs_ii) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 848,
|
|
"\n%s Could not fread %ld bytes (%ld dba's) of d99 file\n"
|
|
" at offset %ld. Number of dba's read (return code) = %ld.\n"),
|
|
PROGNAME"848", sizeof(DB_ADDR) * num_addrs_ii, (long)num_addrs_ii,
|
|
(long)got_word.or_hwoffset, (long)num_reads);
|
|
DtSearchExit (98);
|
|
}
|
|
#ifdef BYTE_SWAP
|
|
for (i = 0; i < num_addrs_ii; i++)
|
|
NTOHL (word_addrs_ii[i]);
|
|
/* Now both addr arrays are in 'host' byte swap order */
|
|
#endif
|
|
|
|
/* If there are only new docs,
|
|
* this switch will prevent the checking for updates.
|
|
*/
|
|
if (check_existing_addrs) {
|
|
i_start = 0;
|
|
|
|
/* Loop on every preexisting dba for word as read from d99 */
|
|
for (i = 0; i < num_addrs_ii; i++) {
|
|
if (debugging & DEBUG_o)
|
|
printf (" old d99 %ld: x%lx(%ld:%ld)",
|
|
(long) i,
|
|
(long) word_addrs_ii[i],
|
|
(long) word_addrs_ii[i] >> 8,
|
|
(long) word_addrs_ii[i] & 0xffL);
|
|
|
|
/* Get 'record number' by shifting hi 3 bytes 1 byte (8 bits)
|
|
* to right over stat wt byte. D99 rec#'s start at 1,
|
|
* so subtract 1 to start at 0 for bit vector.
|
|
*/
|
|
temp1 = (*(word_addrs_ii + i) >> 8) - 1; /* = rec#, base 0 */
|
|
cur_byte = temp1 >> 3; /* get matching byte# in bit vector */
|
|
if (cur_byte >= bit_vector_size) {
|
|
printf ( catgets(dtsearch_catd, MS_cborodin, 9,
|
|
"\n%s Corrupted d99 file for word '%s',\n"
|
|
" database address %ld @ file position %ld => bitvector[%ld],"
|
|
" but max bitvector allocation = %ld.\n") ,
|
|
PROGNAME"727", displayable(got_word.or_hwordkey),
|
|
(long)temp1, (long)i,
|
|
(long)cur_byte, (long)bit_vector_size);
|
|
DtSearchExit (69);
|
|
}
|
|
bit_addrs = 0;
|
|
bit_addrs |= 1 << (temp1 % 8); /* bit mask */
|
|
/*
|
|
* If this dba, which is on the current word's old d99
|
|
* addrs list, is also a doc in the fzk file (dbas_bits_batch),
|
|
* delete it from the d99 list by writing subsequent dba's
|
|
* over it. Boy this recursive nested loop has gotta be slow.
|
|
* Faster algorithm? Add 'good' addrs to the end of
|
|
* record_addr_word[]. No nested overlay loop, only one write!
|
|
*/
|
|
if (bit_addrs & (*(dbas_bits_batch + cur_byte))) {
|
|
addrs_removed = TRUE;
|
|
num_delete_addrs++;
|
|
if (i_start == 0) {
|
|
cur_ind = i;
|
|
i_start = i + 1;
|
|
}
|
|
else {
|
|
if (i_start < i) {
|
|
/* compress: move good addrs over
|
|
* space of deleted ones */
|
|
for (k = i_start; k < i; k++) {
|
|
word_addrs_ii[cur_ind] = word_addrs_ii[k];
|
|
cur_ind++;
|
|
}
|
|
}
|
|
i_start = i + 1;
|
|
}
|
|
} /* end if where dba is on both fzk list and curr d99 */
|
|
} /* end loop on every d99 addr for this word */
|
|
|
|
if (addrs_removed) { /* final overlay compression */
|
|
if (i_start < i) {
|
|
/* compress: move good addrs over
|
|
* space of deleted ones */
|
|
for (k = i_start; k < i; k++) {
|
|
word_addrs_ii[cur_ind] = word_addrs_ii[k];
|
|
cur_ind++;
|
|
}
|
|
}
|
|
}
|
|
} /* end if (check_existing_addrs) */
|
|
|
|
got_word.or_hwaddrs -= num_delete_addrs;
|
|
got_word.or_hwfree += num_delete_addrs;
|
|
|
|
/* The old dba array word_addrs_ii[] is now 'compressed',
|
|
* it contains only addrs not in fzk file.
|
|
* And the vista rec 'got_word' now matches it.
|
|
* And record_addr_word[] still contains
|
|
* the new/updated addrs from the fzk file.
|
|
* Now Efim calls a func to write them both back out to d99 file.
|
|
*/
|
|
put_addrs_2_dtbs_addr_file (record_addr_word, num_addrs_for_word);
|
|
write_wordstr (&got_word, 0); /* update vista WORD rec */
|
|
|
|
return;
|
|
} /* write_2_dtbs_addr_file() */
|
|
|
|
|
|
/********************************/
|
|
/* */
|
|
/* write_new_word_2_dtbs */
|
|
/* */
|
|
/********************************/
|
|
/* Subroutine of fill_data1() in Pass 2 for a NEW word.
|
|
* Writes d99 data, and updates (empty) got_word vista record.
|
|
* record_addr_word [num_addrs_for_word] is the array of addrs
|
|
* for docs from this batch that contain the current word (built by
|
|
* fill_data1 from the dba_list for the word's Pass 1 binary tree node).
|
|
* It will be byte swapped from 'host' to 'network' order in this function.
|
|
*/
|
|
void write_new_word_2_dtbs (void)
|
|
{
|
|
FREE_SPACE_STR *free_slot;
|
|
DtSrINT32 num_writes;
|
|
int ret_fseek;
|
|
DtSrINT32 int32;
|
|
|
|
if (debugging & (DEBUG_n | DEBUG_N))
|
|
printf (" new word '%s', adrs=%ld,",
|
|
got_word.or_hwordkey, (long)num_addrs_for_word);
|
|
|
|
free_slot = find_free_space (num_addrs_for_word, &fl_hdr);
|
|
if (free_slot == NULL) {
|
|
/* append addrs to end of d99 file */
|
|
ret_fseek = fseek (dtbs_addr_fp, 0L, SEEK_END);
|
|
got_word.or_hwoffset = ftell (dtbs_addr_fp);
|
|
got_word.or_hwfree = 0;
|
|
if (debugging & (DEBUG_n | DEBUG_N))
|
|
printf ("APPEND ofs=%ld, fre=0\n", (long int) got_word.or_hwoffset);
|
|
}
|
|
else {
|
|
ret_fseek = fseek (dtbs_addr_fp,
|
|
(long)free_slot->offset, SEEK_SET);
|
|
got_word.or_hwoffset = free_slot->offset;
|
|
got_word.or_hwfree = free_slot->hole_size -
|
|
num_addrs_for_word;
|
|
if (debugging & (DEBUG_n | DEBUG_N))
|
|
printf (" REUSE slot ofs=%ld, fre=%ld\n",
|
|
(long int) got_word.or_hwoffset, (long int) got_word.or_hwfree);
|
|
}
|
|
|
|
/***** Write new database addresses to d99 file *********/
|
|
if (debugging & DEBUG_n) {
|
|
for (int32 = 0; int32 < num_addrs_for_word; int32++) {
|
|
printf (" dba #%ld: x%lx(%ld:%ld)\n",
|
|
(long)int32,
|
|
(long)record_addr_word[int32],
|
|
(long)record_addr_word[int32] >> 8,
|
|
(long)record_addr_word[int32] & 0xffL);
|
|
}
|
|
}
|
|
#ifdef BYTE_SWAP
|
|
/* Put addr array in 'network' byte order */
|
|
for (int32 = 0; int32 < num_addrs_for_word; int32++)
|
|
HTONL (record_addr_word[int32]);
|
|
#endif
|
|
num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
|
|
(size_t)num_addrs_for_word, dtbs_addr_fp);
|
|
if (num_writes != num_addrs_for_word)
|
|
DtSearchExit (97);
|
|
|
|
got_word.or_hwaddrs = num_addrs_for_word;
|
|
|
|
if (got_word.or_hwfree != 0) {
|
|
/* Fill unused free holes with foxes for debugging.
|
|
* Note that byte swap is unnecessary for foxes.
|
|
* Note that record_addr_word is now available for this action.
|
|
*/
|
|
for (int32 = 0; int32 < got_word.or_hwfree; int32++)
|
|
*(record_addr_word + int32) = 0xFFFFFFFF;
|
|
num_writes = fwrite (record_addr_word, sizeof(DB_ADDR),
|
|
(size_t)got_word.or_hwfree, dtbs_addr_fp);
|
|
if (num_writes != got_word.or_hwfree) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
|
|
PROGNAME"960", strerror(errno));
|
|
DtSearchExit (96);
|
|
}
|
|
}
|
|
|
|
/* Save changed word_info structure back to the vista database! */
|
|
write_wordstr (&got_word, 0);
|
|
return;
|
|
} /* write_new_word_2_dtbs() */
|
|
|
|
|
|
/************************/
|
|
/* */
|
|
/* fill_data1 */
|
|
/* */
|
|
/************************/
|
|
/* Called from write_to_file() in Pass 2.
|
|
* Write_to_file() is 'visit node' function of tree traversal.
|
|
* It has converted dbalist in each word node in tree to
|
|
* array of dbas (record_addr_word [num_addrs_for_word])
|
|
* with correct statistical weighting, still in 'host' byte swap order.
|
|
* This function seeks word key in database. If word is new,
|
|
* it calls functions to write new vista rec and d99 data.
|
|
* If word is old it calls functions to read word rec and update d99.
|
|
*/
|
|
void fill_data1 (char *node_word)
|
|
{
|
|
char miker[1024];
|
|
strcpy (miker, node_word);
|
|
|
|
count_word_ii++;
|
|
if (shutdown_now) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 164,
|
|
"\n%s Abort due to signal %d. Database %s\n"
|
|
" probably corrupted. Restore backup database.\n"),
|
|
PROGNAME"164", shutdown_now, dicname);
|
|
DtSearchExit (10);
|
|
}
|
|
|
|
/* print occasional progress dots and msgs */
|
|
if (!(count_word_ii % words_per_dot)) {
|
|
putchar ('.');
|
|
dotcount++;
|
|
if (!(dotcount % 10))
|
|
putchar (' ');
|
|
if (dotcount >= 50) {
|
|
dotcount = 0;
|
|
seconds_left = (unsigned long)
|
|
(((float) num_of_diff_words /
|
|
(float) count_word_ii - 1.) *
|
|
(float) (time (NULL) - timestart));
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 849,
|
|
"\n%s: Word #%ld, %.0f%% done. Est %lum %02lus "
|
|
"to completion.\n"),
|
|
aa_argv0, count_word_ii,
|
|
(float) count_word_ii / (float) num_of_diff_words * 100.0,
|
|
/***(count_word_ii * 100L) / num_of_diff_words,***/
|
|
seconds_left / 60L, seconds_left % 60L);
|
|
}
|
|
else
|
|
fflush (stdout);
|
|
} /* endif for progress dots and msgs */
|
|
|
|
strncpy (got_word.or_hwordkey, node_word, DtSrMAXWIDTH_HWORD);
|
|
got_word.or_hwordkey[DtSrMAXWIDTH_HWORD - 1] = 0;
|
|
find_keyword (miker, 0); /* vista KEYFIND for word rec */
|
|
if (db_status == S_NOTFOUND) { /* this is a NEW word */
|
|
got_word.or_hwoffset = 0;
|
|
got_word.or_hwfree = 0;
|
|
got_word.or_hwaddrs = 0;
|
|
fillnew_wordrec (&got_word, 0); /* write (empty) vista word rec */
|
|
if (db_status != S_OKAY)
|
|
vista_abort (PROGNAME"981");
|
|
write_new_word_2_dtbs(); /* write NEW word's d99 entries
|
|
* and update vista word rec */
|
|
return;
|
|
}
|
|
|
|
/* update previously existing word */
|
|
read_wordstr (&got_word, 0); /* read OLD word rec into got_word */
|
|
if (db_status == S_OKAY)
|
|
write_2_dtbs_addr_file(); /* update OLD word's d99 entries
|
|
* and update vista word rec */
|
|
return;
|
|
} /* fill_data1() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* load_into_bintree */
|
|
/* */
|
|
/************************************************/
|
|
/* Pass 1 function.
|
|
* Loads parsed word token or stem token into
|
|
* inverted index binary tree along with passed dba.
|
|
* Token is allowed to be empty, ie first byte is \0.
|
|
* Derived from Efim's original 'teskey_parse()'
|
|
* and bin_tree() functions.
|
|
* Variables static for speeeeeeed.
|
|
*/
|
|
static void load_into_bintree (
|
|
char *parser_token,
|
|
int token_is_stem,
|
|
DB_ADDR dba)
|
|
{
|
|
static DtSrINT16 or_maxwordsz;
|
|
static char *cptr;
|
|
static int i;
|
|
static TREENODE **this_link;
|
|
static TREENODE *newnode;
|
|
static DBALIST *newdba;
|
|
static char *tokbuf = NULL;
|
|
|
|
if (*parser_token == 0) {
|
|
if (debugging & DEBUG_I)
|
|
printf (" bintr=<empty> dba=%ld\n", (long)dba);
|
|
return;
|
|
}
|
|
|
|
/* Copy token to a buffer.
|
|
* Stems have a special prefix character
|
|
* to distinguish them from words.
|
|
* Also increment total dba word count.
|
|
*/
|
|
if (tokbuf == NULL) {
|
|
or_maxwordsz = dblk.dbrec.or_maxwordsz;
|
|
tokbuf = austext_malloc ((size_t) or_maxwordsz + 4,
|
|
PROGNAME"1152", NULL);
|
|
}
|
|
if (token_is_stem) {
|
|
tokbuf[0] = STEM_CH;
|
|
strncpy (tokbuf + 1, parser_token, (size_t)or_maxwordsz);
|
|
dbas_word_count[dba]++;
|
|
}
|
|
else
|
|
strncpy (tokbuf, parser_token, (size_t)or_maxwordsz);
|
|
tokbuf [or_maxwordsz] = 0;
|
|
if (debugging & DEBUG_I)
|
|
printf (" bintr='%s' dba=%ld ", displayable(tokbuf), (long)dba);
|
|
|
|
/* TREE TRAVERSAL. Search binary tree to find either
|
|
* insertion point or identical preexisting token.
|
|
*/
|
|
for (this_link = &root_node; *this_link != NULL; ) {
|
|
i = strcmp (tokbuf, (*this_link)->word);
|
|
|
|
/* If identical word/stem token already exists... */
|
|
if (i == 0) {
|
|
/* If token appears more than once in current
|
|
* document (dba already exists at top of dba list),
|
|
* just increment the word count in the list.
|
|
*/
|
|
if ((*this_link)->dba_list->dba == dba)
|
|
(*this_link)->dba_list->w_c++;
|
|
|
|
/* If this is first appearance of token for this doc
|
|
* (dba is not at start of token's dba list),
|
|
* insert dba at start of token's dba list.
|
|
*/
|
|
else {
|
|
if ((newdba = malloc (sizeof(DBALIST))) == NULL) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 374,
|
|
msg_374), PROGNAME"1150");
|
|
DtSearchExit (26);
|
|
}
|
|
newdba->dba = dba;
|
|
newdba->w_c = 1;
|
|
newdba->next_dba = (*this_link)->dba_list;
|
|
(*this_link)->dba_list = newdba;
|
|
}
|
|
if (debugging & DEBUG_I)
|
|
printf (" Old %ld=%ld\n",
|
|
(long)((*this_link)->dba_list->dba),
|
|
(long)((*this_link)->dba_list->w_c));
|
|
return; /* done with token */
|
|
|
|
} /* endif where token was found in binary tree */
|
|
|
|
/* Increment link ptr by descending to correct subtree */
|
|
if (i < 0) {
|
|
this_link = &(*this_link)->llink;
|
|
if (debugging & DEBUG_I)
|
|
putchar ('L');
|
|
}
|
|
else {
|
|
this_link = &(*this_link)->rlink;
|
|
if (debugging & DEBUG_I)
|
|
putchar ('R');
|
|
}
|
|
} /* end tree traversal */
|
|
|
|
/* Tree traversal never found a preexisting token node.
|
|
* Create a new node and insert it at the point
|
|
* indicated by link ptr.
|
|
*/
|
|
newnode = austext_malloc (sizeof(TREENODE) + strlen(tokbuf) + 4,
|
|
PROGNAME"1234", NULL);
|
|
newnode->llink = NULL;
|
|
newnode->rlink = NULL;
|
|
newnode->word = (char *) (newnode + 1); /* use mem at end of node */
|
|
strcpy (newnode->word, tokbuf);
|
|
|
|
newdba = austext_malloc (sizeof(DBALIST), PROGNAME"1235", NULL);
|
|
newnode->dba_list = newdba;
|
|
newdba->dba = dba;
|
|
newdba->w_c = 1;
|
|
newdba->next_dba = NULL;
|
|
|
|
*this_link = newnode;
|
|
num_of_diff_words++;
|
|
|
|
if (debugging & DEBUG_I)
|
|
printf (" New %ld=%ld\n",
|
|
(long)((*this_link)->dba_list->dba),
|
|
(long)((*this_link)->dba_list->w_c));
|
|
return;
|
|
} /* load_into_bintree() */
|
|
|
|
|
|
/**********************************************/
|
|
/* */
|
|
/* MAIN */
|
|
/* */
|
|
/**********************************************/
|
|
main (int argc, char **argv)
|
|
{
|
|
int i;
|
|
long word_offset; /* <-- PARG.offsetp */
|
|
long bytes_in; /* ftell() */
|
|
DtSrINT32 dba_offset;
|
|
int got_ETX;
|
|
char *cptr, *src;
|
|
char temp_buf[40];
|
|
char db_key [DtSrMAX_DB_KEYSIZE + 2];
|
|
int oops = FALSE;
|
|
DtSrINT32 cur_byte;
|
|
struct tm *tmptr;
|
|
DB_ADDR dba, temp_dba;
|
|
time_t elapsed;
|
|
size_t mallocsz;
|
|
char *parsebufp, *stembufp;
|
|
|
|
/******************* INITIALIZE ******************/
|
|
setlocale (LC_ALL, "");
|
|
dtsearch_catd = catopen (FNAME_DTSRCAT, 0);
|
|
|
|
aa_argv0 = strdup (argv[0]);
|
|
time (&elapsed);
|
|
tmptr = localtime (&elapsed);
|
|
strftime (buf, sizeof(buf),
|
|
catgets (dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
|
|
tmptr);
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1, "%s. Run %s.\n"),
|
|
aa_argv0, buf);
|
|
austext_exit_last = print_exit_code;
|
|
batch_size = BATCH_SIZE;
|
|
init_user_interrupt ();
|
|
default_hashsize = duprec_hashsize;
|
|
|
|
memset (&dblk, 0, sizeof(DBLK));
|
|
|
|
memset (&parg, 0, sizeof(PARG));
|
|
parg.dblk = &dblk;
|
|
parg.etxdelim = ETXDELIM; /* default, can be changed */
|
|
parg.offsetp = &word_offset;
|
|
parg.flags |= PA_INDEXING; /* do compounding, if parser can */
|
|
|
|
/* Read user specified command line arguments */
|
|
user_args_processor (argc, argv);
|
|
|
|
/* Finish init now that we know final values */
|
|
inbuf = austext_malloc (inbufsz + 16, PROGNAME"1349", NULL);
|
|
temp = austext_malloc (inbufsz + 16, PROGNAME"1285", NULL);
|
|
sprintbuffer = austext_malloc (inbufsz + _POSIX_PATH_MAX + 16,
|
|
PROGNAME"1286", NULL);
|
|
record_addr_word = austext_malloc ((sizeof(DB_ADDR) * batch_size) + 16,
|
|
PROGNAME "1133", NULL);
|
|
|
|
/* Save dicname and path in dblk. Save full name of d99 file. */
|
|
strcpy (dblk.name, dicname);
|
|
dblk.path = dicpath;
|
|
strcpy (dtbs_addr_file, dicpath);
|
|
strcat (dtbs_addr_file, dicname);
|
|
strcat (dtbs_addr_file, EXT_DTBS);
|
|
|
|
/* Open the database */
|
|
if (!austext_dopen (dicname, dicpath, NULL, cache_size, &dbrec)) {
|
|
fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
|
|
DtSearchExit (3);
|
|
}
|
|
memcpy (&dblk.dbrec, &dbrec, sizeof(DBREC));
|
|
|
|
/* Load database's parser, stemmer, and linguistic files into dblk. */
|
|
if (!load_language (&dblk, NULL)) {
|
|
puts (DtSearchGetMessages());
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1097,
|
|
"%s Aborting due to errors in loading language files.\n"),
|
|
PROGNAME"1097");
|
|
DtSearchExit(3);
|
|
}
|
|
|
|
RECFRST (PROGNAME "1067", OR_OBJREC, 0);
|
|
CRGET (PROGNAME "1069", &dba, 0); /* byte swap already done in vista */
|
|
|
|
or_reccount = dbrec.or_reccount; /* DtSrINT32 */
|
|
or_recslots = dbrec.or_recslots; /* promoted to DtSrINT32 */
|
|
or_maxdba = dbrec.or_maxdba; /* DtSrINT32 lim of dbas_word_count */
|
|
bit_vector_size = ((or_maxdba / or_recslots + 1) >> 3) + 1; /* DtSrINT32 */
|
|
dba_offset = or_recslots - (dba & 0x00FFFFFF); /* DtSrINT32 */
|
|
|
|
if (debugging)
|
|
printf (PROGNAME"1286 "
|
|
"realnumrec=%ld recslots=%ld bitvecsz=%ld"
|
|
" dbaoffset=%d maxdba=%ld\n",
|
|
(long)or_reccount, (long)or_recslots, (long)bit_vector_size,
|
|
(int)dba_offset, (long)or_maxdba);
|
|
|
|
/* Allocate memory space for the arrays.
|
|
* dbas_bits_batch = 'bit vector', one bit for every possible rec#.
|
|
* the 1 bits = only the dba's that are in this fzk batch.
|
|
* word_addrs_ii = fread buffer for d99 file.
|
|
* dbas_word_count = summing bkts for word count statistics.
|
|
*/
|
|
dbas_bits_batch = (char *) austext_malloc ((size_t)bit_vector_size + 48,
|
|
PROGNAME "1150", NULL);
|
|
word_addrs_ii = (DB_ADDR *) austext_malloc (
|
|
sizeof (DB_ADDR) * (or_reccount + 1) + 48,
|
|
PROGNAME "1152", NULL);
|
|
mallocsz = sizeof(DtSrINT32) * (or_maxdba + 1) + 48;
|
|
dbas_word_count = (DtSrINT32 *) austext_malloc (mallocsz,
|
|
PROGNAME "1154", NULL);
|
|
memset (dbas_bits_batch, 0, (size_t)bit_vector_size + 48);
|
|
memset (dbas_word_count, 0, mallocsz);
|
|
|
|
root_node = NULL;
|
|
|
|
/* Open the d99 file that contains database addresses.
|
|
* If the file doesn't exist, it means the database
|
|
* for keyword search is empty - open a new file.
|
|
*/
|
|
if ((dtbs_addr_fp = fopen (dtbs_addr_file, "r+b")) == NULL) {
|
|
dtbs_addr_fp = fopen (dtbs_addr_file, "w+b");
|
|
check_existing_addrs = FALSE;
|
|
new_dtbs_file = TRUE;
|
|
if (dtbs_addr_fp == NULL) {
|
|
/* msg 1068 used multiple places */
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1068,
|
|
"%s Can't open new inverted index file '%s': %s\n"),
|
|
PROGNAME"1068", dtbs_addr_file, strerror(errno));
|
|
DtSearchExit (13);
|
|
}
|
|
/* write New Header Information to a file */
|
|
init_header (dtbs_addr_fp, &fl_hdr);
|
|
}
|
|
else {
|
|
/* read Header Information from d99 file */
|
|
if (!fread_d99_header (&fl_hdr, dtbs_addr_fp)) {
|
|
/* msg 1068 used multiple places */
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1068,
|
|
"%s Can't read header data for '%s': %s\n"),
|
|
PROGNAME"1422", dtbs_addr_file, strerror(errno));
|
|
DtSearchExit (13);
|
|
}
|
|
}
|
|
|
|
/* open input .fzk file */
|
|
src = getcwd (sprintbuffer, _POSIX_PATH_MAX);
|
|
if (!src && debugging)
|
|
printf (PROGNAME"1336 Can't getcwd: %s.\n", strerror(errno));
|
|
if (!src)
|
|
src = getenv ("PWD");
|
|
printf (catgets (dtsearch_catd, MS_misc, 24,
|
|
"%s: current working directory = '%s', .fzk file = '%s'\n"),
|
|
aa_argv0,
|
|
(src) ? src : catgets (dtsearch_catd, MS_misc, 6, "<unknown>"),
|
|
fname_input);
|
|
if ((instream = fopen (fname_input, "rt")) == NULL) {
|
|
BAD_INPUT_FILE:
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1083,
|
|
"%s Can't read input file '%s': %s\n"),
|
|
PROGNAME"1083", fname_input, strerror(errno));
|
|
DtSearchExit (14);
|
|
}
|
|
if (fstat (fileno (instream), &fstat_input) == -1)
|
|
goto BAD_INPUT_FILE;
|
|
parg.ftext = instream; /* for readchar_ftext(), discard_to_ETX() */
|
|
|
|
time (&totalstart); /* for total elapsed time */
|
|
timestart = totalstart; /* for Pass 1 elapsed time */
|
|
|
|
/*------------ PASS 1: ------------
|
|
* Main Read Loop. For each text record in input file,
|
|
* parse and stem words, store them into binary tree
|
|
* inverted index in memory.
|
|
* The first few lines are database administrative values.
|
|
* They are presumed ascii and read with fgets() as
|
|
* 'lines' terminated with \n. The text of the document
|
|
* itself is presumed to be in the appropriate database
|
|
* 'language', so it is *not* presumed to be lines
|
|
* terminated with \n. The document text is read by
|
|
* the language's parser() a 'word' at a time, which
|
|
* ultimately means a byte at a time.
|
|
*/
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1108,
|
|
"%s: Beginning Pass 1, reading records from '%s'.\n"
|
|
" Each dot = %d records.\n"),
|
|
aa_argv0, fname_input, recs_per_dot);
|
|
dotcount = 0;
|
|
|
|
while (!feof(instream)) {
|
|
|
|
/* 1. Read and discard the FZKEY line.
|
|
* 2. Read and discard the ABSTRACT line.
|
|
* 3. Read the UNIQUE KEY for the record.
|
|
* Do some record initialization steps here.
|
|
* 4. Read and discard the DATE line.
|
|
* 5. Let the parser read and parse rest of record, ie doc text...
|
|
*/
|
|
|
|
/*----- READ LINE #1, fzkey -----*/
|
|
if (fgets (inbuf, inbufsz, instream) == NULL)
|
|
break;
|
|
inbuf [inbufsz] = 0; /* just to be sure */
|
|
|
|
if (shutdown_now) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 164,
|
|
"\n%s: %s Abort due to signal %d. Database %s\n"
|
|
" possibly corrupted. Restore backup database.\n"),
|
|
aa_argv0, PROGNAME"1299", shutdown_now, dicname);
|
|
DtSearchExit (11);
|
|
}
|
|
|
|
/* Silently skip null records just like dtsrload */
|
|
if (strcmp (inbuf, parg.etxdelim) == 0)
|
|
continue;
|
|
|
|
record_count++;
|
|
|
|
/*----- READ LINE #2, abstract -----*/
|
|
if (fgets (inbuf, inbufsz, instream) == NULL) {
|
|
INVALID_FZK_FORMAT:
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1129,
|
|
"%s: %s Invalid .fzk file format.\n"),
|
|
fname_input, PROGNAME"1129");
|
|
DtSearchExit (22);
|
|
}
|
|
inbuf[inbufsz] = 0; /* just to be sure */
|
|
|
|
/*--- READ LINE #3, unique database key ---*/
|
|
if (fgets (inbuf, inbufsz, instream) == NULL)
|
|
goto INVALID_FZK_FORMAT;
|
|
inbuf[inbufsz] = 0; /* just to be sure */
|
|
|
|
if ((cptr = strtok (inbuf, " \t\n")) == NULL)
|
|
goto INVALID_FZK_FORMAT;
|
|
|
|
/* If necessary, discard long keys exactly like cravel */
|
|
if (strlen (cptr) >= DtSrMAX_DB_KEYSIZE) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 659,
|
|
"\n%s: %s Discarding record, key too long:\n '%s'.\n"),
|
|
aa_argv0, PROGNAME"659", cptr);
|
|
discard_to_ETX (&parg);
|
|
continue;
|
|
}
|
|
strcpy (db_key, cptr);
|
|
|
|
/* Skip duplicate record ids in same order as dtsrload */
|
|
i = is_duprec (db_key);
|
|
if (i == 2) { /* out of memory */
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 374, msg_374),
|
|
PROGNAME"1317");
|
|
DtSearchExit (57);
|
|
}
|
|
else if (i == 1) { /* duplicate record id */
|
|
duplicate_recids++;
|
|
if (dotcount > 0)
|
|
putchar ('\n');
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1402,
|
|
"%s: Discarded duplicate rec #%lu '%s'.\n"),
|
|
aa_argv0, record_count, db_key);
|
|
discard_to_ETX (&parg);
|
|
continue;
|
|
}
|
|
|
|
/****** FFFFFFFFFFFFFFFFFFFFF **********/
|
|
/* Convert database address (slot #) to 'record number',
|
|
* what dba would have been if all records took up
|
|
* only one slot and there were no dbrec at top of file.
|
|
* Record numbers on d99, like dba's, start at #1,
|
|
* but rec numbers here (in bit vector) start at #0.
|
|
*/
|
|
KEYFIND (PROGNAME "222", OR_OBJKEY, (char *) db_key, 0);
|
|
if (db_status != S_OKAY) {
|
|
normal_retncode = 1; /* = 'warning' */
|
|
if (dotcount > 0)
|
|
putchar ('\n');
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1168,
|
|
"%s: %s Discarded '%s', key not in database.\n"),
|
|
aa_argv0, PROGNAME"1168", displayable(db_key));
|
|
discard_to_ETX (&parg);
|
|
continue;
|
|
}
|
|
|
|
CRGET (PROGNAME "224", &temp_dba, 0); /* vista already byte swapped */
|
|
temp_dba &= 0x00FFFFFF; /* = slot# */
|
|
dba = (temp_dba + dba_offset) / or_recslots; /* = rec#, base 1 */
|
|
/*
|
|
* Don't change this 'dba'!--eventually it goes
|
|
* into d99 in this exact format! It will also
|
|
* be used as an index into dbas_word_count[] in
|
|
* load_into_bintree() so do a sanity check
|
|
* to make sure that it hasn't exceeded the size
|
|
* of that array. (The count increments have been
|
|
* reported as as 'uninitialized memory reads'
|
|
* by a debugger). This might happen for example
|
|
* if user failed to run dtsrload before dtsrindex?
|
|
*/
|
|
if (dba < 1 || dba > or_maxdba) {
|
|
printf ( catgets(dtsearch_catd, MS_cborodin, 21,
|
|
"\n%s '%s' record overflows word counter array.\n"
|
|
"Record number %ld > maxdba %ld, dba=%ld, "
|
|
"recslots=%ld, offs=%d.\n") ,
|
|
PROGNAME"1526", displayable(db_key),
|
|
(long)dba, (long)or_maxdba, (long)temp_dba,
|
|
(long)or_recslots, (int)dba_offset);
|
|
DtSearchExit (68);
|
|
}
|
|
temp_dba = dba - 1; /* = rec# starting at 0 */
|
|
cur_byte = temp_dba >> 3; /* bits to bytes: div by 8 */
|
|
if (cur_byte >= bit_vector_size) {
|
|
printf ( catgets(dtsearch_catd, MS_cborodin, 22,
|
|
"\n%s '%s' record in database (dba=%ld)\n"
|
|
" overflows bitvector allocation (%ld >= %ld).\n") ,
|
|
PROGNAME"1475", displayable(db_key), (long)dba,
|
|
(long)cur_byte, (long)bit_vector_size);
|
|
DtSearchExit (69);
|
|
}
|
|
dbas_bits_batch[cur_byte] |= 1 << (temp_dba % 8);
|
|
|
|
/* Print occasional progress dots and msgs */
|
|
if (!(record_count % recs_per_dot)) {
|
|
putchar ('.');
|
|
dotcount++;
|
|
if (!(dotcount % 10))
|
|
putchar (' ');
|
|
if (dotcount >= 50) {
|
|
dotcount = 0;
|
|
bytes_in = ftell (instream);
|
|
seconds_left = (unsigned long)
|
|
(((float) fstat_input.st_size /
|
|
(float) bytes_in - 1.) *
|
|
(float) (time (NULL) - timestart));
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1190,
|
|
"\n%s: Rec #%lu, %.0f%% done. "
|
|
"Est %lum %02lus to end Pass 1.\n"),
|
|
aa_argv0,
|
|
record_count,
|
|
(float) bytes_in / (float) fstat_input.st_size * 100.0,
|
|
seconds_left / 60UL,
|
|
seconds_left % 60UL);
|
|
}
|
|
fflush (stdout);
|
|
}
|
|
|
|
/*----- READ LINE #4, date -----*/
|
|
if (fgets (inbuf, inbufsz, instream) == NULL)
|
|
goto INVALID_FZK_FORMAT;
|
|
inbuf[inbufsz] = 0; /* just to be sure */
|
|
|
|
/* PARSE LOOP FOR CURRENT TEXT BLOCK.
|
|
* We must be in the middle of a record ('lines' #5 and beyond).
|
|
* From here to ETX, which is either the record delimiter string
|
|
* or the end of file, read the file a 'word' at a time
|
|
* using the parse() function for the language specified
|
|
* for the database.
|
|
* Load_into_bintree() stores each token into
|
|
* inverted index binary tree.
|
|
* Note: dba here MUST still be rec#, base 1.
|
|
* It's stored as is by load_into_bintree(),
|
|
* and will be moved as is into d99 file in Pass 2.
|
|
*/
|
|
if (debugging & DEBUG_P)
|
|
printf ("\nRecord #%lu '%s'\n"
|
|
"Offset Word---- Stem----\n",
|
|
record_count, db_key);
|
|
for ( cptr = dblk.parser (&parg);
|
|
cptr;
|
|
cptr = dblk.parser (NULL)) {
|
|
|
|
if (debugging & DEBUG_P) {
|
|
printf ("%6ld %s %n", word_offset, cptr, &i);
|
|
if (!(debugging & DEBUG_I))
|
|
while (i++ < 30)
|
|
putchar (' ');
|
|
}
|
|
load_into_bintree (cptr, FALSE, dba);
|
|
cptr = dblk.stemmer (cptr, &dblk);
|
|
if (debugging & DEBUG_P) {
|
|
printf ("%s\n", cptr);
|
|
fflush (stdout);
|
|
}
|
|
load_into_bintree (cptr, TRUE, dba);
|
|
}
|
|
|
|
} /* end of PASS 1 Main read loop */
|
|
|
|
elapsed = time(NULL) - timestart;
|
|
if (dotcount > 0) {
|
|
putchar ('\n');
|
|
dotcount = 0;
|
|
}
|
|
if (duplicate_recids > 0L) {
|
|
normal_retncode = 1; /* 'warning' */
|
|
sprintf (buf, catgets (dtsearch_catd, MS_cborodin, 40,
|
|
"Ignored %ld duplicate records"),
|
|
duplicate_recids);
|
|
}
|
|
else
|
|
strcpy (buf, catgets (dtsearch_catd, MS_cborodin, 41,
|
|
"No duplicate records found"));
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1225,
|
|
"%s: Pass 1 completed in %lum %lus, read %lu records.\n"
|
|
" %s, parsed %lu words.\n"),
|
|
aa_argv0, elapsed / 60L, elapsed % 60L, record_count,
|
|
buf, num_of_diff_words);
|
|
if (record_count > batch_size) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 33,
|
|
"\n%s Number of incoming records exceeded %d.\n"
|
|
" This will usually result in 'Out of Paging Space' "
|
|
"error in Pass 2\n"
|
|
" and corruption of database. Either split the incoming file to\n"
|
|
" reduce record count or use the -b option, and rerun.\n"),
|
|
PROGNAME"33", (int)batch_size);
|
|
DtSearchExit (33);
|
|
}
|
|
|
|
/*----------------- PASS 2: -----------------
|
|
* Traverse completed binary tree and write it to d99 file.
|
|
*/
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1233,
|
|
"%s: Beginning Pass 2: batch index traversal and database update.\n"
|
|
" Each dot = %d words.\n"),
|
|
aa_argv0, words_per_dot);
|
|
dotcount = 0;
|
|
time (×tart);
|
|
traverse_tree (); /* actual Pass 2 */
|
|
if (dotcount) {
|
|
putchar ('\n');
|
|
dotcount = 0;
|
|
}
|
|
|
|
/* Write header information to the d99 file */
|
|
if (!fwrite_d99_header (&fl_hdr, dtbs_addr_fp)) {
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 776, msg_776),
|
|
PROGNAME"1723", strerror(errno));
|
|
DtSearchExit (13);
|
|
}
|
|
d_close ();
|
|
fclose (dtbs_addr_fp);
|
|
|
|
elapsed = time (NULL) - timestart;
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 1246,
|
|
"%s: Pass 2 completed in %lum %lus, updated %lu words.\n"),
|
|
aa_argv0, elapsed / 60L, elapsed % 60L, count_word_ii);
|
|
if (normal_retncode == 1)
|
|
printf (catgets (dtsearch_catd, MS_cborodin, 2,
|
|
"%s: Warnings were detected.\n"), aa_argv0);
|
|
DtSearchExit (normal_retncode);
|
|
|
|
} /* main() */
|
|
|
|
/*************************** DTSRINDEX.C ****************************/
|
|
|