cdesktopenv/cde/programs/dtsr/dtsrload.c

1292 lines
41 KiB
C

/*
* CDE - Common Desktop Environment
*
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
*
* These libraries and programs are free software; you can
* redistribute them and/or modify them under the terms of the GNU
* Lesser General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* These libraries and programs are distributed in the hope that
* they will be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with these libraries and programs; if not, write
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
/*
* COMPONENT_NAME: austext
*
* FUNCTIONS: UPDATE_MAXDBA
* count_all_records
* create_object
* load_next_miscrec
* main
* print_exit_code
* print_progress
* read_dbrec
* segregate_dicname
* update_object
* user_args_processor
* write_dbrec
*
* ORIGINS: 27
*
*
* (C) COPYRIGHT International Business Machines Corp. 1993,1995
* All Rights Reserved
* Licensed Materials - Property of IBM
* US Government Users Restricted Rights - Use, duplication or
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
*/
/*********************** DTSRLOAD.C ***************************
* $XConsortium: dtsrload.c /main/8 1996/09/23 21:04:17 cde-ibm $
* October 1993.
* Formerly dtsrload.c was cravel.c.
* Input: Standard AusText .fzk file.
* Function: Adds to or updates corresponding DtSearch-
* AusText database records.
*
* $Log$
* Revision 2.7 1996/03/25 18:54:44 miker
* Changed FILENAME_MAX to _POSIX_PATH_MAX.
*
* Revision 2.6 1996/03/13 22:53:47 miker
* Changed char to UCHAR several places.
*
* Revision 2.5 1996/02/01 18:46:02 miker
* AusText 2.1.11, DtSearch 0.3. Changed document text reads from fgets
* to new single character reading functions to match dtsrindex.
* Added -t etx delimiter string command line arg.
*
* Revision 2.4 1995/12/01 16:18:22 miker
* Added fflush for stdout and stderr for clean printing to AusBuild log.
*
* Revision 2.3 1995/10/26 17:48:45 miker
* Fixed duplicate msgs catopen().
*
* Revision 2.2 1995/10/25 18:39:52 miker
* Added prolog.
*
* Revision 2.1 1995/09/22 19:31:48 miker
* Freeze DtSearch 0.1, AusText 2.1.8
*
* Revision 1.3 1995/09/20 22:52:47 miker
* Fixed bug: DtSrFlNOTAVAIL was being set in wrong obj field.
*
* Revision 1.2 1995/09/19 21:59:53 miker
* Set DtSrFlNOTAVAIL when appropriate for doc.
* If DtSearch, use DtSrVERSION instead of AUSAPI_VERSION in banner.
*
* Revision 1.1 1995/08/31 20:52:34 miker
* Initial revision
*
* Revision 1.12 1995/06/08 19:42:44 miker
* 2.1.5f: Removed -w option. It no longer had an effect.
*/
#include "SearchP.h"
#include <limits.h>
#include <errno.h>
#include <string.h>
#include <signal.h>
#include <ctype.h>
#include <sys/stat.h>
#include <locale.h>
#include <unistd.h>
#include <stdlib.h>
#include "vista.h"
#include <sys/types.h>
#include <netinet/in.h>
void init_user_interrupt(void); // lib/DtSearch/userint.c
#define PROGNAME "DTSRLOAD"
#define RECS_PER_DOT 20
#define TERMINATE_LINE if (dotcount>0) { putchar('\n'); }
#define EXIT_NORMAL 0 /* perfect return code */
#define EXIT_WARNING 1 /* functioned ok, but with warnings */
#define EXIT_VANISH 3 /* input file effectively empty */
#define MS_misc 1
#define MS_cravel 11
/*--------------- EXTERNS ------------------*/
extern volatile int
shutdown_now;
extern void gen_vec (char *fname_huffcode_tab);
extern long gen_vec_hufid;
/*--------------- GLOBALS ------------------*/
static char *abstrbuf = NULL;
static int blobs_are_used; /* boolean */
static long created_reccount = 0L;
static long dbrec_hufid = 1L;
unsigned long default_hashsize;
int debug_mode = FALSE;
int debug_encode = FALSE;
static char dicname[10]; /* 1 - 8 char database name */
char dicpath[_POSIX_PATH_MAX];
static int dotcount = 0;
static long duplicate_recids = 0L;
char fname_huffcode_tab[_POSIX_PATH_MAX];
char fname_input[_POSIX_PATH_MAX];
struct stat fstat_input;
static FILE *infile = NULL;
static long input_reccount = 0L;
static DtSrINT32
maxdba = 0;
static int need_final_progress_msg = TRUE;
static int normal_exitcode = EXIT_NORMAL;
static DtSrINT32
objsize = 0;
static DtSrObjdate
objdate = 0;
static DB_ADDR objdba = NULL_DBA;
static PARG parg;
static int recs_per_dot = RECS_PER_DOT;
static time_t starttime = 0L;
static DtSrObjdate
starttimeobjd = 0;
char sprintbuf[1024 + _POSIX_PATH_MAX];
static int sumblobs = 0;
static int sumlines = 0;
static DtSrINT32
system_reccount = 0;
static long updated_reccount = 0L;
struct or_dbrec dbrec;
struct or_objrec objrec;
struct or_miscrec miscrec;
struct or_blobrec blobrec;
/********************************************************/
/* */
/* UPDATE_MAXDBA */
/* */
/********************************************************/
/* Ensures global var 'maxdba' always contains highest D00 slot number */
#define UPDATE_MAXDBA(dba) {if((dba&0xffffff)>maxdba)maxdba=dba&0xffffff;}
/********************************************************/
/* */
/* segregate_dicname */
/* */
/********************************************************/
/* Separates dictionary name from pathname and loads
* them into the globals 'dicname' and 'dicpath'.
* Returns TRUE if dicname is valid, else returns FALSE.
*/
static int segregate_dicname (char *string)
{
char *ptr;
int i;
strncpy (dicpath, string, sizeof (dicpath));
dicpath[sizeof (dicpath) - 1] = 0;
/* Set 'ptr' to just the 8 char dictionary name by moving
* it backwards until first non-alphanumeric character
* (such as a ":" in the dos drive id or a slash between directories),
* or to the beginning of string.
*/
for (ptr = dicpath + strlen (dicpath) - 1; ptr >= dicpath; ptr--)
if (!isalnum (*ptr)) {
ptr++;
break;
}
if (ptr < dicpath)
ptr = dicpath;
/* test for valid dictionary name */
i = strlen (ptr);
if (i < 1 || i > 8)
return FALSE;
strcpy (dicname, ptr);
*ptr = 0; /* truncate dicname off of full path/dicname */
return TRUE;
} /* segregate_dicname() */
/********************************************************/
/* */
/* user_args_processor */
/* */
/********************************************************/
/* handles command line arguments for 'main' */
static void user_args_processor (int argc, char **argv)
{
char *argptr;
char *src, *targ;
if (argc <= 1) {
PRINT_USAGE:
printf (CATGETS(dtsearch_catd, MS_cravel, 1,
"\nUSAGE: %s -d<dbname> [options] infile\n"
" Listed default file name extensions can be overridden.\n"
" -d<dbname> 1 - 8 char database name, incl optional path prefix.\n"
" File name extensions automatically appended.\n"
" -t<etxstr> End of text doc delimiter string. Default '\\f\\n'.\n"
" -c Initialize database record count by counting records.\n"
" -p<N> Print a progress dot every <N> records (default %d).\n"
" -h<N> Change duplicate rec id hash table size from %ld to <N>.\n"
" -h0 means there are no duplicates, don't check for them.\n"
" -e<path> Path-filename of huffman encode table (default %s).\n"
" <infile> Input [path]file name. Default extension %s.\n"
),
aa_argv0,
RECS_PER_DOT, default_hashsize,
FNAME_HUFFCODE_TAB, EXT_FZKEY);
DtSearchExit (2);
}
/* Each pass grabs new parm of "-xxx" format */
for (argc--, argv++; argc > 0 && ((*argv)[0] == '-' || (*argv)[0] == '+');
argc--, argv++) {
argptr = argv[0];
if (strncmp (argptr, "-russell", 8) == 0) {
debug_mode = TRUE;
if (argptr[8] == '2')
debug_encode = TRUE;
continue;
}
argptr[1] = tolower (argptr[1]);
switch (argptr[1]) {
case 'd': /* (D)ictionary */
/* May include both dicname and dicpath */
if (!segregate_dicname (argptr + 2)) {
printf (CATGETS(dtsearch_catd, MS_cravel, 246,
"\n%s '%s' is invalid path/dictionary name.\n"),
PROGNAME, argptr);
goto PRINT_USAGE;
}
break;
case 't': /* ETX delimiter string */
/* Replace any "\n" string with real linefeed */
targ = parg.etxdelim = malloc (strlen (argptr + 2) + 4);
src = argptr + 2;
while (*src) {
if (src[0] == '\\' && src[1] == 'n') {
*targ++ = '\n';
src += 2;
}
else
*targ++ = *src++;
}
*targ = 0;
break;
case 'p':
if ((recs_per_dot = atoi (argptr + 2)) <= 0) {
recs_per_dot = RECS_PER_DOT;
printf (CATGETS(dtsearch_catd, MS_cravel, 582,
"%sIgnored invalid progress dot argument '%s'.\n"),
PROGNAME "582 ", argptr);
}
break;
case 'e':
append_ext (fname_huffcode_tab, sizeof (fname_huffcode_tab),
argptr + 2, EXT_HUFFCODE);
break;
case 'h':
duprec_hashsize = atol (argptr + 2);
if (duprec_hashsize == 0UL)
printf (CATGETS(dtsearch_catd, MS_cravel, 13,
"%s Duplicate record id checking disabled.\n"),
PROGNAME);
break;
case 'c': /* force correct initial reccount by counting
* records */
system_reccount = -1;
break;
default:
UNKNOWN_ARG:
printf (CATGETS(dtsearch_catd, MS_cravel, 14,
"\n%s Unknown command line argument '%s'.\n"),
PROGNAME, argptr);
} /* endswitch */
} /* endwhile for cmd line '-'processing */
/* validate input file name */
if (argc <= 0) {
puts (CATGETS(dtsearch_catd, MS_cravel, 15,
"\nMissing required input file name.\a"));
goto PRINT_USAGE;
}
else
append_ext (fname_input, sizeof (fname_input), argv[0], EXT_FZKEY);
/* check for missing database name */
if (dicname[0] == 0) {
puts (CATGETS(dtsearch_catd, MS_cravel, 16,
"\nNo database name specified (-d argument).\a"));
goto PRINT_USAGE;
}
return;
} /* user_args_processor() */
/****************************************/
/* */
/* count_all_records */
/* */
/****************************************/
/* Initializes system_reccount and maxdba by
* actually counting all records in database.
* Must be called after dbrec has been read to ensure
* maxdba accounts for last miscrec slot number.
*/
static void count_all_records (void)
{
char keybuf[DtSrMAX_DB_KEYSIZE + 4];
printf (CATGETS(dtsearch_catd, MS_cravel, 17,
"%s Initializing total record count "
"in database by actually counting...\n"),
PROGNAME);
system_reccount = 0;
maxdba = 0;
KEYFRST (PROGNAME "286", OR_OBJKEY, 0);
while (db_status == S_OKAY) {
KEYREAD (PROGNAME "288", keybuf);
if (db_status != S_OKAY)
vista_abort (PROGNAME "288");
/* don't count records beginning with ctrl char */
if (keybuf[0] >= 32) {
system_reccount++;
CRGET (PROGNAME "251", &objdba, 0);
UPDATE_MAXDBA (objdba);
}
KEYNEXT (PROGNAME "291", OR_OBJKEY, 0);
}
/* account for last record's misc record slots */
maxdba += dbrec.or_recslots;
return;
} /* count_all_records() */
/****************************************/
/* */
/* read_dbrec */
/* */
/****************************************/
/* Read the database's dbrec and load global variables
* system_reccount and maxdba with current values from db.
*/
static void read_dbrec (void)
{
RECFRST (PROGNAME "285", OR_DBREC, 0); /* seqtl retrieval */
if (db_status != S_OKAY) {
printf (CATGETS(dtsearch_catd, MS_misc, 13,
"%sNo DB record in database '%s'.\n"
" The usual cause is failure to initialize "
"the database (run initausd).\n"),
PROGNAME"296 ", dicname);
DtSearchExit (8);
}
RECREAD (PROGNAME "302", &dbrec, 0);
if (db_status != S_OKAY)
vista_abort (PROGNAME "303");
swab_dbrec (&dbrec, NTOH);
if (debug_mode) {
printf (PROGNAME
" DBREC: reccount=%ld maxdba=%ld vers='%s' dbacc=%d\n"
" fzkeysz=%d abstrsz=%d maxwordsz=%d otype=%d lang=%d\n"
" hufid=%ld flags=x%x compflags=x%x uflags=x%lx sec=x%lx\n"
,(long)dbrec.or_reccount
,(long)dbrec.or_maxdba
,dbrec.or_version
,(int)dbrec.or_dbaccess
,(int)dbrec.or_fzkeysz
,(int)dbrec.or_abstrsz
,(int)dbrec.or_maxwordsz
,(int)dbrec.or_dbotype
,(int)dbrec.or_language
,(long)dbrec.or_hufid
,(int)dbrec.or_dbflags
,(int)dbrec.or_compflags
,(long)dbrec.or_dbuflags
,(long)dbrec.or_dbsecmask
);
}
dbrec_hufid = dbrec.or_hufid;
/* Confirm compatible program-database version numbers */
if (!is_compatible_version (dbrec.or_version, SCHEMA_VERSION)) {
printf (CATGETS(dtsearch_catd, MS_misc, 14,
"%s Program schema version '%s' incompatible with "
"database '%s' version '%s'.\n") ,
PROGNAME"245", SCHEMA_VERSION, dicname, dbrec.or_version);
DtSearchExit(4);
}
/* If blobs are specified for the database,
* they must be compressed blobs.
*/
switch (dbrec.or_dbaccess) {
case ORA_VARIES: /* use of blobs determined obj by obj */
case ORA_BLOB: /* objects stored directly in blobs */
case ORA_REFBLOB: /* refs to objects stored in blobs */
blobs_are_used = TRUE;
if (!(dbrec.or_compflags & ORC_COMPBLOB)) {
/* = don't compress blobs */
printf (CATGETS(dtsearch_catd, MS_cravel, 717,
"%s Aborting: Uncompressed blobs not yet supported.\n"),
PROGNAME"717");
DtSearchExit (5);
}
break;
default:
blobs_are_used = FALSE;
break;
}
/* Initialize global variable maxdba, which records largest slot number.
* If requested, init tot reccount by actually counting records.
*/
if (system_reccount == -1)
count_all_records ();
else {
system_reccount = dbrec.or_reccount;
maxdba = dbrec.or_maxdba;
}
printf (CATGETS(dtsearch_catd, MS_cravel, 18,
"%s: '%s' schema ver = %s, rec count = %ld, last slot = %ld.\n"),
aa_argv0, dicname, dbrec.or_version,
(long)system_reccount, (long)maxdba);
return;
} /* read_dbrec() */
/****************************************/
/* */
/* write_dbrec */
/* */
/****************************************/
/* Write the database's updated reccount and maxdba fields */
static void write_dbrec (void)
{
int i;
DtSrINT32 int32;
RECFRST (PROGNAME "355", OR_DBREC, 0); /* seqtl retrieval */
if (db_status != S_OKAY)
vista_abort (PROGNAME "356");
int32 = htonl (system_reccount);
CRWRITE (PROGNAME "341", OR_RECCOUNT, &int32, 0);
int32 = htonl (maxdba);
CRWRITE (PROGNAME "342", OR_MAXDBA, &int32, 0);
/* If this was the first load of a new database,
* save the huffman encode table id.
*/
if (blobs_are_used && dbrec_hufid == -1) {
int32 = htonl ((DtSrINT32)gen_vec_hufid);
CRWRITE (PROGNAME "343", OR_HUFID, &int32, 0);
}
if (db_status != S_OKAY)
vista_abort (PROGNAME "344");
printf (CATGETS(dtsearch_catd, MS_cravel, 19,
"%s: Final database record count = %ld, last slot = %ld.\n"),
aa_argv0, (long)system_reccount, (long)maxdba);
return;
} /* write_dbrec() */
/************************************************/
/* */
/* print_progress */
/* */
/************************************************/
/* prints complete progress message and statistics to stdout */
static void print_progress (void)
{
time_t seconds = time (NULL) - starttime;
long bytes_in = ftell (infile);
if (bytes_in <= 0L)
bytes_in = fstat_input.st_size; /* make final msg "100%" */
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 20,
"%s: %ld input records processed in %ldm %lds, (%ld%%).\n"
" %ld duplicates, %ld new, %ld updates.\n"),
aa_argv0,
input_reccount, seconds / 60L, seconds % 60L,
(bytes_in * 100L) / fstat_input.st_size,
duplicate_recids, created_reccount, updated_reccount);
need_final_progress_msg = FALSE;
return;
} /* print_progress() */
/************************************************/
/* */
/* print_exit_code */
/* */
/************************************************/
/* Called from inside DtSearchExit() at austext_exit_last */
static void print_exit_code (int exit_code)
{
if (dotcount) {
putchar ('\n');
dotcount = 0;
}
printf ( CATGETS(dtsearch_catd, MS_cravel, 2,
"%s: Exit code = %d\n") ,
aa_argv0, exit_code);
fflush (aa_stderr);
fflush (stdout);
return;
} /* print_exit_code() */
/************************************************/
/* */
/* load_next_miscrec */
/* */
/************************************************/
/* Repeatedly called from create_object() or update_object()
* to fill miscrec buffer with next FZKABSTR type miscrec
* from input file data saved in fzkbuf and abstrbuf.
* First call for a given object is signaled by passed arg.
* Thereafter static pointers keep track of where we are
* in the source bufs to correctly load the next miscrec.
* Initial state = fill-with-fzkey, if there is a fzkey.
* Second state = fill-with-abstract, if there is an abstract.
* Last state = zero-fill balance of remaining misc records.
* Returns TRUE until last state completed (no more miscrecs can be written).
*/
static int load_next_miscrec (int first_call)
{
static enum {
FILL_FZKEY, FILL_ABSTR, FILL_ZEROS
}
fill_state = FILL_ZEROS;
static char *src = NULL;
static int srclen = 0;
static int totbytes = 0;
int i;
char *targ;
/* Initialize static variables at first call. */
if (first_call) {
/* If fzkey-abstract misc recs not used, return immediately. */
if ((totbytes = dbrec.or_fzkeysz + dbrec.or_abstrsz) <= 0)
return FALSE;
if (dbrec.or_fzkeysz > 0) {
fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_cravel, 522,
"%s This version of %s does not support semantic databases.\n"),
PROGNAME"522", aa_argv0);
DtSearchExit (13);
}
else {
fill_state = FILL_ABSTR;
src = abstrbuf;
srclen = dbrec.or_abstrsz;
}
}
/* If NOT first call, but we've finished writing everything out,
* then tell the caller there's nothing left to do.
*/
else if (totbytes <= 0)
return FALSE;
/* Main loop is on each byte of the or_misc field of miscrec.
* Depending on the fill state, the byte will be a fzkey byte,
* an abstract byte, or a binary zero byte.
*/
targ = (char *) miscrec.or_misc;
for (i = 0; i < sizeof(miscrec.or_misc); i++, totbytes--) {
switch (fill_state) {
case FILL_FZKEY:
*targ++ = *src++;
if (--srclen <= 0) { /* end of fzkey? */
if (dbrec.or_abstrsz > 0) {
fill_state = FILL_ABSTR;
src = abstrbuf;
srclen = dbrec.or_abstrsz;
}
else
fill_state = FILL_ZEROS;
}
break;
case FILL_ABSTR:
if (*src == 0 || --srclen <= 0) /* end of abstract? */
fill_state = FILL_ZEROS;
*targ++ = *src++;
break;
case FILL_ZEROS:
*targ++ = 0;
break;
default:
fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_misc, 25,
"%sAbort due to program error.\n"),
PROGNAME "549 ");
DtSearchExit (54);
} /* end switch */
} /* end for-loop */
miscrec.or_misctype = ORM_FZKABS;
return TRUE;
} /* load_next_miscrec() */
/************************************************/
/* */
/* create_object */
/* */
/************************************************/
/* Creates new object rec and misc recs from current vista rec.
* Sets global objdba to new rec's dba and updates maxdba if necessary.
* 1 create fields in objrec buffer, and write it.
* (or_objsize will be rewritten after text size has been determined.)
* 2 create fzkey-abstract rec as necessary.
*/
static void create_object (char *key)
{
int i;
char *src, *targ;
DB_ADDR tempdba;
memset (&objrec, 0, sizeof (objrec));
/* Copy the key into the buffer. The previous initialization
* ensures that the key will be padded on the right with zero fill.
* At this point, key length should never be too long because
* it has been previously tested (when the line was first read in).
*/
src = key;
targ = objrec.or_objkey;
for (i = 0; i < DtSrMAX_DB_KEYSIZE; i++) {
if (*src == 0)
break;
*targ++ = *src++;
}
/* Objdate will be updated later if line #4 has
* valid DtSrObjdate format. Otherwise current
* date/time stamp will be the default.
*/
objrec.or_objdate = starttimeobjd;
/* If all objects in database are same type, mark approp obj flag */
if (dbrec.or_dbotype != 0)
objrec.or_objtype = dbrec.or_dbotype;
/* If blobs are never used, mark each obj as 'unretrievable' */
if (!blobs_are_used)
objrec.or_objflags |= DtSrFlNOTAVAIL;
swab_objrec (&objrec, HTON);
FILLNEW (PROGNAME "487", OR_OBJREC, &objrec, 0);
if (db_status != S_OKAY)
vista_abort (PROGNAME "495");
CRGET (PROGNAME "375", &objdba, 0); /* save object's dba */
UPDATE_MAXDBA (objdba);
if (debug_mode)
printf ("---> new rec: inrecno %6ld, slot %6ld, key '%s'\n",
(long int) input_reccount, (long int) objdba & 0xffffff, objrec.or_objkey);
/* Make current object record the owner of all its sets */
SETOR (PROGNAME "376", OR_OBJ_BLOBS, 0);
SETOR (PROGNAME "377", OR_OBJ_MISCS, 0);
/* If fzkeys and/or abstracts are used,
* write out the misc record(s) now.
*/
if (load_next_miscrec (TRUE))
do {
HTONS (miscrec.or_misctype);
FILLNEW (PROGNAME "501", OR_MISCREC, &miscrec, 0);
CRGET (PROGNAME "503", &tempdba, 0);
UPDATE_MAXDBA (tempdba);
CONNECT (PROGNAME "505", OR_OBJ_MISCS, 0);
} while (load_next_miscrec (FALSE));
system_reccount++; /* new obj rec, so incr tot num database recs */
created_reccount++;
return;
} /* create_object() */
/************************************************/
/* */
/* update_object */
/* */
/************************************************/
/* Reinitializes portions of preexisting object rec.
* (Presumes vista 'current record' is desired object rec.)
* Sets objdba to rec's dba and updates maxdba if necessary.
* System_reccount is not altered because this is not a new record.
* 1 reinit certain fields in objrec, and rewrite it.
* (or_objsize will be rewritten after text size has been determined.)
* 2 delete all blobs (there should be no hyper recs,
* and existing user notes should not be changed).
* 3 update fzkey-abstract rec(s) as necessary.
* Important: misc rec updates should always be IN-PLACE.
* If miscrecs are deleted first then readded,
* there is no guarantee that their slots will be adjacent.
* This will screw up bit vector calculations in the inverted
* index word searches. In-place updates are faster anyway,
* and we know that the number of misc rec slots is constant.
*/
static void update_object (char *key)
{
int i;
int first_fzkabstr = TRUE;
DtSrINT16 misctype;
DtSrINT32 int32;
DB_ADDR tempdba;
DtSrINT32 zero_objsize = 0;
/* Slot number is dba with high order byte (filenum) parsed out */
CRGET (PROGNAME "467", &objdba, 0); /* save object's dba */
UPDATE_MAXDBA (objdba);
if (debug_mode)
printf ("----> update: inrecno %6ld, slot %6ld, key '%s'\n",
(long int) input_reccount, (long int) objdba & 0xffffff, key);
/* Reinit certain fields.
* Objsize will be rewritten after new text size determined.
* Objdate will be rewritten if .fzk file has valid
* DtSrObjdate format in line #4.
*/
CRWRITE (PROGNAME "472", OR_OBJSIZE, &zero_objsize, 0);
int32 = htonl (starttimeobjd);
CRWRITE (PROGNAME "681", OR_OBJDATE, &int32, 0);
/* Make current object record the owner of all its sets */
SETOR (PROGNAME "475", OR_OBJ_BLOBS, 0);
SETOR (PROGNAME "476", OR_OBJ_MISCS, 0);
/* Delete all blobs in a loop */
FINDFM (PROGNAME "480", OR_OBJ_BLOBS, 0);
while (db_status == S_OKAY) {
DISDEL (PROGNAME "482", 0);
FINDFM (PROGNAME "483", OR_OBJ_BLOBS, 0);
}
/* Update all miscrecs in a loop.
* User notes are left alone,
* and fzkey-abstracts are updated.
* Currently other types are not allowed.
*/
first_fzkabstr = TRUE;
FINDFM (PROGNAME "480", OR_OBJ_MISCS, 0);
while (db_status == S_OKAY) {
CRREAD (PROGNAME "496", OR_MISCTYPE, &misctype, 0);
NTOHS (misctype);
switch (misctype) {
case ORM_OLDNOTES:
case ORM_HUGEKEY:
break; /* do nothing */
case ORM_FZKABS: /* combined fzkey-abstract rec */
if (load_next_miscrec (first_fzkabstr)) {
HTONS (miscrec.or_misctype);
RECWRITE (PROGNAME "601", &miscrec, 0);
CRGET (PROGNAME "605", &tempdba, 0);
UPDATE_MAXDBA (tempdba);
first_fzkabstr = FALSE;
}
else {
DISDEL (PROGNAME "709", 0);
}
break;
default:
DISDEL (PROGNAME "529", 0);
} /* end switch */
FINDNM (PROGNAME "506", OR_OBJ_MISCS, 0);
} /* end update loop for all members of OBJ_MISCS set */
updated_reccount++;
return;
} /* update_object() */
/************************************************/
/* */
/* call_encoder */
/* */
/************************************************/
/* Called from main while reading document text.
* Calls huffman compression encoder at convenient
* intervals and at ETX.
*/
static void call_encoder (UCHAR *ucharbuf, int buflen)
{
objsize += buflen;
if (debug_encode) {
sumlines += buflen;
printf ("buflen = %d, sumlines = %d, cum objsize = %ld\n",
(int)buflen, (int)sumlines, (long)objsize);
}
if (hc_encode (&blobrec, ucharbuf, buflen, FALSE)) {
if (debug_encode) {
sumblobs += blobrec.or_bloblen;
printf ("---> WRITE sumlines = %d, bloblen = %d, "
"sumblobs = %d, objsize = %ld\n",
sumlines, (int)blobrec.or_bloblen,
(int)sumblobs, (long)objsize);
sumlines = 0;
}
HTONS (blobrec.or_bloblen);
FILLNEW (PROGNAME "572", OR_BLOBREC, &blobrec, 0);
CONNECT (PROGNAME "578", OR_OBJ_BLOBS, 0);
}
return;
} /* call_encoder() */
/************************************************/
/* */
/* main */
/* */
/************************************************/
int main (int argc, char *argv[])
{
static int hufftab_never_loaded = TRUE;
DBLK dblk;
int i, linelen;
DtSrINT32 int32;
char *cptr, *targ, *src;
char *db_key;
char uniqkey [DtSrMAX_DB_KEYSIZE + 4];
char linebuf [2048];
struct tm *tmptr;
/* Init globals */
setlocale (LC_ALL, "");
dtsearch_catd = CATOPEN(FNAME_DTSRCAT, 0);
aa_argv0 = argv[0];
time (&starttime);
tmptr = localtime (&starttime);
starttimeobjd = tm2objdate (tmptr);
strftime (linebuf, sizeof (linebuf),
CATGETS(dtsearch_catd, MS_misc, 22, "%A, %b %d %Y, %I:%M %p"),
tmptr);
printf (CATGETS(dtsearch_catd, MS_misc, 23,
"%s: Version %s. Run %s.\n"),
aa_argv0,
DtSrVERSION,
linebuf);
austext_exit_last = print_exit_code;
init_user_interrupt (); /* specify signal handlers */
default_hashsize = duprec_hashsize; /* deflt val in isduprec.c */
strcpy (fname_huffcode_tab, FNAME_HUFFCODE_TAB);
dicname[0] = 0;
dicpath[0] = 0;
memset (&dblk, 0, sizeof(DBLK));
memset (&parg, 0, sizeof(PARG));
parg.dblk = &dblk;
parg.etxdelim = ETXDELIM;
/* Parse user's command line args and maybe change global variables */
user_args_processor (argc, argv);
strcpy (dblk.name, dicname);
/* Open the database */
if (debug_mode)
printf (PROGNAME "211 database OPEN string = '%s%s'\n",
dicpath, dicname);
if (!austext_dopen (dicname, dicpath, NULL, 0, NULL)) {
fprintf (aa_stderr, "%s\n", DtSearchGetMessages());
DtSearchExit (3);
}
src = getcwd (linebuf, sizeof (linebuf));
if (!src)
src = getenv ("PWD");
printf (CATGETS(dtsearch_catd, MS_misc, 24,
"%s: cwd = '%s', fzkfile = '%s'\n"),
aa_argv0,
(src) ? src : CATGETS(dtsearch_catd, MS_misc, 6, "<unknown>"),
fname_input);
if ((infile = fopen (fname_input, "r")) == NULL) {
fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_misc, 12,
"%sUnable to open %s:\n %s\n"),
PROGNAME "326 ", fname_input, strerror (errno));
DtSearchExit (6);
}
parg.ftext = infile; /* for discard_to_ETX() */
/* Read in starting database record count and other db config/status data */
read_dbrec ();
/* If fzkeys and/or abstracts are used,
* create correctly sized buffers for them.
*/
if (dbrec.or_fzkeysz > 0) {
fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_cravel, 522,
"%s This version of %s does not support semantic databases.\n"),
PROGNAME"523", aa_argv0);
DtSearchExit (13);
}
if (dbrec.or_abstrsz > 0)
abstrbuf = austext_malloc (dbrec.or_abstrsz + 16, PROGNAME "744", NULL);
/* Get input file size for progress msgs */
if (fstat (fileno (infile), &fstat_input) == -1) {
fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_cravel, 29,
"%s Unable to get status for %s: %s\n"),
PROGNAME"337", fname_input, strerror (errno));
DtSearchExit (10);
}
if (fstat_input.st_size <= 0L) {
fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_cravel, 30,
"%s File %s is empty.\n"),
PROGNAME"343", fname_input);
DtSearchExit (7);
}
printf (CATGETS(dtsearch_catd, MS_cravel, 31,
"%s: Each dot = %d records processed.\n"),
aa_argv0, recs_per_dot);
/*-------------------- MAIN LOOP --------------------
* Executed once for each new input record.
* 1. Read and process the FZKEY line.
* 2. Read and process the ABSTRACT line.
* 3. Read the UNIQUE KEY line.
* Write out an object record at this point.
* 4. Read and process the DATE line, update object rec.
* 5. Use readchar_ftext to read document text until ETX.
* Either blob it or discard it as appropriate.
*/
while (!feof(infile)) {
/*----- READ LINE #1, fzkey -------------------------
* First line of new record.
* Abort now if a shutdown signal was sent.
* Skip null records (ETX str followed immediately by ETX str).
* If this database uses fzkeys, "pack" current fzkey
* and save it in the correct miscrec buffer.
* If fzkeys are combined with abstracts they share the same
* miscrec, otherwise they they reside in their own miscrec.
* WARNING! Presumes or_fzkeysz <= the space allocated
* for it in the correct miscrec.
*-----------------------------------------------------*/
if (fgets (linebuf, sizeof(linebuf) - 1, infile) == NULL)
break;
/* Got at least one line of a new record. Print progress dots */
if (!(input_reccount % recs_per_dot)) {
if (input_reccount) {
putchar ('.');
dotcount++;
if (!(dotcount % 10))
putchar (' ');
if (dotcount >= 50) {
print_progress ();
dotcount = 0;
}
else
fflush (stdout);
}
}
input_reccount++;
need_final_progress_msg = TRUE;
linebuf [sizeof(linebuf)-1] = 0;
linelen = strlen (linebuf);
objsize = 0;
if (shutdown_now) {
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_misc, 15,
"%sReceived abort signal %d.\n"),
PROGNAME"373 ", shutdown_now);
write_dbrec (); /* at least update reccount and maxdba */
DtSearchExit (100 + shutdown_now);
}
/* Skip null record */
if (strcmp (linebuf, parg.etxdelim) == 0)
continue;
/*----- READ LINE #2, abstract ------------------------
* Second line is abstract line. Save it in record buffer,
* hopping over the first 10 chars ("ABSTRACT: ....").
*-----------------------------------------------------*/
if (fgets (linebuf, sizeof (linebuf) - 1, infile) == NULL)
break;
linebuf [sizeof(linebuf)-1] = 0;
linelen = strlen (linebuf);
if (strncmp (linebuf, "ABSTRACT: ", 10) != 0) {
cptr = PROGNAME"580";
INVALID_FORMAT:
normal_exitcode = EXIT_WARNING;
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 579,
"%s Discarded rec #%ld: Invalid .fzk file format.\n"),
cptr, input_reccount);
if (strcmp (linebuf, parg.etxdelim) != 0)
discard_to_ETX (&parg);
continue;
}
/* If abstracts are used, save this one in the abstract buffer */
if (dbrec.or_abstrsz > 0) {
linebuf[--linelen] = 0; /* delete terminating \n */
strncpy (abstrbuf, linebuf + 10, dbrec.or_abstrsz);
abstrbuf[dbrec.or_abstrsz - 1] = 0;
}
/*--- READ LINE #3, unique database key ------------------
* Third line is 'unique record id'.
* If key is valid update old objrec
* or create new one as necessary.
* (There may be one more write required
* after we determine total blob size).
*-----------------------------------------------------*/
if (fgets (linebuf, sizeof (linebuf) - 1, infile) == NULL)
break;
linebuf [sizeof(linebuf)-1] = 0;
linelen = strlen (linebuf);
if (strcmp (linebuf, parg.etxdelim) == 0) {
cptr = PROGNAME"1068";
goto INVALID_FORMAT;
}
/*
* Isolate first token surrounded by whitespace
* (and parse out \n)
*/
if ((db_key = strtok (linebuf, " \t\n")) == NULL) {
cptr = PROGNAME"1076";
goto INVALID_FORMAT;
}
if (strlen (db_key) > DtSrMAX_DB_KEYSIZE - 1) {
normal_exitcode = EXIT_WARNING;
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 33,
"%s Discarded rec #%ld: Key too long:\n '%s'.\n"),
PROGNAME"606", input_reccount, db_key);
discard_to_ETX (&parg);
continue;
}
if (!isalnum (db_key[0])) {
normal_exitcode = EXIT_WARNING;
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 927,
"%s Discarded rec #%ld: First char (keytype) of key\n"
" '%s' is not alphanumeric.\n"),
PROGNAME"927", input_reccount, db_key);
discard_to_ETX (&parg);
continue;
}
/* If duplicate record in fzk file, discard it. */
i = is_duprec (db_key);
if (i == 2) {
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 34,
"%s Out of Memory! "
"Set -h arg to a smaller number,\n"
" or reduce the number of input records.\n"),
PROGNAME"1096");
DtSearchExit (55);
}
else if (i == 1) { /* skip duplicate record id */
normal_exitcode = EXIT_WARNING;
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 35,
"%s: Discarded duplicate rec #%ld '%s'.\n"),
aa_argv0, input_reccount, db_key);
duplicate_recids++;
discard_to_ETX (&parg);
continue;
}
/*
* Try to read the object record from the database. If it
* already exists (UPDATE): delete all its blobs (there
* should be no hyper recs). create or update
* fzkey-abstract recs as necessary. dont change any
* existing user notes. update fields in objrec buffer,
* but don't write it yet-- objrec will be rewritten
* after text size has been determined. If it doesn't
* exist (CREATE): create fields in objrec buffer, and
* write it. create fzkey-abstract recs as necessary.
* objrec will be rewritten after text size has been
* determined. After update or create, objdba contains
* dba of curr obj record.
*/
strcpy (uniqkey, db_key);
KEYFIND (PROGNAME "489", OR_OBJKEY, uniqkey, 0);
if (db_status == S_OKAY)
update_object (uniqkey);
else
create_object (uniqkey);
/*----- READ LINE #4, date -----------------------------
* Line #4 is object date/time string (OBJDATESTR format).
* It is no longer optional. If invalid, the current
* run date that was preloaded into the record is used.
*-----------------------------------------------------*/
if (fgets (linebuf, sizeof (linebuf) - 1, infile) == NULL)
break;
linebuf [sizeof(linebuf)-1] = 0;
linelen = strlen (linebuf);
if (!is_objdatestr (linebuf, &objdate)) {
normal_exitcode = EXIT_WARNING;
if (strcmp (linebuf, parg.etxdelim) == 0) {
cptr = PROGNAME"1155";
goto INVALID_FORMAT;
}
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 1086,
"%s Record '%s' has invalid date format--"
"using run date.\n"),
PROGNAME"1086", uniqkey);
}
else { /* objdate is valid */
KEYFIND (PROGNAME "1098", OR_OBJKEY, uniqkey, 0);
if (db_status != S_OKAY)
vista_abort (PROGNAME "1101");
HTONL (objdate); /* ready for record writes */
CRWRITE (PROGNAME "1102", OR_OBJDATE, &objdate, 0);
}
/*----- READ TO ETX, record text ---------------------
* Balance of record (after line 4 to end of record marker)
* is text. It may or may not be formatted in neat ascii
* lines, ie it may not have periodic linefeeds (\n).
* If this database does not store compressed records (blobs)
* we just discard all chars to end of text delimiter (ETX).
* Otherwise we read it char by char using readchar_ftext()
* and fill linebuf to some convenient size.
*
* Repeated calls to hc_encode() build
* a compressed record in its own internal blobrec buffer.
* When the buffer is full, hc_encode copies it to
* the passed blobrec buffer and returns TRUE.
* The caller should then write out the blobrec.
* If hc_encode returns FALSE, its internal blobrec is not
* yet full so the caller should not yet write out his record.
*-----------------------------------------------------*/
if (!blobs_are_used) {
discard_to_ETX (&parg);
continue;
}
/*
* Initialize blob compression by reading in huffman
* encode table (first execution only). Ensure table id
* is same as one used for previous compressions, if any.
*/
if (hufftab_never_loaded) {
hufftab_never_loaded = FALSE;
gen_vec (fname_huffcode_tab);
if (dbrec_hufid != gen_vec_hufid && dbrec_hufid != -1L) {
TERMINATE_LINE
printf (CATGETS(dtsearch_catd, MS_cravel, 1153,
"%s Current data compression table id"
" in '%s' is %ld.\n"
" Database '%s' previously compressed"
" with table %ld.\n"),
PROGNAME"1153 ", fname_huffcode_tab,
gen_vec_hufid, dicname, dbrec_hufid);
DtSearchExit (53);
}
}
/*
* Compress document text. Repeatedly load linebuf
* with fixed number of chars and compress it.
*/
if (debug_encode) {
sumlines = 0;
sumblobs = 0;
}
if ((linebuf[0] = readchar_ftext (&parg)) == 0) {
normal_exitcode = EXIT_WARNING;
TERMINATE_LINE
printf ( CATGETS(dtsearch_catd, MS_cravel, 1215,
"%s Warning. Record '%s' has no text.\n"),
PROGNAME"1215" , uniqkey);
continue;
}
linelen = 1;
while (linebuf [linelen] = readchar_ftext (NULL)) {
if (++linelen >= 80) {
call_encoder ((UCHAR *)linebuf, linelen);
linelen = 0;
}
}
/*
* At ETX: If a partial line remains, process it just like
* the full lines above. Then write out total size to
* object record, and make the final call to hc_encode with
* empty line and TRUE flag to indicate 'no more text,
* flush your last partial buffer'.
*/
if (linelen)
call_encoder ((UCHAR *)linebuf, linelen);
CRSET (PROGNAME "685", &objdba, 0);
int32 = htonl (objsize);
CRWRITE (PROGNAME "686", OR_OBJSIZE, &int32, 0);
if (hc_encode (&blobrec, (UCHAR *)"", 0, TRUE)) {
if (debug_encode) {
sumblobs += blobrec.or_bloblen;
printf ("---> FINAL sumlines =%d, bloblen = %d, "
"sumblobs = %ld, objsize = %ld\n",
(int)sumlines, (int)blobrec.or_bloblen,
(long)sumblobs, (long)objsize);
}
HTONS (blobrec.or_bloblen);
FILLNEW (PROGNAME "624", OR_BLOBREC, &blobrec, 0);
CONNECT (PROGNAME "625", OR_OBJ_BLOBS, 0);
}
} /* end main record loop */
if (need_final_progress_msg)
print_progress ();
fclose (infile);
write_dbrec ();
/* If all input records were discarded, complete processing
* but upgrade warning exit code to hard error code.
*/
if (created_reccount <= 0L && updated_reccount <= 0L) {
normal_exitcode = EXIT_VANISH;
fprintf (stderr, CATGETS(dtsearch_catd, MS_cravel, 1048,
"%sDatabase objects not changed because input "
"file effectively empty.\n"),
PROGNAME "1048 ");
}
/* Close database and print return code via exits.
* Return code is either 0 (perfect), 1 (warnings),
* or 3 (input file effectively empty).
*/
DtSearchExit (normal_exitcode);
} /* main() */
/*********************** DTSRLOAD.C ***************************/