cdesktopenv/cde/lib/DtSearch/vestatis.c

1507 lines
43 KiB
C

/*
* CDE - Common Desktop Environment
*
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
*
* These libraries and programs are free software; you can
* redistribute them and/or modify them under the terms of the GNU
* Lesser General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* These libraries and programs are distributed in the hope that
* they will be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with these libraries and programs; if not, write
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
/*
* COMPONENT_NAME: austext
*
* FUNCTIONS: CNCRD_MEMORY_AREA_LIST
* QUERY_STEM_STR
* STAT_STR
* TREENODE
* build_bin_tree
* comp_stat
* descend_tree
* efim_qsort
* fill_stem
* get_next_memory_block
* init_global_memory
* init_memory
* inv_index_bin_tree
* load_ditto_str
* release_shm_mem
* stat_search
* traverse_tree
* ve_statistical
*
* ORIGINS: 27
*
* (C) COPYRIGHT International Business Machines Corp. 1993,1995
* All Rights Reserved
* US Government Users Restricted Rights - Use, duplication or
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
*/
/*************************** VESTATIS.C ****************************
* $XConsortium: vestatis.c /main/9 1996/11/25 18:49:04 drk $
* 1993.
* Statistically sorted stems search.
*
* $Log$
* Revision 2.3 1996/02/01 19:35:55 miker
* AusText 2.1.11, DtSearch 0.3: Uses new single word parser/stemmers.
*
* Revision 2.2 1995/10/25 15:00:05 miker
* Added prolog.
*
* Revision 2.1 1995/09/22 22:30:42 miker
* Freeze DtSearch 0.1, AusText 2.1.8
* Revision 1.11 1995/09/07 23:30:15 miker
* ...One last try (sigh).
* Revision 1.10 1995/09/07 19:08:01 miker
* Last fix incorrectly coded.
* Revision 1.9 1995/09/07 16:25:11 miker
* Fixed solaris bus fault caused by TREENODE structure
* not being aligned on machines word boundary. Fault occurred
* only when query contained more than one word.
* Revision 1.8 1995/09/05 19:31:37 miker
* Made usrblk and ausapi_msglist global. Replaced Socrates()
* with calls to parser() and stemmer(). Deleted socblk.
* Numerous name changes. All for DtSearch...
*/
#ifndef _ALL_SOURCE
# define _ALL_SOURCE /* to pickup typedefs for shm vnodes */
#endif
#include "SearchE.h"
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include "vista.h"
/*-------------------------- GLOBALS ----------------------------*/
/**** declaration for the global memory pointers ****/
#define PROGNAME "VESTATIS"
#define MEMORY_SIZE 64000 /* 65536 is 64 KBytes of memory */
#define REC_TYPES 256
#define NORM_VALUE 30
#undef INFINITY /* XXX does GCC's __builtin_inff() work here? */
#define INFINITY 9999.0
#define SORT_MESG 10000
#define CHAR_BITS 8
#define STACKSZ 256
#define MED_3_VALUE 7
#define TIME_ITERATION 1
#define LOG2 0.693147181
#define MS_vestatis 17
#define STRUCT_ALIGN sizeof(char*)
static int SHM_FLAG = IPC_CREAT | S_IRUSR | S_IWUSR | S_IWGRP |
S_IRGRP | S_IROTH | S_IWOTH;
static char *mem_start;
static char *cur_pos;
static long mem_offset;
static long total_memory_size;
typedef struct q_s {
char stem[DtSrMAXWIDTH_HWORD];
int count;
} QUERY_STEM_STR;
typedef struct mem_area {
char *start_of_mem_block;
long block_size;
struct mem_area *next_block;
} CNCRD_MEMORY_AREA_LIST;
typedef struct bintree {
struct bintree *rlink; /* ptr to next node in linked list or
* right link in binary tree */
struct bintree *llink; /* left link in binary tree */
char *word; /* ptr to word in the query */
int count;
} TREENODE;
typedef struct s_a {
DB_ADDR dba;
float wght;
DtSrINT32 num_word_hits;
} STAT_STR;
static STAT_STR *stat_array = NULL;
static TREENODE *root_node;
static TREENODE *top_of_stack;
static TREENODE *stack;
static TREENODE *pres;
static TREENODE *prev;
static TREENODE *next;
static TREENODE *avail_node;
static CNCRD_MEMORY_AREA_LIST *memory_blocks = NULL;
static CNCRD_MEMORY_AREA_LIST *cur_mem_ptr;
static QUERY_STEM_STR *query_stems = NULL;
static DB_ADDR *word_addrs = NULL;
static int num_diff_words = 0;
static char begin_search;
static char begin_sort;
static char begin_load_ditto;
static char begin_qsort;
static char qsort_done;
static DtSrINT32 real_num_rec;
static DtSrINT32 num_hits;
static DtSrINT32 total_num_addrs;
static DtSrINT32 dba_offset;
static unsigned char rec_type_tab[REC_TYPES];
static char vestat_msgbuf[256];
static int mes_search_box;
static int slot_d00;
extern char *chmat ();
extern void find_keyword (char *cur_word, int vista_num);
extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num);
extern void write_wordstr (struct or_hwordrec * glob_word, int vista_num);
static void stat_search (void); /* redefined below */
/********************************/
/* */
/* Release Shared Memory */
/* */
/********************************/
void release_shm_mem (void)
{
if (global_memory_ptr != NULL) {
if (shmdt (global_memory_ptr) == -1) {
DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 104,
PROGNAME "104 Cannot detach shared memory "));
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
return;
}
if (shmctl (shm_id, IPC_RMID, NULL) == -1) {
DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 110,
PROGNAME "110 Cannot remove shared memory "));
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
return;
}
global_memory_ptr = NULL;
}
return;
} /* release_shm_mem() */
/********************************/
/* */
/* Init Global Memory */
/* */
/********************************/
/* addrs - largest DBA slot in D00 file in the current database
* r_addrs - total records count in the current database.
*/
static int init_global_memory (DtSrINT32 addrs, DtSrINT32 r_addrs)
{
long i, j;
size_t k;
i = DtSrMAX_STEMCOUNT * ((addrs >> 3) + 1) * 2 +
addrs * sizeof (int) + sizeof (DB_ADDR) * r_addrs;
j = sizeof (STAT_STR) * addrs + sizeof (DB_ADDR) * r_addrs;
k = (i > j) ? i : j;
shm_id = shmget (IPC_PRIVATE, k, SHM_FLAG);
if ((global_memory_ptr = (char *) shmat (shm_id, (char *) 0, 0)) ==
((char *) -1)) {
DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 115,
PROGNAME "115 No shared memory available"));
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
return FALSE;
}
return TRUE;
} /* init_global_memory() */
/****************************************/
/* */
/* efim_qsort */
/* */
/****************************************/
/* Custom quick sort algorithm (medium-of-3 partitioning).
* Coded for efficiency given our expected data characteristics,
* and for interruptability.
*/
int efim_qsort (void)
{
time_t start_time;
double time_dif;
static long left, right;
static long scan_l, scan_r, mid3, pvidx, l_size, r_size;
static long sptr;
static float pivot, temp, stack_l[STACKSZ], stack_r[STACKSZ];
static DB_ADDR dba;
/* Test whether user has pushed STOP button since last call. */
if (usrblk.flags & USR_STOPSRCH) {
if (OE_flags & OE_AUDIT)
oe_write_audit_rec (-1L);
usrblk.retncode = OE_USER_STOP;
release_shm_mem ();
return TRUE;
}
if (begin_qsort) {
sptr = 0;
left = 0;
right = num_hits - 1;
begin_qsort = FALSE;
}
time (&start_time);
for (;;) {
/* check iteration loop */
time_dif = difftime (time (NULL), start_time);
if ((time_dif > TIME_ITERATION
|| usrblk.debug & USRDBG_ITERATE) &&
!(usrblk.flags & USR_NO_ITERATE)) {
usrblk.retncode = OE_SEARCHING;
usrblk.workproc = stat_search;
mes_search_box = TRUE;
return TRUE;
}
while (right > left) {
if ((right - left) > MED_3_VALUE) {
/*
* compute value for the median-of-three partitioning
*/
mid3 = (left + right) >> 1;
/*
* three-sort left, middle, and right elements
*/
if ((stat_array + left)->wght < (stat_array + mid3)->wght) {
temp = (stat_array + left)->wght;
(stat_array + left)->wght =
(stat_array + mid3)->wght;
(stat_array + mid3)->wght = temp;
dba = (stat_array + left)->dba;
(stat_array + left)->dba =
(stat_array + mid3)->dba;
(stat_array + mid3)->dba = dba;
}
if ((stat_array + left)->wght < (stat_array + right)->wght) {
temp = (stat_array + left)->wght;
(stat_array + left)->wght =
(stat_array + right)->wght;
(stat_array + right)->wght = temp;
dba = (stat_array + left)->dba;
(stat_array + left)->dba =
(stat_array + right)->dba;
(stat_array + right)->dba = dba;
}
if ((stat_array + mid3)->wght < (stat_array + right)->wght) {
temp = (stat_array + mid3)->wght;
(stat_array + mid3)->wght =
(stat_array + right)->wght;
(stat_array + right)->wght = temp;
dba = (stat_array + mid3)->dba;
(stat_array + mid3)->dba =
(stat_array + right)->dba;
(stat_array + right)->dba = dba;
}
/* select pivot element index */
pvidx = right - 1;
/* exchange pivot with the middle element */
temp = (stat_array + mid3)->wght;
(stat_array + mid3)->wght = (stat_array + pvidx)->wght;
(stat_array + pvidx)->wght = temp;
dba = (stat_array + mid3)->dba;
(stat_array + mid3)->dba = (stat_array + pvidx)->dba;
(stat_array + pvidx)->dba = dba;
/* setup for partitioning */
scan_l = left + 1;
scan_r = right - 2;
}
else {
/* select pivot element index */
pvidx = right;
/* set scanning indexes */
scan_l = left;
scan_r = right - 1;
}
/* select pivot element */
pivot = (stat_array + pvidx)->wght;
for (;;) {
/* scan from left */
while ((stat_array + scan_l)->wght > pivot) {
scan_l++;
}
/* scan from right */
while ((stat_array + scan_r)->wght < pivot) {
if (scan_r == 0) {
break;
}
scan_r--;
}
/* if scan have met, exit inner loop */
if (scan_l >= scan_r) {
break;
}
/* exchange elements */
temp = (stat_array + scan_r)->wght;
(stat_array + scan_r)->wght = (stat_array + scan_l)->wght;
(stat_array + scan_l)->wght = temp;
dba = (stat_array + scan_r)->dba;
(stat_array + scan_r)->dba = (stat_array + scan_l)->dba;
(stat_array + scan_l)->dba = dba;
/* move scans to next elements */
scan_l++;
scan_r--;
}
if (scan_l != pvidx) {
/* exchange finale element */
temp = (stat_array + pvidx)->wght;
(stat_array + pvidx)->wght = (stat_array + scan_l)->wght;
(stat_array + scan_l)->wght = temp;
dba = (stat_array + pvidx)->dba;
(stat_array + pvidx)->dba = (stat_array + scan_l)->dba;
(stat_array + scan_l)->dba = dba;
}
/* calculate section sizes */
l_size = scan_l - left;
r_size = right - scan_l;
/* place largest section on stack */
if (l_size > r_size) {
/* ignore 1-element sections */
if (l_size > 1) {
sptr++;
if (sptr == STACKSZ) {
fputs (catgets (dtsearch_catd, MS_vestatis, 107,
PROGNAME "107 Qsort stack overflow.\n"),
aa_stderr);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return FALSE;
}
*(stack_l + sptr) = left;
*(stack_r + sptr) = scan_l - 1;
}
/* ignore 1-element sections */
if (r_size != 0) {
left = scan_l + 1;
}
else {
break;
}
}
else {
/* ignore 1-element sections */
if (r_size > 1) {
sptr++;
if (sptr == STACKSZ) {
fputs (catgets (dtsearch_catd, MS_vestatis, 107,
PROGNAME "107 Qsort stack overflow.\n"),
aa_stderr);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return FALSE;
}
*(stack_l + sptr) = scan_l + 1;
*(stack_r + sptr) = right;
}
/* ignore 1-element sections */
if (l_size != 0) {
right = scan_l - 1;
}
else {
break;
}
}
}
/* iterate with values from stack (if any) */
if (sptr) {
left = *(stack_l + sptr);
right = *(stack_r + sptr);
sptr--;
}
else {
break;
}
}
qsort_done = TRUE;
return TRUE;
} /* efim_qsort() */
/****************************************/
/* */
/* fill_stem */
/* */
/****************************************/
/* "Visit" subroutine of descend_tree(), which is itself subroutine
* of traverse_tree(). Builds query_stems array
* and establishes its size in num_diff_words.
*/
static void fill_stem (TREENODE * cur_stem)
{
query_stems[num_diff_words].count = cur_stem->count;
strcpy (query_stems[num_diff_words].stem, cur_stem->word);
num_diff_words++;
return;
} /* fill_stem() */
/****************************************/
/* */
/* descend_tree */
/* */
/****************************************/
/* Subroutine of traverse_tree(), Robson tree traversal algorithm. */
static void descend_tree (void)
{
int not_done = TRUE;
while (not_done) {
/* end of 'descent' subalgorithm? */
if ((pres->llink == NULL) && (pres->rlink == NULL)) {
/* Preorder, Symmetric Order and Postorder */
fill_stem (pres);
avail_node = pres;
return;
}
if (pres->llink != NULL) {
/* Preorder */
fill_stem (pres);
next = pres->llink;
pres->llink = prev;
prev = pres;
pres = next;
}
else {
/* Preorder and Symmetric Order */
fill_stem (pres);
next = pres->rlink;
pres->rlink = prev;
prev = pres;
pres = next;
}
}
return;
} /* descend_tree() */
/********************************/
/* */
/* traverse_tree */
/* */
/********************************/
/* The algorithm is based on the J. M. ROBSON link inversion traversal
* algorithm for binary trees. Ref. Thomas A. STANDISH pp. 77-78.
*/
static void traverse_tree (void)
{
int not_done = TRUE;
int descend = TRUE;
/* initialize the variables */
pres = root_node;
prev = pres;
top_of_stack = NULL;
stack = NULL;
while (not_done) {
if (descend) {
descend_tree ();
}
if (pres == root_node) {
return;
}
if (prev->rlink == NULL) {
/* Symmetric Order and Postorder */
/*** fill_stem(prev); ***/
next = prev->llink;
prev->llink = pres;
pres = prev;
prev = next;
descend = FALSE;
}
else {
if (prev->llink == NULL) {
/* Postorder */
/** fill_stem(prev); **/
next = prev->rlink;
prev->rlink = pres;
pres = prev;
prev = next;
descend = FALSE;
}
else {
if (prev == top_of_stack) {
/* Postorder */
/** fill_stem(prev); **/
next = stack;
top_of_stack = stack->rlink;
stack = stack->llink;
next->llink = NULL;
next->rlink = NULL;
next = prev->llink;
prev->llink = prev->rlink;
prev->rlink = pres;
pres = prev;
prev = next;
descend = FALSE;
}
else {
/* Symmetric Order */
/*** fill_stem(prev); ***/
avail_node->llink = stack;
avail_node->rlink = top_of_stack;
stack = avail_node;
top_of_stack = prev;
next = prev->rlink;
prev->rlink = pres;
pres = next;
descend = TRUE;
}
}
}
}
} /* traverse_tree() */
/********************************/
/* */
/* Get Next Memory Block */
/* */
/********************************/
void get_next_memory_block (size_t node_size)
{
CNCRD_MEMORY_AREA_LIST *temp_ptr;
temp_ptr = memory_blocks;
/*
* We run out of pre-allocated memory. Allocate additional block of
* memory
*/
if (cur_mem_ptr == NULL) {
total_memory_size += node_size;
mem_start = (char *) malloc (total_memory_size);
mem_offset = 0L;
mem_offset += node_size;
cur_pos = mem_start;
if (mem_start == NULL) {
fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 310,
"%s Out of Memory. Need %ld bytes.\n"),
PROGNAME "310", total_memory_size);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
/*
* allocate space for the next member of the memory blocks link
* list
*/
memory_blocks = (CNCRD_MEMORY_AREA_LIST *)
malloc (sizeof (CNCRD_MEMORY_AREA_LIST) + 2);
if (memory_blocks == NULL) {
fputs (catgets (dtsearch_catd, MS_vestatis, 314,
PROGNAME"314 Out of Memory.\n"), aa_stderr);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
memory_blocks->start_of_mem_block = mem_start;
memory_blocks->next_block = temp_ptr;
memory_blocks->block_size = total_memory_size;
/**** allocation of initial memory blocks is done ****/
}
/* Use next available block of memory */
else {
mem_start = cur_mem_ptr->start_of_mem_block;
total_memory_size = cur_mem_ptr->block_size;
cur_mem_ptr = cur_mem_ptr->next_block;
mem_offset = 0L;
mem_offset += node_size;
cur_pos = mem_start;
}
return;
} /* get_next_memory_block() */
/********************************/
/* */
/* build_bin_tree */
/* */
/********************************/
/* Subroutine of inv_index_bin_tree().
* Called for each stem in query.
* Inserts new stem (already uppercase) into tree
* or increments existing stem's count.
* Returns TRUE and incr num_diff_words if new stem inserted.
* Returns FALSE if existing stem's count merely incremented.
* Returns FALSE and OE_ABORT set on error.
*/
static int build_bin_tree (char *cur_word)
{
int i;
int wordlen;
size_t treenode_size;
TREENODE *new;
TREENODE **this_link;
wordlen = strlen (cur_word);
/* Determine the amount of memory needed for the
* new node. Add in a pad amount to align it
* on the machine's word (integer) boundary.
* Some machines aren't happy about misaligned
* structures and we're emulating our own malloc.
* (Thanks, and a tip o' the hat to Takuki Kamiya).
*/
treenode_size = sizeof (TREENODE) + wordlen + 2;
treenode_size +=
(STRUCT_ALIGN - treenode_size % STRUCT_ALIGN) % STRUCT_ALIGN;
/* allocate a new node and load its data fields */
mem_offset += treenode_size;
if (mem_offset > total_memory_size) {
/* allocate new chunk of memory */
get_next_memory_block (treenode_size);
if (usrblk.retncode == OE_ABORT)
return FALSE;
}
new = (TREENODE *) cur_pos;
cur_pos = mem_start + mem_offset;
new->llink = NULL;
new->rlink = NULL;
new->word = (char *) new + sizeof (TREENODE);
new->count = 1;
strcpy (new->word, cur_word);
/* Insert current word into binary tree */
for (this_link = &root_node; *this_link != NULL;) {
i = strcmp (new->word, (*this_link)->word);
/* Test for current word already in the binary tree */
if (i == 0) {
mem_offset -= treenode_size;
cur_pos = mem_start + mem_offset;
(*this_link)->count++;
return FALSE; /* no point in continuing descent */
}
/* Descend tree to find correct insertion point */
this_link = (i < 0) ?
&(*this_link)->llink : &(*this_link)->rlink;
} /* end for loop to find tree insertion
* point */
/* Insert new node at current location in tree */
*this_link = new;
num_diff_words++;
return TRUE;
} /* build_bin_tree() */
/************************/
/* */
/* init_memory */
/* */
/************************/
/* Initialize the first block of memory for the binary tree.
* This function is called only once at each run of the offline program.
*/
void init_memory (void)
{
mem_start = (char *) malloc (MEMORY_SIZE);
if (mem_start == NULL) {
fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 310,
"%s Out of Memory. Need %ld bytes.\n"), PROGNAME "310", MEMORY_SIZE);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
total_memory_size = MEMORY_SIZE;
cur_pos = mem_start;
mem_offset = 0L;
/*
* Allocate space for the first member of the memory blocks link list
*/
memory_blocks = (CNCRD_MEMORY_AREA_LIST *)
malloc (sizeof (CNCRD_MEMORY_AREA_LIST) + 2);
if (memory_blocks == NULL) {
fputs (catgets (dtsearch_catd, MS_vestatis, 314,
PROGNAME "314 Out of Memory.\n"), aa_stderr);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
memory_blocks->start_of_mem_block = mem_start;
memory_blocks->block_size = total_memory_size;
memory_blocks->next_block = NULL;
cur_mem_ptr = NULL;
return;
} /* init_memory() */
/********************************/
/* */
/* inv_index_bin_tree */
/* */
/********************************/
/* Builds binary tree of all stems in query.
* Returns TRUE and loads num_diff_words with number
* of nodes in tree if tree successfully built,
* or if query is empty.
* Returns FALSE on any error (causing eventual engine abort).
*/
static int inv_index_bin_tree (void)
{
char *cptr;
DBLK *dblk = usrblk.dblk;
PARG parg;
/* First time initialize the first block of memory */
if (memory_blocks == NULL) {
/** INITIALIZE MEMORY **/
init_memory ();
if (usrblk.retncode == OE_ABORT)
return FALSE;
root_node = NULL;
}
/* WORD LOOP. Parse and stem each word in query.
* Add each stem to bin tree or incr its count.
*/
memset (&parg, 0, sizeof(PARG));
parg.dblk = dblk;
parg.string = usrblk.query;
for ( cptr = dblk->parser (&parg);
cptr;
cptr = dblk->parser (NULL)) {
build_bin_tree (dblk->stemmer (cptr, dblk));
if (usrblk.retncode == OE_ABORT)
return FALSE;
}
return TRUE;
} /* inv_index_bin_tree() */
/************************/
/* */
/* comp_stat */
/* */
/************************/
int comp_stat (void *val1, void *val2)
{
STAT_STR *bkt1;
STAT_STR *bkt2;
bkt1 = (STAT_STR *) val1;
bkt2 = (STAT_STR *) val2;
if ((bkt2->wght) > (bkt1->wght)) {
return 1;
}
else {
return -1;
}
} /* comp_stat() */
/************************************************/
/* */
/* load_ditto_str */
/* */
/************************************************/
/* Last function called from statistical search.
* Builds a real AusText hitlist from the sorted stat_array,
* translating the statistical weights to AusText 'proximity'
* values, and truncating the hitlist at user's maxhits.
* Working variables made static for speeeeeeeed.
*/
void load_ditto_str (void)
{
struct or_objrec cur_rec; /* structure taken from austext.h */
struct or_miscrec rec_data;
static time_t start_time;
static double time_dif;
static DB_ADDR dba1;
static DtSrResult *cur_ditto_mem;
static DtSrResult *ditto_llist;
static DtSrResult *temp_ditto;
static int debugging;
static int m;
static DtSrINT32 d0024;
static DtSrINT32 maxhits;
static DtSrINT32 i32, i32_start, j32;
static int fzkeysz, fzkey_remaining, abstrsz, dittosz;
static char *src, *targ, *targend;
static int check_dates = FALSE;
static double sum = 0.0;
static double sum1, sum2, sum3, sum4;
debugging = (usrblk.debug & USRDBG_SRCHCMPL);
maxhits = usrblk.dblk->maxhits;
fzkeysz = usrblk.dblk->dbrec.or_fzkeysz;
abstrsz = usrblk.dblk->dbrec.or_abstrsz;
dittosz = sizeof (DtSrResult) + abstrsz + 16;
if (debugging)
fprintf (aa_stderr, PROGNAME "773 "
"numhits=%ld maxhits=%d numwords=%d abstrsz=%d\n",
(long)num_hits, (int)maxhits, num_diff_words, abstrsz);
if (begin_load_ditto) {
/* test for zero hits */
if (num_hits == 0) {
usrblk.workproc = dummy_workproc;
usrblk.retncode = OE_NOTAVAIL;
if (OE_flags & OE_AUDIT)
oe_write_audit_rec (0L);
release_shm_mem ();
return;
}
check_dates = (usrblk.objdate1 || usrblk.objdate2);
/* In order to translate statistical weight into an AusText
* proximity, basically you have to invert it, then scale it.
* The statistical weight is a similarity measure: the
* larger it is the more similar the document to the query.
* But AusText 'proximity' is like a 'distance' measure,
* the smaller the number the closer the document is to the query.
*
* First 'normalize' each document's statistical
* weight to be a fraction between 0 and 1. Do this
* by calculating a normalization factor (sum1), the
* sqrt of the sum of squares of first NORM_VALUE weights.
* (Trying to make the inversion scheme produce
* reasonable proximity numbers for these first records).
*
* To complete proximity initialization, he uses
* the sum1 factor to determine and keep the first record's
* normalized weight (sum), presumably a fraction close
* to 1.0, and the first record's proximity (sum2),
* basically the percent
* value that the first doc is 'distant' from perfection (1.0 or 100%).
* For example, if the normalized weight of the first record is .931
* then the proximity will be 7 (100% - 93% = 7%). He does this now
* because he's going to use this first proximity (sum2) as a scaling
* factor to stretch out all the subsequent proximities so they
* look reasonable.
*/
sum = 0.0;
for (i32 = 0; i32 < num_hits; i32++) {
sum1 = (double) (stat_array + i32)->wght /
(double) num_diff_words;
sum += sum1 * sum1;
if (i32 >= NORM_VALUE)
break;
}
/*
* sum1 = normalization factor.
* sum = normalized weight (betw 0 and 1) of first record.
* sum2 = proximity of first record, proximity scale factor.
*/
sum1 = sqrt (sum);
sum = ((stat_array + 0)->wght / num_diff_words) / sum1;
sum2 = (1.0 - sum) * 100.0;
if (debugging)
fprintf (aa_stderr, PROGNAME "844 "
"normfac=%.2lf normwt(#1)=%.2lf prox(#1)=%.2lf\n",
sum1, sum, sum2);
/* Preallocate first hit on ditto_list */
ditto_llist = (DtSrResult *) austext_malloc (dittosz,
PROGNAME "449", NULL);
j32 = 0;
i32_start = 0;
d0024 = OR_D00 << 24;
begin_load_ditto = FALSE;
} /* endif (begin_load_ditto) */
/* Test whether user has pushed STOP button since last call */
if (usrblk.flags & USR_STOPSRCH) {
if (OE_flags & OE_AUDIT)
oe_write_audit_rec (-1L);
usrblk.retncode = OE_USER_STOP;
release_shm_mem ();
if (j32 == 0)
free (ditto_llist);
else
free_llist ((LLIST **) &ditto_llist);
return;
}
time (&start_time);
/**** MAIN DtSrResult LIST BUILD LOOP ****/
for (i32 = i32_start; i32 < num_hits; i32++) {
/* check iteration loop */
time_dif = difftime (time (NULL), start_time);
if ((time_dif > TIME_ITERATION
|| usrblk.debug & USRDBG_ITERATE) &&
!(usrblk.flags & USR_NO_ITERATE)) {
i32_start = i32;
usrblk.retncode = OE_SEARCHING;
usrblk.workproc = load_ditto_str;
mes_search_box = TRUE;
return;
}
dba1 = ((stat_array + i32)->dba * slot_d00 - dba_offset)
| d0024;
/*
* Don't use CRSET or RECREAD macros here so we can trap invalid
* dba errs.
*/
d_crset (&dba1, saveusr.vistano);
if (db_status < 0) {
fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 437,
"%s: db_status = %d, dba = %d:%ld (x'%08.8lx'), vistano = %d\n"),
PROGNAME "437", db_status, (dba1 & 0xff000000) >> 24,
dba1 & 0xffffff, dba1, saveusr.vistano);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
d_recread (&cur_rec, saveusr.vistano);
if (db_status < 0) {
fprintf (aa_stderr, catgets (dtsearch_catd, MS_vestatis, 437,
"%s: db_status = %d, dba = %d:%ld (x'%08.8lx'), vistano = %d\n"),
PROGNAME "437", db_status, (dba1 & 0xff000000) >> 24,
dba1 & 0xffffff, dba1, saveusr.vistano);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
swab_objrec (&cur_rec, NTOH);
/* Skip any record with undesired keytype
* char, ie first char of key.
*/
if (*(rec_type_tab + cur_rec.or_objkey[0]) == 0)
continue;
/* Skip record if out of date range. */
if (check_dates)
if (!objdate_in_range (cur_rec.or_objdate,
usrblk.objdate1, usrblk.objdate2))
continue;
if (j32 == 0) /* first ditto node already allocated */
cur_ditto_mem = ditto_llist;
else {
cur_ditto_mem = malloc (dittosz);
if (cur_ditto_mem == NULL) {
fputs ( catgets (dtsearch_catd, MS_vestatis, 504,
PROGNAME "504 Cannot allocate cur_ditto\n"),
aa_stderr);
OE_flags |= OE_PERMERR;
usrblk.retncode = OE_ABORT;
release_shm_mem ();
return;
}
temp_ditto->link = cur_ditto_mem;
}
/* Load the ditto_list for this dba */
memset (cur_ditto_mem, 0, sizeof(DtSrResult));
cur_ditto_mem->dbn = OE_dbn;
cur_ditto_mem->dba = dba1;
strcpy (cur_ditto_mem->reckey, cur_rec.or_objkey);
cur_ditto_mem->objsize = cur_rec.or_objsize;
cur_ditto_mem->objdate = cur_rec.or_objdate;
cur_ditto_mem->objflags = cur_rec.or_objflags;
cur_ditto_mem->objuflags = cur_rec.or_objuflags;
cur_ditto_mem->objtype = cur_rec.or_objtype;
cur_ditto_mem->objcost = cur_rec.or_objcost;
/*****cur_ditto_mem->flags = 0;****/
cur_ditto_mem->abstractp = (char *) cur_ditto_mem +
sizeof (DtSrResult);
cur_ditto_mem->abstractp[0] = 0;
/* Translate statistical weight into AusText proximity.
* sum3 = normalized weight (betw 0 and 1).
* sum4 = prox = ratio of this normalized weight to
* first rec's weight, scaled by the first rec's proximity.
* No proximity is allowed to exceed some very large number.
*/
sum3 = ((stat_array + i32)->wght / num_diff_words) / sum1;
sum4 = sum2 * (sum / sum3);
if (sum4 > INFINITY)
sum4 = INFINITY;
cur_ditto_mem->proximity = sum4;
if (debugging)
fprintf (aa_stderr,
" --> dba=%ld normwt=%.4lf prox=%d key='%s'\n",
dba1, sum3, cur_ditto_mem->proximity,
cur_ditto_mem->reckey);
/*
* The abstract immediately follows the fuzzy key in the FZKABS
* misc recs. It may span several recs.
*/
if (abstrsz > 0) {
targ = cur_ditto_mem->abstractp;
targend = targ + abstrsz - 1;
fzkey_remaining = fzkeysz;
SETOR (PROGNAME "2270", OR_OBJ_MISCS, saveusr.vistano);
FINDFM (PROGNAME "2271", OR_OBJ_MISCS, saveusr.vistano);
while (db_status == S_OKAY) {
RECREAD (PROGNAME "549", &rec_data, saveusr.vistano);
NTOHS (rec_data.or_misctype);
if (rec_data.or_misctype == ORM_FZKABS) {
src = (char *) rec_data.or_misc;
for (m = 0; m < sizeof(rec_data.or_misc); m++) {
if (fzkey_remaining > 0) {
src++;
fzkey_remaining--;
continue; /* inner for-loop on m */
}
*targ = *src;
if (*src++ == 0 || targ++ >= targend) {
*targ = 0;
targ = targend; /* make outer loop end */
break;
}
} /* end for-loop for curr misc rec */
} /* endif: misctype == FZKABS */
if (targ >= targend)
break;
FINDNM (PROGNAME "545", OR_OBJ_MISCS, saveusr.vistano);
} /* end while-loop */
} /* endif: (abstrsz > 0) */
cur_ditto_mem->link = NULL;
temp_ditto = cur_ditto_mem;
/* Increment to next hit.
* Break loop when we reach user's specified maxhits.
*/
j32++; /* [j32 same as i] !? */
if (j32 >= maxhits)
break;
} /* i32-loop on each hit in ditto list */
if (j32 == 0) {
usrblk.workproc = dummy_workproc;
usrblk.retncode = OE_NOTAVAIL;
if (OE_flags & OE_AUDIT)
oe_write_audit_rec (0L);
release_shm_mem ();
return;
}
if (num_hits >= maxhits) {
if (!(usrblk.flags & USR_NO_INFOMSGS)) {
sprintf (vestat_msgbuf, catgets (dtsearch_catd, MS_vestatis, 421,
"$s Total Number Hits = %ld. Discarded hits beyond maximum number specified."),
PROGNAME "421", (long)num_hits);
DtSearchAddMessage (vestat_msgbuf);
}
}
free_llist ((LLIST **) &usrblk.dittolist);
usrblk.dittolist = ditto_llist;
usrblk.dittocount = j32;
usrblk.workproc = dummy_workproc;
usrblk.retncode = OE_OK;
if (OE_flags & OE_AUDIT)
oe_write_audit_rec ((long) num_hits);
/***** Free shared memory *****/
release_shm_mem ();
return;
} /* load_ditto_str() */
/****************************************/
/* */
/* stat_search */
/* */
/****************************************/
/* Subroutine of ve_statistical() and interruptable workproc.
*/
static void stat_search (void)
{
time_t start_time;
double time_dif;
DB_ADDR temp, temp1;
struct or_hwordrec word1; /* structure taken from austext.h */
double idf, cur_weight;
int qs;
DtSrINT32 int32, j32;
/*****@@@ size_t size;****/
static int qs_start;
/* Test whether user has pushed STOP button since last call */
if (usrblk.flags & USR_STOPSRCH) {
if (OE_flags & OE_AUDIT)
oe_write_audit_rec (-1L);
usrblk.retncode = OE_USER_STOP;
release_shm_mem ();
return;
}
if (begin_sort) {
begin_qsort = TRUE;
qsort_done = FALSE;
if (begin_search) {
qs_start = 0;
begin_search = FALSE;
}
time (&start_time);
/*
* For every query stem, read d99. For every dba in d99 for each
* stem, update object's stat array node with rec count and a
* weight based on the IDF for this stem. (IDF is described
* below). Saveusr.stemcount = lesser of DtSrMAX_STEMCOUNT or
* num_diff_words. All stems are stored in d99 beginning with ^O
* (decimal 15). Index qs = curr query stem
*/
for (qs = qs_start; qs < saveusr.stemcount; qs++) {
word1.or_hwordkey[0] = 15;
word1.or_hwordkey[1] = '\0';
strcat (word1.or_hwordkey, query_stems[qs].stem);
find_keyword (word1.or_hwordkey, saveusr.vistano);
/*
* If word is not in the database, ignore it. [ If word
* not in database, why not take the next stem in query_stems
* array, if any? ]
*/
if (db_status != S_OKAY)
word1.or_hwaddrs = 0;
else
read_wordstr (&word1, saveusr.vistano);
if (word1.or_hwaddrs > 0) {
fseek (usrblk.dblk->iifile, word1.or_hwoffset,
SEEK_SET);
/****@@@size = sizeof (DB_ADDR) * word1.or_hwaddrs;***/
fread (word_addrs, sizeof(DB_ADDR),
(size_t)word1.or_hwaddrs, usrblk.dblk->iifile);
/*
* Calculate IDF (inverse document frequency) for this
* word. The IDF is a statistical ratio of the number
* of documents containing the word and the total
* number of documents in the entire corpus.
* It is calculated here on the fly to save space in the
* database. IDF = {log (totnumdocs / numdocswithword) /
* log(2)} + 1. Note that an IDF of 1 means the word
* occurs in every doc (it's meaningless). An IDF of 19
* means the word occurs once in every 300,000 recs.
* Note that by dividing by log(2) the IDF also tells
* us how many binary digits are necessary to discriminate
* the word. Finally I think 1.0 was added to prevent
* it ever becoming zero when converted to integer.
*/
idf = (log ((double) real_num_rec / (double) word1.or_hwaddrs)
/ LOG2) + 1.0;
/*
* WEIGHT PASS #1:
* Update the stat array node for each doc (ie dba) which
* includes this stem. Specifically,
* sum the product of the IDF and word-doc weight into
* the 'wght' bucket, and update the number of query
* words this doc contains. Note that the d99 dba format
* is slot# in hi 3 bytes, word-doc weights in lo byte.
*/
for (j32 = 0; j32 < word1.or_hwaddrs; j32++) {
NTOHL (word_addrs [j32]);
temp1 = *(word_addrs + j32); /* d99 dba */
cur_weight = (double) (temp1 & 0xFF); /* lo byte */
temp = temp1 >> 8; /* slot# */
((stat_array + temp)->num_word_hits)++;
((stat_array + temp)->dba) = temp;
((stat_array + temp)->wght) += (float) (cur_weight * idf);
}
} /* end if (word1.or_hwaddrs > 0), ie
* query word exists */
/*
* If the query words were common, the last double loop may
* have taken a long time. If so, return now to the user
* interface to allow the gui to respond to button clicks
* (like CANCEL buttons).
*/
time_dif = difftime (time (NULL), start_time);
if ((time_dif > TIME_ITERATION
|| usrblk.debug & USRDBG_ITERATE) &&
!(usrblk.flags & USR_NO_ITERATE)) {
if (qs == saveusr.stemcount - 1) {
usrblk.retncode = OE_SEARCHING;
usrblk.workproc = stat_search;
mes_search_box = TRUE;
return;
}
else {
qs_start = qs + 1;
usrblk.retncode = OE_SEARCHING;
usrblk.workproc = stat_search;
mes_search_box = TRUE;
return;
}
} /* end if (time_dif > TIME_ITERATION */
} /* end qs-loop on each query stem */
/*
* Entire stat array contains one node for every possible dba
* (doc). Collapse the records that were actually referenced by
* the query words into the top portion of the array.
* Set 'num_hits' to the collapsed stat array size, ie
* num_hits = the total number of docs that will be on
* the prelim hitlist, prior to sort and truncation to user's maxhits.
*
* WEIGHT PASS #2:
* While we're at it, finalize the accumulated 'wght' field, which
* will be our sort field, by multiplying it by the ratio of the
* number of query words in the document divided by the number of
* words in the query.
* Thus the final sort field for each doc is the sum
* over all the query words in the doc of 3 factors:
* 1) IDF (relative weight of each query word in corpus), times
* 2) d99wght (relative weight of each query word in doc), times
* 3) weight based on number of different query words in this doc.
*/
num_hits = 0;
for (int32 = 0; int32 < total_num_addrs; int32++) {
if (stat_array[int32].wght > 0) {
(stat_array + num_hits)->num_word_hits =
(stat_array + int32)->num_word_hits;
(stat_array + num_hits)->wght = (stat_array + int32)->wght *
((double) (stat_array + int32)->num_word_hits /
(double) num_diff_words);
(stat_array + num_hits)->dba = (stat_array + int32)->dba;
num_hits++;
}
}
/*
* We're about to sort the actual hits. If the number of them
* exceeds a certain threshold, return to the user interface one
* more time to again allow the gui to respond to user CANCEL
* events.
*/
if (num_hits > SORT_MESG && !(usrblk.flags & USR_NO_ITERATE)) {
if (!mes_search_box) {
DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis, 990,
PROGNAME"990 The system is now sorting. Please wait."));
}
usrblk.retncode = OE_SEARCHING;
usrblk.workproc = stat_search;
mes_search_box = TRUE;
begin_sort = FALSE;
return;
}
} /* end if (begin_sort) */
/* Sort the preliminary hitlist (stat_array)
* by the calculated statistical weights.
*/
if (!efim_qsort ())
return;
/* Build a real AusText hitlist from the sorted stat_array,
* translating the statistical weights to AusText 'proximity'
* values, and truncating the hitlist at user's maxhits.
*/
if (qsort_done) {
begin_load_ditto = TRUE;
load_ditto_str ();
}
return;
} /* stat_search() */
/****************************************/
/* */
/* ve_statistical */
/* */
/****************************************/
void ve_statistical (void)
{
void stat_search (void);
DB_ADDR dba;
int i, j;
DtSrINT32 int32;
mes_search_box = FALSE;
usrblk.flags &= ~USR_STOPSRCH; /* turn off stop button */
usrblk.retncode = OE_OK;
usrblk = usrblk;
saveusr.vistano = usrblk.dblk->vistano;
saveusr.dittolist = NULL;
saveusr.dittocount = 0L;
saveusr.iterations = 1;
/****** find total number of records in the database *********/
RECFRST (PROGNAME "1067", OR_OBJREC, saveusr.vistano);
CRGET (PROGNAME "1068", &dba, saveusr.vistano);
real_num_rec = usrblk.dblk->dbrec.or_reccount;
slot_d00 = usrblk.dblk->dbrec.or_recslots;
dba_offset = slot_d00 - (dba & 0x00FFFFFF);
total_num_addrs = (usrblk.dblk->dbrec.or_maxdba -
(dba & 0x00FFFFFF) + 1) / slot_d00 + 1;
/* stat_array size = 1 node for every possible object */
if (usrblk.query[0] == 0) {
DtSearchAddMessage (catgets (dtsearch_catd, MS_vestatis,
677, PROGNAME "677 Query field is empty."));
usrblk.retncode = OE_BAD_QUERY;
return;
}
/*
* Build binary tree of each stem in query containing count of number
* of occurrences of stem in query. Loads num_diff_words with number
* of nodes in tree.
*/
num_diff_words = 0;
inv_index_bin_tree();
if (usrblk.retncode == OE_ABORT)
return;
if (num_diff_words < 1) {
usrblk.retncode = OE_NOTAVAIL;
return;
}
/***** allocate memory for query_stems array *********/
if (query_stems != NULL) {
free (query_stems);
query_stems = NULL;
}
query_stems = (QUERY_STEM_STR *) austext_malloc
(sizeof (QUERY_STEM_STR) * (num_diff_words + 1),
PROGNAME " 371", NULL);
/*
* Traverse tree to build query_stems array, each array node = tree
* node, ie each unique stem in query and its count in query.
* Num_diff_words now used as index for growing array.
*/
num_diff_words = 0;
traverse_tree ();
/*
* For each new query initialize memory offset, current memory start
* position, and total size for the available memory. Starts from the
* first member in the link list of memory blocks.
*/
root_node = NULL;
mem_start = memory_blocks->start_of_mem_block;
total_memory_size = memory_blocks->block_size;
cur_mem_ptr = memory_blocks->next_block;
cur_pos = mem_start;
mem_offset = 0L;
/*
* Copy first DtSrMAX_STEMCOUNT stems into the saveusr.stems. [So no more
* than DtSrMAX_STEMCOUNT will be used in search or hiliting!]
*/
for (i = 0; i < num_diff_words; i++) {
if (i == DtSrMAX_STEMCOUNT)
break;
strcpy (usrblk.stems[i], query_stems[i].stem);
}
usrblk.stemcount = i;
saveusr.stemcount = i;
/* Prepare a string holding first char of desired record ids */
for (i = 0; i < REC_TYPES; i++)
*(rec_type_tab + i) = 0;
for (i = 0, j = 0; i < usrblk.dblk->ktcount; i++)
if (usrblk.dblk->keytypes[i].is_selected)
*(rec_type_tab + usrblk.dblk->keytypes[i].ktchar) = 1;
saveusr.ktchars[j] = '\0';
/*
* New code using shared memory:
* Allocate global block of shared memory,
* and assign parts of this memory to each array.
* Stat array has an element for every possible db object.
* Set whole stat array to binary zeroes.
*/
if (!init_global_memory (total_num_addrs, real_num_rec))
return;
stat_array = (STAT_STR *) global_memory_ptr;
word_addrs = (DB_ADDR *) (global_memory_ptr +
total_num_addrs * sizeof (STAT_STR));
for (int32 = 0; int32 < total_num_addrs; int32++) {
(stat_array + int32)->wght = 0.0;
(stat_array + int32)->num_word_hits = 0;
}
/***** end of memory allocation for statistical array *********/
/* stat_search(): Search d99 and sum the statistical weights.
* Calls efim_qsort() to sort the hitlist by the weights.
*/
begin_search = TRUE; /* global initialization and state flags */
begin_sort = TRUE;
stat_search ();
return;
} /* ve_statistical() */
/*************************** VESTATIS.C ****************************/