/* cdesktopenv/cde/lib/DtSearch/boolpars.c -- 1126 lines, 31 KiB, C */

/*
* CDE - Common Desktop Environment
*
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
*
* These libraries and programs are free software; you can
* redistribute them and/or modify them under the terms of the GNU
* Lesser General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* These libraries and programs are distributed in the hope that
* they will be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with these libraries and programs; if not, write
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
/* $XConsortium: boolpars.c /main/5 1996/11/25 18:49:27 drk $
*
* (c) Copyright 1996 Digital Equipment Corporation.
* (c) Copyright 1996 Hewlett-Packard Company.
* (c) Copyright 1996 International Business Machines Corp.
* (c) Copyright 1996 Sun Microsystems, Inc.
* (c) Copyright 1996 Novell, Inc.
* (c) Copyright 1996 FUJITSU LIMITED.
* (c) Copyright 1996 Hitachi.
*/
/*
* COMPONENT_NAME: austext
*
* FUNCTIONS: add_syntax_errmsg
* boolean_parse
* boolyac_AND
* boolyac_COLLOC
* boolyac_NOT
* boolyac_OR
* copy_final_truthtab
* copy_token
* creatett
* freett
* get_stem_truthtab
* main
* process_user_args
* yyerror
* yylex
*
* ORIGINS: 27
*
*
* (C) COPYRIGHT International Business Machines Corp. 1996
* All Rights Reserved
* Licensed Materials - Property of IBM
* US Government Users Restricted Rights - Use, duplication or
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
*/
/********************* BOOLPARS.C ********************
* $Id: boolpars.c /main/5 1996/11/25 18:49:27 drk $
* February 1996.
* AusText/DtSearch yacc-based boolean query parser.
* Converts boolean query into stems array and truth table
* for subsequent search. Boolyac.y is the yacc source.
* After processing by yacc, it becomes boolyac.c and boolyac.h.
* This module contains all the related C source code: yylex,
* yacc action functions, and the main AusText driver function, boolean_parse.
* Additional information (format of TRUTHTAB) in header file boolpars.h.
*
* $Log$
* Revision 1.4 1996/03/22 23:12:50 miker
* Added string.h header and correctly cast strcspn() calls.
*
* Revision 1.3 1996/03/20 19:14:30 miker
* Enable collocation expressions in stem (type 'S') searches.
*
* Revision 1.2 1996/03/13 22:35:59 miker
* Changed char to UCHAR several places; similar typecasts.
*
* Revision 1.1 1996/03/05 15:52:06 miker
* Initial revision
*/
#include "SearchE.h"
#include <stdlib.h>
#include <string.h>
#include "boolpars.h"
#include "boolyac.h"
#if (DtSrMAX_STEMCOUNT != 8)
#error DtSrMAX_STEMCOUNT is not defined to be 8.
#endif
#define PROGNAME "BOOLPARS"
#define WORD_ENDERS " \t\n\f()|@~&"
#define MAX_YYERRORS 4
#define MS_boolpars 28
/****************************************/
/* */
/* GLOBALS */
/* */
/****************************************/
/* TRUE when the query text contains no '~'; set by boolean_parse(). */
int qry_has_no_NOTs = FALSE;
/* TRUE when the query text contains none of '|', '~', '@';
 * set by boolean_parse(). yylex() silently skips invalid words
 * and redundant '&' operators when this is TRUE.
 */
int qry_is_all_ANDs = FALSE;
/* Result of the parse; filled in by copy_final_truthtab(). */
TRUTHTAB final_truthtab = { 0 };
/* Count of words rejected by the language parser during the
 * current parse; reset by boolean_parse(), incremented by yylex().
 */
int parser_invalid_wordcount = 0;
/* Set from (usrblk.debug & USRDBG_BOOL) by boolean_parse(). */
static int debugging_boolpars = FALSE;
/* Storage for final_truthtab.permutes; allocated on first use
 * by copy_final_truthtab().
 */
static unsigned char
*final_permutes = NULL;
/* yylex() state: TRUE when the next word, '(' or '~' may follow
 * without an implied AND being generated first.
 */
static int last_token_was_boolop = TRUE;
/* Scratch buffer for formatting messages; allocated on first
 * call of boolean_parse().
 */
static char *msgbuf = NULL;
/* yylex() read position within the query string. */
static UCHAR *next_lex_char = NULL;
/* Number of '(' tokens delivered by yylex() and not yet closed. */
static int paren_count = 0;
/* Head of linked list of all currently allocated truth tables;
 * maintained by creatett() and freett().
 */
static TRUTHTAB *ttlist = NULL;
/* Reset by boolean_parse(); not otherwise referenced in the
 * code visible here.
 */
static int yyerror_count = 0;
static size_t yyleng; /* same as in lex API */
static char *yytext; /* same as in lex API */
/****************************************/
/* */
/* add_syntax_errmsg */
/* */
/****************************************/
/* Yacc action function for the grammar rules that trap syntax errors.
 * Formats the message selected by msgno into msgbuf and appends it
 * to the user's message list. Any msgno outside 1..6 produces a
 * generic message that reports the number itself.
 */
void add_syntax_errmsg (int msgno)
{
    if (msgno == 1) {
        /* Catalog message #2 is also issued from a second place. */
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 2,
            "%s Query field is empty."),
            PROGNAME"086");
    }
    else if (msgno == 2) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 5,
            "%s Boolean operators must be positioned\n"
            "between words or expressions. Two sequential words\n"
            "without an operator are interpreted as being separated\n"
            "by the AND operator (&)."),
            PROGNAME"091");
    }
    else if (msgno == 3) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 6,
            "%s Expression in parentheses is missing."),
            PROGNAME"093");
    }
    else if (msgno == 4) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 7,
            "%s NOT operator (~) must be positioned to\n"
            "the left of the word or expression it qualifies."),
            PROGNAME"098");
    }
    else if (msgno == 5) {
        /* Catalog message #3 is also issued from a second place. */
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 3,
            "%s COLLOCATION operator (@) may\n"
            "only be positioned between two words."),
            PROGNAME"111");
    }
    else if (msgno == 6) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 4,
            "%s One or more words in your\n"
            "query are not stored in database '%s'.") ,
            PROGNAME"089", usrblk.dblk->label);
    }
    else {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 8,
            "%s Invalid boolean query. Syntax Error #%d.") ,
            PROGNAME"100", msgno);
    }

    /* Every branch ends the same way: post the formatted text. */
    DtSearchAddMessage (msgbuf);
} /* add_syntax_errmsg() */
/****************************************/
/* */
/* creatett */
/* */
/****************************************/
/* Truth table constructor.
 * Allocates one chunk holding the TRUTHTAB header followed directly
 * by its permutes array, copies pmsz bytes from the passed permutes,
 * and pushes the new table onto the front of ttlist.
 * Returns the new table.
 */
static TRUTHTAB *creatett (int stemno, int pmsz, unsigned char *permutes)
{
    TRUTHTAB *tt;
    unsigned char *pmbytes;

    tt = austext_malloc (sizeof(TRUTHTAB) + pmsz + 4,
        PROGNAME"140", NULL);
    memset (tt, 0, sizeof(TRUTHTAB));

    /* The permutes array lives right behind the header. */
    pmbytes = (unsigned char *) (tt + 1);
    memcpy (pmbytes, permutes, pmsz);

    tt->permutes = pmbytes;
    tt->pmsz = pmsz;
    tt->stemno = stemno;

    /* Link at head of list. */
    tt->next = ttlist;
    ttlist = tt;
    return tt;
} /* creatett() */
/****************************************/
/* */
/* freett */
/* */
/****************************************/
/* Truth table destructor.
 * Searches ttlist for the passed table; if found, unlinks and
 * frees it. A table that is not on the list is left untouched.
 */
static void freett (TRUTHTAB *argtt)
{
    /* linkp always addresses the pointer that leads to the
     * node under inspection (list head or a node's next field).
     */
    TRUTHTAB **linkp = &ttlist;

    while (*linkp != NULL) {
        if (*linkp == argtt) {
            *linkp = argtt->next;
            free (argtt);
            return;
        }
        linkp = &(*linkp)->next;
    }
} /* freett() */
/****************************************/
/* */
/* copy_final_truthtab */
/* */
/****************************************/
/* Copies the passed truth table into the global final_truthtab.
 * Only the permutes and their count are carried over; every other
 * field of final_truthtab is zeroed. The permutes are stored in
 * final_permutes, which is allocated on the first call and
 * reused afterwards.
 * Returns the address of final_truthtab.
 */
TRUTHTAB *copy_final_truthtab (TRUTHTAB *tt)
{
    if (final_permutes == NULL)
        final_permutes = austext_malloc (300, PROGNAME"788", NULL);

    memset (&final_truthtab, 0, sizeof(TRUTHTAB));
    final_truthtab.permutes = final_permutes;
    final_truthtab.pmsz = tt->pmsz;
    memcpy (final_truthtab.permutes, tt->permutes, final_truthtab.pmsz);

    return &final_truthtab;
} /* copy_final_truthtab() */
/****************************************/
/* */
/* get_stem_truthtab */
/* */
/****************************************/
/* Builds and returns the truth table for a single stem.
 * Used by yylex() and by the collocation action function.
 * A stem not yet in saveusr.stems is appended there, and origword
 * (if not NULL) is stored in the same slot of usrblk.stems for
 * use in messages.
 * Returns NULL after posting a message if the stems array is full.
 */
static TRUTHTAB *get_stem_truthtab (char *newstem, char *origword)
{
    unsigned char   single_term_permutes [128];
    unsigned char   stembit;
    TRUTHTAB       *tt;
    int             slot;
    int             fill;
    int             value;

    /* Look for the stem among those already stored. */
    slot = 0;
    while (slot < saveusr.stemcount
            && strcmp (newstem, saveusr.stems[slot]) != 0)
        slot++;

    /* Not found: claim the next free slot, if there is one. */
    if (slot == saveusr.stemcount) {
        saveusr.stemcount++;
        if (saveusr.stemcount > DtSrMAX_STEMCOUNT) {
            sprintf (msgbuf, catgets (dtsearch_catd, MS_boolpars, 9,
                "%s Too many terms in boolean query."),
                PROGNAME"1513");
            DtSearchAddMessage (msgbuf);
            saveusr.stemcount--;
            return NULL;
        }
        strncpy (saveusr.stems[slot], newstem, DtSrMAXWIDTH_HWORD);
        saveusr.stems [slot] [DtSrMAXWIDTH_HWORD - 1] = 0;
        if (origword != NULL) {
            strncpy (usrblk.stems[slot], origword, DtSrMAXWIDTH_HWORD);
            usrblk.stems [slot] [DtSrMAXWIDTH_HWORD - 1] = 0;
        }
    }

    /* A one-term truth table is the half of the 256 possible
     * 8-bit permutes in which this term's bit is set, listed
     * in ascending order.
     */
    stembit = 1 << slot;
    fill = 0;
    for (value = 0; value < 256; value++) {
        if (value & stembit)
            single_term_permutes [fill++] = value;
    }
    tt = creatett (slot, 128, single_term_permutes);

    if (debugging_boolpars) {
        fprintf (aa_stderr, " WORD: stem[%d]='%c%s' expr=%p pmsz=%d\n",
            slot,
            (saveusr.stems[slot][0] == STEM_CH) ?
                '~' : saveusr.stems[slot][0],
            &saveusr.stems[slot][1],
            (void *) tt, tt->pmsz);
        fflush (aa_stderr);
    }
    return tt;
} /* get_stem_truthtab() */
/****************************************/
/* */
/* boolyac_AND */
/* */
/****************************************/
/* Yacc action function for the AND rule.
 * Produces the set INTERSECTION of the two passed truth tables:
 * the permutes present in both. Inputs and result may each be the
 * empty or the universal set, e.g. "(A & B) & ~A" is empty.
 * Both input tables are freed; a new table is returned.
 */
TRUTHTAB *boolyac_AND (TRUTHTAB *tt1, TRUTHTAB *tt2) {
    unsigned char   common [256];
    int             i1 = 0;
    int             i2 = 0;
    int             ncommon = 0;
    TRUTHTAB       *result;

    /* Walk both ascending lists in step, keeping only matches. */
    while (i1 < tt1->pmsz && i2 < tt2->pmsz) {
        unsigned char p1 = tt1->permutes [i1];
        unsigned char p2 = tt2->permutes [i2];

        if (p1 == p2) {
            common [ncommon++] = p1;
            i1++;
            i2++;
        }
        else if (p1 < p2)
            i1++;
        else
            i2++;
    }

    /* The inputs are consumed; the result replaces them. */
    freett (tt1);
    freett (tt2);
    result = creatett (-1, ncommon, common);

    if (debugging_boolpars) {
        fprintf (aa_stderr, " AND: exprs=%p,%p-->expr=%p pmsz=%d\n",
            (void *) tt1, (void *) tt2, (void *) result, result->pmsz);
        fflush (aa_stderr);
    }
    return result;
} /* boolyac_AND() */
/****************************************/
/* */
/* boolyac_OR */
/* */
/****************************************/
/* Yacc action function for the OR rule.
 * Produces the set UNION of the two passed truth tables.
 * Inputs and result may each be the empty or the universal set,
 * e.g. "A | ~A" is universal.
 * Both input tables are freed; a new table is returned.
 */
TRUTHTAB *boolyac_OR (TRUTHTAB *tt1, TRUTHTAB *tt2) {
    unsigned char   merged [256];
    unsigned char  *left = tt1->permutes;
    unsigned char  *right = tt2->permutes;
    int             il = 0;
    int             ir = 0;
    int             nmerged = 0;
    TRUTHTAB       *result;

    /* Single merge pass over two ascending lists. */
    while (il < tt1->pmsz || ir < tt2->pmsz) {
        if (ir >= tt2->pmsz)
            merged [nmerged++] = left [il++];   /* right exhausted */
        else if (il >= tt1->pmsz)
            merged [nmerged++] = right [ir++];  /* left exhausted */
        else if (left [il] < right [ir])
            merged [nmerged++] = left [il++];
        else if (right [ir] < left [il])
            merged [nmerged++] = right [ir++];
        else {
            /* Same permute in both: emit once, advance both. */
            merged [nmerged++] = left [il++];
            ir++;
        }
    }

    /* The inputs are consumed; the result replaces them. */
    freett (tt1);
    freett (tt2);
    result = creatett (-1, nmerged, merged);

    if (debugging_boolpars) {
        fprintf (aa_stderr, " OR: exprs=%p,%p-->expr=%p pmsz=%d\n",
            (void *) tt1, (void *) tt2, (void *) result, result->pmsz);
        fflush (aa_stderr);
    }
    return result;
} /* boolyac_OR() */
/****************************************/
/* */
/* boolyac_NOT */
/* */
/****************************************/
/* Yacc action function for the NOT rule.
 * Produces the set COMPLEMENT of the passed truth table: every one
 * of the 256 possible permutes that is not in the passed table.
 * Input and result may each be the empty or the universal set.
 * The input table is freed; a new table is returned.
 */
TRUTHTAB *boolyac_NOT (TRUTHTAB *oldtt) {
    unsigned char   complement [256];
    int             skip = 0;       /* index of next permute to exclude */
    int             nkept = 0;
    int             value;
    TRUTHTAB       *result;

    for (value = 0; value < 256; value++) {
        /* With an ascending input list, value can never be beyond
         * the next excluded permute, so ">=" here means "equal":
         * drop this value and move on to the next exclusion.
         */
        if (skip < oldtt->pmsz && value >= oldtt->permutes [skip]) {
            skip++;
            continue;
        }
        complement [nkept++] = value;
    }

    freett (oldtt);
    result = creatett (-1, nkept, complement);

    if (debugging_boolpars) {
        fprintf (aa_stderr, " NOT: expr=%p-->expr=%p pmsz=%d\n",
            (void *) oldtt, (void *) result, result->pmsz);
        fflush (aa_stderr);
    }
    return result;
} /* boolyac_NOT() */
/****************************************/
/* */
/* boolyac_COLLOC */
/* */
/****************************************/
/* Yacc action function for the COLLOCATION rule.
 * The set of records satisfying a collocation is generated
 * dynamically, so for parsing purposes the whole expression is
 * treated as one more 'word' with its own slot in saveusr.stems.
 * That pseudo word is formatted "@ssttv[v...]": ss and tt are the
 * ascii indexes of the two collocated words in saveusr.stems and
 * v... is the collocation value. For example "@03005" is the
 * collocation of stem 3 and stem 0 with value 5.
 *
 * Both operands must be single words (stemno >= 0) and must not be
 * the same word. On success the operand tables are freed and the
 * pseudo word's table is returned. On any problem a message is put
 * on the msglist and NULL is returned.
 */
TRUTHTAB *boolyac_COLLOC (
    TRUTHTAB *word1tt,
    int colloc_val,
    TRUTHTAB *word2tt)
{
    char        pseudo_word [DtSrMAXWIDTH_HWORD];
    int         first = word1tt->stemno;
    int         second = word2tt->stemno;
    TRUTHTAB   *result;

    /* An operand that is an expression rather than a word
     * carries a negative stemno.
     */
    if (first < 0 || second < 0) {
        /* Catalog message #3 is also issued from a second place. */
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 3,
            "%s COLLOCATION operator (@) may\n"
            "only be positioned between two words."),
            PROGNAME"371");
        DtSearchAddMessage (msgbuf);
        return NULL;
    }

    if (first == second) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 12,
            "%s Collocation operator is not\n"
            "permitted between identical words."),
            PROGNAME"377");
        DtSearchAddMessage (msgbuf);
        return NULL;
    }

    /* Register the pseudo word; it serves as its own 'original'. */
    sprintf (pseudo_word, COLLOC_STEM_FORMAT, first, second, colloc_val);
    result = get_stem_truthtab (pseudo_word, pseudo_word);
    if (result == NULL)
        return NULL;

    freett (word1tt);
    freett (word2tt);

    if (debugging_boolpars) {
        fprintf (aa_stderr, " COLLOC: exprs=%p,%p-->expr=%p pmsz=%d\n",
            (void *) word1tt, (void *) word2tt, (void *) result, result->pmsz);
        fflush (aa_stderr);
    }
    return result;
} /* boolyac_COLLOC() */
/****************************************/
/* */
/* yyerror */
/* */
/****************************************/
/* Error routine called by the yacc parser in place of the standard one.
 * Any text other than "syntax error" is posted to the msglist as is.
 * For "syntax error": nothing is added if the msglist already holds
 * a message; otherwise the 'words not in database' message is used
 * when invalid words were seen, else a generic invalid-query message.
 */
void yyerror (char *msg) {
    if (strcmp (msg, "syntax error") != 0) {
        DtSearchAddMessage (msg);
        return;
    }

    /* An earlier, more specific message takes precedence. */
    if (DtSearchHasMessages())
        return;

    if (parser_invalid_wordcount > 0) {
        add_syntax_errmsg(6);
        return;
    }

    sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 1,
        "%s Your search string is an invalid\n"
        "boolean query. Please reformulate and try again."),
        PROGNAME"001");
    DtSearchAddMessage (msgbuf);
} /* yyerror() */
/****************************************/
/* */
/* copy_token */
/* */
/****************************************/
/* Subroutine of yylex(). Copies the first toklen chars at tokenp
 * into a zero-terminated buffer owned by this function and returns
 * that buffer. The buffer is static: its contents are good only
 * until the next call, and the caller must not free it.
 *
 * The buffer is (re)allocated whenever it does not exist yet or is
 * too small. Testing for "does not exist yet" separately matters
 * for a zero-length token on the very first call: comparing sizes
 * alone would leave buf NULL and the terminator store below would
 * write through a null pointer.
 */
static char *copy_token (UCHAR *tokenp, size_t toklen)
{
    static char    *buf = NULL;
    static size_t   bufsz = 0;

    if (buf == NULL || toklen > bufsz) {
        free (buf);     /* free(NULL) is a no-op */
        bufsz = toklen + (toklen >> 1);     /* 1.5 times size needed */
        buf = austext_malloc (bufsz + 4, PROGNAME"182", NULL);
    }
    /* strncpy stops reading at a terminating zero in the source */
    strncpy (buf, (char *) tokenp, toklen);
    buf [toklen] = 0;
    return buf;
} /* copy_token() */
/****************************************/
/* */
/* yylex */
/* */
/****************************************/
/* Lexical scanner called by yyparse(). Reads the query text at
 * next_lex_char (initialized by boolean_parse() from usrblk.query)
 * and returns one token per call:
 *   '|' '~' '&' '(' ')'  operator and parenthesis tokens,
 *   COLLOC_TOKEN         "@n" operator, yylval.int_val = n,
 *   WORD_TOKEN           query word, yylval.truthtab = its truth table,
 *   ERROR_TOKEN          invalid word or collocation operator,
 *   0                    end of query.
 * yytext and yyleng are set to the text of the delivered token.
 *
 * Repairs made silently along the way:
 *   - an '&' is generated between two adjacent operands (word, '(',
 *     or '~' following a word or ')'); the triggering character is
 *     NOT consumed, so it is scanned again on the next call,
 *   - surplus ')' are discarded and missing ')' are supplied at
 *     end of query,
 *   - in an all-ANDs query, invalid words and redundant '&' are skipped.
 */
int yylex (void)
{
int retn_token;
PARG parg;
char *stembufp;
char mystembuf [DtSrMAXWIDTH_HWORD + 4];
GET_ANOTHER_TOKEN:
/* Skip white space */
while (ascii_charmap[*next_lex_char] & WHITESPACE)
next_lex_char++;
/* Terminating zero indicates end of query and end of parse.
* Automatically close unbalanced parentheses.
* One ')' is delivered per call until paren_count drops to zero;
* next_lex_char stays on the terminating zero meanwhile.
*/
if (*next_lex_char == 0) {
if (paren_count > 0) {
paren_count--;
retn_token = ')';
yytext = ")";
yyleng = 1;
goto DELIVER_TOKEN;
}
retn_token = 0;
yytext = "";
yyleng = 0;
goto DELIVER_TOKEN;
}
switch (*next_lex_char) {
case '|': /* OR operator */
last_token_was_boolop = TRUE;
retn_token = '|';
yytext = "|";
yyleng = 1;
next_lex_char++;
break;
case '~': /* NOT operator */
if (!last_token_was_boolop) {
/* Generate implied AND between words
* and parenthesized expressions.
* A NOT is not itself boolean; it must
* precede the next word or expression.
* The '~' is left unconsumed for the next call.
*/
last_token_was_boolop = TRUE;
retn_token = '&';
yytext = "&";
yyleng = 1;
break;
}
last_token_was_boolop = TRUE;
retn_token = '~';
yytext = "~";
yyleng = 1;
next_lex_char++;
break;
case '&': /* AND operator */
if (last_token_was_boolop && qry_is_all_ANDs) {
/* Ignore multiple AND operators.
* These might occur if we silently
* discarded some invalid words.
*/
next_lex_char++;
goto GET_ANOTHER_TOKEN;
}
last_token_was_boolop = TRUE;
retn_token = '&';
yytext = "&";
yyleng = 1;
next_lex_char++;
break;
case '(': /* OPEN parentheses */
if (!last_token_was_boolop) {
/* Generate implied AND between words
* and parenthesized expressions.
* The '(' is left unconsumed for the next call.
*/
last_token_was_boolop = TRUE;
retn_token = '&';
yytext = "&";
yyleng = 1;
break;
}
paren_count++;
retn_token = '(';
yytext = "(";
yyleng = 1;
next_lex_char++;
break;
case ')': /* CLOSE parentheses */
/* Just discard excessive right parentheses */
if (--paren_count < 0) {
paren_count = 0;
next_lex_char++;
goto GET_ANOTHER_TOKEN;
}
/* A closed expression acts as an operand for what follows. */
last_token_was_boolop = FALSE;
retn_token = ')';
yytext = ")";
yyleng = 1;
next_lex_char++;
break;
case '@': /* COLLOCATION operator */
/* Collocation token:
* Token is defined as the collocation char followed
* by one or more numeric digits: "@#[#...]".
* Syntactically it's a kind of an AND operator.
* Semantically it's a pseudo word token
* (it will occupy a slot in the stems array).
* The yylval is the integer value following
* the collocation character.
*/
/* Token runs from the '@' up to the next WORD_ENDERS char. */
yyleng = strcspn ((char *) next_lex_char + 1, WORD_ENDERS) + 1;
yytext = copy_token (next_lex_char, yyleng);
next_lex_char += yyleng;
/* Collocation is refused unless the database has ORA_BLOB set. */
if ((usrblk.dblk->dbrec.or_dbaccess & ORA_BLOB) == 0) {
retn_token = ERROR_TOKEN;
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 10,
"%s Collocation searches not available for database '%s'."),
PROGNAME"2567", usrblk.dblk->label);
DtSearchAddMessage (msgbuf);
break;
}
yylval.int_val = atoi (yytext + 1);
if (yylval.int_val <= 0) {
retn_token = ERROR_TOKEN;
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 11,
"%s Collocation operator '%.*s' is invalid.\n"
"Correct format is '@n' where n is greater than zero.") ,
PROGNAME"294", DtSrMAXWIDTH_HWORD, yytext);
DtSearchAddMessage (msgbuf);
break;
}
last_token_was_boolop = TRUE;
retn_token = COLLOC_TOKEN;
break;
default:
/* Presumed word token:
* Token is all text chars until next whitespace,
* next lex token, or end of string.
* Linguistically parse it and optionally stem it.
* The token value is the truth table for one
* word: all permutes with only that word's
* bits turned on. If the word is already
* in the stems array, then the permutes
* position is the word's index in the array.
* If the word is not in the array, it's added.
* If the array is full, then an error is reported.
*/
if (!last_token_was_boolop) {
/* Generate implied AND between words
* and parenthesized expressions.
* The word is left unconsumed for the next call.
*/
last_token_was_boolop = TRUE;
retn_token = '&';
yytext = "&";
yyleng = 1;
break;
}
yyleng = strcspn ((char *) next_lex_char, WORD_ENDERS);
yytext = copy_token (next_lex_char, yyleng);
next_lex_char += yyleng;
/*
* Linguistically parse the token.
* Failure can occur because word is too short
* or too long, it's on the stoplist, etc.
* Setting PA_MSGS causes parser to explain
* invalid words with a msg.
*/
memset (&parg, 0, sizeof(PARG));
parg.dblk = usrblk.dblk;
parg.string = yytext;
/*****if (!qry_is_all_ANDs)********/
parg.flags = PA_MSGS;
stembufp = usrblk.dblk->parser (&parg);
if (debugging_boolpars) {
fprintf (aa_stderr, " lang: '%s' -> '%s'\n",
yytext, (stembufp)? stembufp : "<null>");
fflush (aa_stderr);
}
/*
* If token is not a linguistically valid word,
* one of two things can happen. If the query
* is all_ANDs (most common type) we silently
* ignore the token.
* Otherwise report error and quit now.
*/
if (stembufp == NULL) {
parser_invalid_wordcount++;
if (qry_is_all_ANDs)
goto GET_ANOTHER_TOKEN;
retn_token = ERROR_TOKEN;
/* Add a generic message only if nothing has been posted yet. */
if (!DtSearchHasMessages()) {
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 13,
"%s Word '%.*s' is invalid.") ,
PROGNAME"315", DtSrMAXWIDTH_HWORD, yytext);
DtSearchAddMessage (msgbuf);
}
break;
}
/* A parsed word whose length differs from the token's
* length is rejected as not being a single word.
*/
if (strlen(stembufp) != strlen(yytext)) {
retn_token = ERROR_TOKEN;
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 14,
"%s String '%.*s' is not a single word.") ,
PROGNAME"634", DtSrMAXWIDTH_HWORD, yytext);
DtSearchAddMessage (msgbuf);
break;
}
/*
* If stemming, we must prefix term with
* special stem char in the stems array.
*/
if (usrblk.request == OE_SRCH_STEMS) {
stembufp = usrblk.dblk->stemmer (stembufp, usrblk.dblk);
if (debugging_boolpars) {
fprintf (aa_stderr, " stemer: -> '%s'\n", stembufp);
fflush (aa_stderr);
}
mystembuf[0] = STEM_CH;
strncpy (mystembuf + 1, stembufp, DtSrMAXWIDTH_HWORD);
mystembuf [DtSrMAXWIDTH_HWORD - 1] = 0;
stembufp = mystembuf;
}
/* Load stem into stems arrays and return its truth table.
* A NULL table (stems array full) becomes an ERROR_TOKEN.
*/
if ((yylval.truthtab = get_stem_truthtab (stembufp, yytext))) {
retn_token = WORD_TOKEN;
last_token_was_boolop = FALSE;
}
else
retn_token = ERROR_TOKEN;
break;
} /* switch on *next_lex_char */
DELIVER_TOKEN:
/* Common exit: optionally trace the token, then hand it to yyparse(). */
if (debugging_boolpars) {
fprintf (aa_stderr,
" yylex: op?=%d parct=%d tok#=%d lval=%p%sYYTEXT='%s'\n",
last_token_was_boolop, paren_count,
retn_token, (void *) yylval.truthtab,
(retn_token == COLLOC_TOKEN)? "\t\t" : "\t",
yytext);
fflush (aa_stderr);
}
return retn_token;
} /* yylex() */
/****************************************/
/* */
/* boolean_parse */
/* */
/****************************************/
/* Driver for yyparse(), called from Opera_Engine for boolean searches.
 * Expects usrblk.request == OE_SRCH_STEMS or OE_SRCH_WORDS.
 * If the parse is completely successful (query is valid), outputs
 *	saveusr.stemcount,
 *	saveusr.stems (stemmed if necessary with STEM_CH as first char,
 *	    and phony colloc words with '@' as first char),
 *	usrblk.stems (original unstemmed query terms for err msgs),
 *	final_truthtab,
 *	qry_has_no_NOTs,
 *	qry_is_all_ANDs,
 * and returns TRUE. Truthtab allocation is good until the next call.
 * If the query is empty, fails to parse, can match no record, or
 * would match every record, returns FALSE with err msg(s) on msglist.
 *
 * All truth tables still on ttlist are released before returning,
 * whether or not the parse succeeded.
 */
int boolean_parse (void)
{
    int         i;
    int         parse_failed;
    char       *cptr;
    TRUTHTAB   *tt, *ttnext;

    debugging_boolpars = (usrblk.debug & USRDBG_BOOL);
    if (!msgbuf)
        msgbuf = austext_malloc (300 + DtSrMAXWIDTH_HWORD,
            PROGNAME"255", NULL);

    /* Test for empty query: no string at all, or only whitespace.
     * The char is cast to UCHAR before indexing ascii_charmap so
     * that a character with its high bit set cannot turn into a
     * negative index where plain char is signed.
     */
    cptr = usrblk.query;
    if (cptr != NULL) {
        while (*cptr != 0
                && (ascii_charmap[(UCHAR) *cptr] & WHITESPACE) != 0)
            cptr++;
    }
    if (cptr == NULL || *cptr == 0) {
        /* Message #2 is called in two places */
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 2,
            "%s Query is empty."), PROGNAME"289");
        DtSearchAddMessage (msgbuf);
        return FALSE;
    }

    /* Init globals for yylex and yyparse */
    next_lex_char = (UCHAR *) usrblk.query;
    paren_count = 0;
    yyerror_count = 0;
    last_token_was_boolop = TRUE;
    saveusr.stemcount = 0;
    parser_invalid_wordcount = 0;

    /* Query "is all ANDS" if it has no ORs, NOTs, or COLLOCs.
     * Missing or linguistically invalid words will be silently
     * discarded for all_ANDs queries.
     * Query "has no NOTs" if it has no NOTs.
     * Results from queries without NOTs can be statistically sorted.
     */
    qry_has_no_NOTs = !strchr (usrblk.query, '~');
    qry_is_all_ANDs = !strpbrk (usrblk.query, "|~@");

    if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
        fprintf (aa_stderr,
            "start boolean_parse: stem?=%d allANDs?=%d noNOTs?=%d\n"
            " query: '%s'\n",
            (usrblk.request == OE_SRCH_STEMS),
            qry_is_all_ANDs, qry_has_no_NOTs, usrblk.query);
        fflush (aa_stderr);
    }

    parse_failed = (yyparse() != 0);

    /* Free entire remaining ttlist. This is done before looking
     * at the parse result: a rejected query leaves its partly
     * built truth tables on the list too, and returning first
     * would leak them on every bad query.
     */
    tt = ttlist;
    while (tt) {
        ttnext = tt->next;
        free (tt);
        tt = ttnext;
    }
    ttlist = NULL;

    if (parse_failed)
        return FALSE;

    if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) {
        print_stems (saveusr.stemcount, saveusr.stems,
            PROGNAME"815 end boolean_parse, syntax ok,");
        fprintf (aa_stderr, " permutes=%d:", final_truthtab.pmsz);
        /* Trace at most the first 16 permutes. */
        for (i = 0; i < 16 && i < final_truthtab.pmsz; i++)
            fprintf (aa_stderr, " %02x", final_truthtab.permutes [i]);
        fputc ('\n', aa_stderr);
        fflush (aa_stderr);
    }

    /* Empty truth table: no record can satisfy the query. */
    if (final_truthtab.pmsz <= 0) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 15,
            "%s Your query cannot logically return\n"
            "any records. Please reformulate and try again."),
            PROGNAME"334");
        DtSearchAddMessage (msgbuf);
        return FALSE;
    }

    /* Universal truth table: every record satisfies the query. */
    if (final_truthtab.pmsz >= 256) {
        sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 16,
            "%s Your query will return entire database\n"
            "'%s'. Please reformulate and try again.") ,
            PROGNAME"341", usrblk.dblk->label);
        DtSearchAddMessage (msgbuf);
        return FALSE;
    }
    return TRUE;
} /* boolean_parse() */
#ifdef TESTBOOL /*-----------------------------------------------*/
/* Globals for the TESTBOOL stand-alone test driver.
 * main() below points usrblk.dblk at dblk before parsing.
 */
USRBLK usrblk = { 0 };
DBLK dblk;
SAVEUSR saveusr = { 0 };
/* Debug switches defined in other modules; turned on by the
 * -d command line option in process_user_args().
 */
extern int debugging_teskey;
extern int debugging_paice;
extern int debugging_jpn;
/****************************************/
/* */
/* process_user_args */
/* */
/****************************************/
/* Subroutine of main(). Validates command line arguments and
 * loads the corresponding global variables:
 *	-mx#	dblk.dbrec.or_maxwordsz
 *	-mn#	dblk.dbrec.or_minwordsz
 *	-l#	dblk.dbrec.or_language
 *	-d...	any of t, p, j: debugging_teskey, _paice, _jpn
 * Scanning stops at the first argument that does not start
 * with '-'. If any option was invalid, prints a usage summary
 * and exits with status 2.
 */
static void process_user_args (int argc, char *argv[])
{
    char   *argptr;
    char   *cptr;
    int     bad_arg;
    int     oops = FALSE;

    /* Each pass grabs new parm of "-xxx" format */
    argc--, argv++;
    while (argc > 0) {
        argptr = argv[0];
        if (*argptr != '-')
            break;

        bad_arg = FALSE;
        switch (argptr[1]) {
            case 'm':
                if (argptr[2] == 'x')
                    dblk.dbrec.or_maxwordsz = atoi (argptr + 3);
                else if (argptr[2] == 'n')
                    dblk.dbrec.or_minwordsz = atoi (argptr + 3);
                else
                    bad_arg = TRUE;
                break;

            case 'l':
                dblk.dbrec.or_language = atoi (argptr + 2);
                break;

            case 'd':
                /* Each letter after "-d" selects one debug switch. */
                for (cptr = argptr+2; *cptr != 0; cptr++) {
                    switch (*cptr) {
                        case 't': debugging_teskey = TRUE; break;
                        case 'p': debugging_paice = TRUE; break;
                        case 'j': debugging_jpn = TRUE; break;
                        default:
                            oops = TRUE;
                            fprintf (aa_stderr,
                                "%s Invalid debug option %c.\a\n",
                                PROGNAME"049", *cptr);
                            break;
                    }
                }
                break;

            default:
                bad_arg = TRUE;
                break;
        } /* end switch */

        if (bad_arg) {
            oops = TRUE;
            fprintf (aa_stderr,
                "%s Invalid command line argument '%s'.\a\n",
                PROGNAME"059", argptr);
        }
        argc--, argv++;
    } /* main loop on each arg */

    if (oops) {
        fprintf (aa_stderr,
            "\nUSAGE: %s [options]\n"
            " -mx# maximum word size.\n"
            " -mn# minimum word size.\n"
            " -dtpj Debug: Teskey, Paice, Japanese.\n"
            " -l# language number. Default 0.\n",
            aa_argv0);
        exit(2);
    }
    return;
} /* process_user_args() */
/****************************************/
/* */
/* main */
/* */
/****************************************/
/* Interactive test driver, built only when TESTBOOL is defined.
 * Reads boolean queries from stdin one line at a time, runs
 * boolean_parse() on each with debugging output enabled, and
 * prints any messages the parse produced. A leading '$' selects
 * a stem search instead of a word search. An empty line, "." or
 * "q" ends the program.
 * Returns 0 normally, 2 if load_language() fails.
 */
int main (int argc, char *argv[])
{
    char    linebuf [1024];

    /* Init global variables */
    aa_argv0 = argv[0];
    memset (&usrblk, 0, sizeof(USRBLK));
    usrblk.dblk = &dblk;
    usrblk.debug |= USRDBG_BOOL; /* set debugging_boolpars */
    memset (&dblk, 0, sizeof(DBLK));
    strcpy (dblk.name, "testbool");
    dblk.label = dblk.name;
    dblk.dbrec.or_dbaccess |= ORA_BLOB; /* enable collocations */

    /* Read command line args */
    process_user_args (argc, argv);
    if (!load_language (&dblk, NULL)) {
        fprintf (aa_stderr,
            PROGNAME"140 load_language() failed. Msgs:\n%s\n",
            DtSearchGetMessages());
        return 2;
    }
    fprintf (aa_stderr, " lang=%d minwdsz=%d maxwdsz=%d.\n",
        dblk.dbrec.or_language,
        dblk.dbrec.or_minwordsz,
        dblk.dbrec.or_maxwordsz);

    /* Main loop. Each line is a boolean query. */
    printf ("Enter an AusText boolean query. 'q' or '.' to quit.\n"
        "If first char is '$', words will be stemmed:\n> ");
    fflush (stdout);
    while (fgets (linebuf, sizeof(linebuf), stdin) != NULL) {
        /* Cut the line at its newline, if it has one. fgets()
         * delivers no newline for an over-long line or for a
         * last line ended by EOF, so blindly overwriting the
         * final char would eat a real character (or index -1
         * on an empty string).
         */
        linebuf [strcspn (linebuf, "\n")] = 0;

        /* Quit commands, recognized with or without newline. */
        if (linebuf[0] == 0)
            break;
        if (strcmp (linebuf, ".") == 0 || strcmp (linebuf, "q") == 0)
            break;

        if (linebuf[0] == '$') {
            usrblk.query = linebuf + 1;
            usrblk.request = OE_SRCH_STEMS;
        }
        else {
            usrblk.query = linebuf;
            usrblk.request = OE_SRCH_WORDS;
        }
        if (!boolean_parse())
            puts (PROGNAME"707 boolean_parse() returned FALSE (OE_BAD_QUERY).");
        if (DtSearchHasMessages()) {
            printf ("mmmmm Messages returned to user mmmmmmmmmmmmmmmmmm\n"
                "%s\nmmmmm End of messages to user mmmmmmmmmmmmmmmmmmmm\n",
                DtSearchGetMessages());
            DtSearchFreeMessages();
        }
        printf ("--------------------------------\n> ");
        fflush (stdout);
    } /* main read loop for each query line */
    return 0;
} /* main() */
#endif /* TESTBOOL */
/********************* BOOLPARS.C ********************/