/* * CDE - Common Desktop Environment * * Copyright (c) 1993-2012, The Open Group. All rights reserved. * * These libraries and programs are free software; you can * redistribute them and/or modify them under the terms of the GNU * Lesser General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) * any later version. * * These libraries and programs are distributed in the hope that * they will be useful, but WITHOUT ANY WARRANTY; without even the * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public * License along with these libraries and programs; if not, write * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth * Floor, Boston, MA 02110-1301 USA */ /* $XConsortium: boolpars.c /main/5 1996/11/25 18:49:27 drk $ * * (c) Copyright 1996 Digital Equipment Corporation. * (c) Copyright 1996 Hewlett-Packard Company. * (c) Copyright 1996 International Business Machines Corp. * (c) Copyright 1996 Sun Microsystems, Inc. * (c) Copyright 1996 Novell, Inc. * (c) Copyright 1996 FUJITSU LIMITED. * (c) Copyright 1996 Hitachi. */ /* * COMPONENT_NAME: austext * * FUNCTIONS: add_syntax_errmsg * boolean_parse * boolyac_AND * boolyac_COLLOC * boolyac_NOT * boolyac_OR * copy_final_truthtab * copy_token * creatett * freett * get_stem_truthtab * main * process_user_args * yyerror * yylex * * ORIGINS: 27 * * * (C) COPYRIGHT International Business Machines Corp. 1996 * All Rights Reserved * Licensed Materials - Property of IBM * US Government Users Restricted Rights - Use, duplication or * disclosure restricted by GSA ADP Schedule Contract with IBM Corp. */ /********************* BOOLPARS.C ******************** * $Id: boolpars.c /main/5 1996/11/25 18:49:27 drk $ * February 1996. * AusText/DtSearch yacc-based boolean query parser. 
* Converts boolean query into stems array and truth table * for subsequent search. Boolyac.y is the yacc source. * After processing by yacc, it becomes boolyac.c and boolyac.h. * This module contains all the related C source code: yylex, * yacc action functions, and the main AusText driver function, boolean_parse. * Additional information (format of TRUTHTAB) in header file boolpars.h. * * $Log$ * Revision 1.4 1996/03/22 23:12:50 miker * Added string.h header and correctly cast strcspn() calls. * * Revision 1.3 1996/03/20 19:14:30 miker * Enable collocation expressions in stem (type 'S') searches. * * Revision 1.2 1996/03/13 22:35:59 miker * Changed char to UCHAR several places; similar typecasts. * * Revision 1.1 1996/03/05 15:52:06 miker * Initial revision */ #include "SearchE.h" #include #include #include "boolpars.h" #include "boolyac.h" #if (DtSrMAX_STEMCOUNT != 8) #error DtSrMAX_STEMCOUNT is not defined to be 8. #endif #define PROGNAME "BOOLPARS" #define WORD_ENDERS " \t\n\f()|@~&" #define MAX_YYERRORS 4 #define MS_boolpars 28 /****************************************/ /* */ /* GLOBALS */ /* */ /****************************************/ int qry_has_no_NOTs = FALSE; int qry_is_all_ANDs = FALSE; TRUTHTAB final_truthtab = { 0 }; int parser_invalid_wordcount = 0; static int debugging_boolpars = FALSE; static unsigned char *final_permutes = NULL; static int last_token_was_boolop = TRUE; static char *msgbuf = NULL; static UCHAR *next_lex_char = NULL; static int paren_count = 0; static TRUTHTAB *ttlist = NULL; static int yyerror_count = 0; static size_t yyleng; /* same as in lex API */ static char *yytext; /* same as in lex API */ /****************************************/ /* */ /* add_syntax_errmsg */ /* */ /****************************************/ /* Action function called for yacc rules used to trap syntax errors. * Adds error message identified by msgno to user's msglist. 
*/
/* msgno selects one of six fixed texts (1..6); any other value
 * yields the generic "Syntax Error #n" text.  Each case only formats
 * its text into msgbuf; the text is posted once, after the switch.
 */
void            add_syntax_errmsg (int msgno)
{
    switch (msgno) {

    case 1:
	/* Message #2 is called in two places */
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 2,
		"%s Query field is empty."),
	    PROGNAME"086");
	break;

    case 2:
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 5,
		"%s Boolean operators must be positioned\n"
		"between words or expressions. Two sequential words\n"
		"without an operator are interpreted as being separated\n"
		"by the AND operator (&)."),
	    PROGNAME"091");
	break;

    case 3:
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 6,
		"%s Expression in parentheses is missing."),
	    PROGNAME"093");
	break;

    case 4:
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 7,
		"%s NOT operator (~) must be positioned to\n"
		"the left of the word or expression it qualifies."),
	    PROGNAME"098");
	break;

    case 5:
	/* Message #3 is called in two places */
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 3,
		"%s COLLOCATION operator (@) may\n"
		"only be positioned between two words."),
	    PROGNAME"111");
	break;

    case 6:
	/* The only text that also names the current database */
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 4,
		"%s One or more words in your\n"
		"query are not stored in database '%s'."),
	    PROGNAME"089", usrblk.dblk->label);
	break;

    default:
	sprintf (msgbuf,
	    catgets (dtsearch_catd, MS_boolpars, 8,
		"%s Invalid boolean query. Syntax Error #%d."),
	    PROGNAME"100", msgno);
	break;
    }

    DtSearchAddMessage (msgbuf);
}  /* add_syntax_errmsg() */


/****************************************/
/*                                      */
/*               creatett               */
/*                                      */
/****************************************/
/* Constructor for new truth table.
 * Allocates it, inits it, and links it into ttlist.
*/ static TRUTHTAB *creatett (int stemno, int pmsz, unsigned char *permutes) { TRUTHTAB *newtt = austext_malloc (sizeof(TRUTHTAB) + pmsz + 4, PROGNAME"140", NULL); memset (newtt, 0, sizeof(TRUTHTAB)); newtt->stemno = stemno; newtt->pmsz = pmsz; newtt->permutes = (unsigned char *) (newtt + 1); memcpy (newtt->permutes, permutes, pmsz); newtt->next = ttlist; ttlist = newtt; return newtt; } /* creatett() */ /****************************************/ /* */ /* freett */ /* */ /****************************************/ /* Destructor of passed truth table. * Unlinks it from ttlist and frees it. */ static void freett (TRUTHTAB *argtt) { TRUTHTAB *tt; TRUTHTAB **lastlink = &ttlist; for (tt = ttlist; tt; tt = tt->next) { if (tt == argtt) { *lastlink = tt->next; free (tt); break; } lastlink = &tt->next; } return; } /* freett() */ /****************************************/ /* */ /* copy_final_truthtab */ /* */ /****************************************/ /* Copys passed truth table into global final_truthtab. * Returns final_truthtab. */ TRUTHTAB *copy_final_truthtab (TRUTHTAB *tt) { memset (&final_truthtab, 0, sizeof(TRUTHTAB)); if (!final_permutes) final_permutes = austext_malloc (300, PROGNAME"788", NULL); final_truthtab.pmsz = tt->pmsz; final_truthtab.permutes = final_permutes; memcpy (final_permutes, tt->permutes, final_truthtab.pmsz); return &final_truthtab; } /* copy_final_truthtab() */ /****************************************/ /* */ /* get_stem_truthtab */ /* */ /****************************************/ /* Subroutine of yylex. Also used in yacc action functions. * Creates and returns truth table for passed stem. * If stem is new, adds it to saveusr.stems array, and adds * the original query word string to usrblk.stems for msgs. * Returns NULL and posts err msg if array is full * or has other error. 
*/
/* newstem is the string compared against and stored in saveusr.stems;
 * origword (may be NULL) is stored in usrblk.stems for a new stem.
 * Both are truncated to DtSrMAXWIDTH_HWORD - 1 characters when stored.
 */
static TRUTHTAB *get_stem_truthtab (char *newstem, char *origword)
{
    unsigned char   term_permutes [128];
    unsigned char   termbit;
    TRUTHTAB       *tt;
    int             slot;
    int             value;
    int             count;

    /* Look for the stem among those already collected */
    slot = 0;
    while (slot < saveusr.stemcount
	&& strcmp (newstem, saveusr.stems[slot]) != 0)
	slot++;

    /* Not found: claim the next free slot, if there is one */
    if (slot == saveusr.stemcount) {
	if (saveusr.stemcount >= DtSrMAX_STEMCOUNT) {
	    sprintf (msgbuf,
		catgets (dtsearch_catd, MS_boolpars, 9,
		    "%s Too many terms in boolean query."),
		PROGNAME"1513");
	    DtSearchAddMessage (msgbuf);
	    return NULL;
	}
	saveusr.stemcount++;

	strncpy (saveusr.stems[slot], newstem, DtSrMAXWIDTH_HWORD);
	saveusr.stems [slot] [DtSrMAXWIDTH_HWORD - 1] = 0;
	if (origword) {
	    strncpy (usrblk.stems[slot], origword, DtSrMAXWIDTH_HWORD);
	    usrblk.stems [slot] [DtSrMAXWIDTH_HWORD - 1] = 0;
	}
    }

    /* The table for one term holds, in ascending order, every
     * 8-bit permute that has the term's own bit set: half of
     * the 256 possible values, ie 128 permutes.
     */
    termbit = 1 << slot;
    count = 0;
    for (value = 0;  value < 256;  value++) {
	if (value & termbit)
	    term_permutes [count++] = value;
    }
    tt = creatett (slot, 128, term_permutes);

    if (debugging_boolpars) {
	/* A leading STEM_CH is shown as '~' in the trace */
	fprintf (aa_stderr, " WORD: stem[%d]='%c%s' expr=%p pmsz=%d\n",
	    slot,
	    (saveusr.stems[slot][0] == STEM_CH) ? '~' : saveusr.stems[slot][0],
	    &saveusr.stems[slot][1],
	    (void *) tt, tt->pmsz);
	fflush (aa_stderr);
    }
    return tt;
}  /* get_stem_truthtab() */


/****************************************/
/*                                      */
/*             boolyac_AND              */
/*                                      */
/****************************************/
/* Action function for AND expression rule.
 * Returns set INTERSECTION of passed truth tables,
 * ie only the permutes they have in common.
 * Any truth table, input or output, can be the empty or
 * the universal set. For example: "(A & B) & ~A" is empty.
*/ TRUTHTAB *boolyac_AND (TRUTHTAB *tt1, TRUTHTAB *tt2) { TRUTHTAB *newtt; unsigned char new_permutes [256]; int pm1, pm2, newpm; pm1 = pm2 = newpm = 0; while (pm1 < tt1->pmsz && pm2 < tt2->pmsz) { if (tt1->permutes[pm1] < tt2->permutes[pm2]) pm1++; else if (tt1->permutes[pm1] > tt2->permutes[pm2]) pm2++; else { new_permutes [newpm++] = tt1->permutes [pm1]; pm1++; pm2++; } } /* Free old truthtabs, create new one. */ freett (tt1); freett (tt2); newtt = creatett (-1, newpm, new_permutes); if (debugging_boolpars) { fprintf (aa_stderr, " AND: exprs=%p,%p-->expr=%p pmsz=%d\n", (void *) tt1, (void *) tt2, (void *) newtt, newtt->pmsz); fflush (aa_stderr); } return newtt; } /* boolyac_AND() */ /****************************************/ /* */ /* boolyac_OR */ /* */ /****************************************/ /* Action function for OR expression rule. * Returns set UNION of passed truth tables. * Any truth table, input or output, can be the empty or * the universal set. For example: "A | ~A" is universal. */ TRUTHTAB *boolyac_OR (TRUTHTAB *tt1, TRUTHTAB *tt2) { TRUTHTAB *newtt; unsigned char new_permutes [256]; unsigned char *permutes1 = tt1->permutes; unsigned char *permutes2 = tt2->permutes; int pm1, pm2, newpm; pm1 = pm2 = newpm = 0; /* While neither permutes array is exhausted... */ while (pm1 < tt1->pmsz && pm2 < tt2->pmsz) { if (permutes1[pm1] < permutes2[pm2]) new_permutes [newpm++] = permutes1[pm1++]; else if (permutes2[pm2] < permutes1[pm1]) new_permutes [newpm++] = permutes2[pm2++]; else { new_permutes [newpm++] = permutes1[pm1++]; pm2++; } } /* After one or both permutes arrays are exhausted... */ while (pm1 < tt1->pmsz) new_permutes [newpm++] = permutes1[pm1++]; while (pm2 < tt2->pmsz) new_permutes [newpm++] = permutes2[pm2++]; /* Free old truthtabs, create new one. 
*/ freett (tt1); freett (tt2); newtt = creatett (-1, newpm, new_permutes); if (debugging_boolpars) { fprintf (aa_stderr, " OR: exprs=%p,%p-->expr=%p pmsz=%d\n", (void *) tt1, (void *) tt2, (void *) newtt, newtt->pmsz); fflush (aa_stderr); } return newtt; } /* boolyac_OR() */ /****************************************/ /* */ /* boolyac_NOT */ /* */ /****************************************/ /* Action function for NOT expression rule. * Returns set COMPLEMENT of passed truth table, * ie the universal set minus the passed set, * ie all possible permutes except those passed. * Either the old or the new truth table can be * the empty or the universal set. */ TRUTHTAB *boolyac_NOT (TRUTHTAB *oldtt) { TRUTHTAB *newtt; unsigned char new_permutes [256]; int oldpm, newpm; int candidate; oldpm = newpm = 0; for (candidate = 0; candidate < 256; candidate++) { if (oldpm >= oldtt->pmsz || candidate < oldtt->permutes [oldpm]) { new_permutes [newpm++] = candidate; } /* * oldtt not done && candidate == oldtt. * (candidate > oldtt not possible). */ else { oldpm++; } } freett (oldtt); newtt = creatett (-1, newpm, new_permutes); if (debugging_boolpars) { fprintf (aa_stderr, " NOT: expr=%p-->expr=%p pmsz=%d\n", (void *) oldtt, (void *) newtt, newtt->pmsz); fflush (aa_stderr); } return newtt; } /* boolyac_NOT() */ /****************************************/ /* */ /* boolyac_COLLOC */ /* */ /****************************************/ /* Action function for COLLOCATION expression rule. * The record set satisfying a collocation expression is * generated dynamically. At the parse level it is equivalent * to a separate 'word' with its own (undetermined) record set. * So it's given its own slot in saveusr.stems. The word * in saveusr.stems is formatted "@ssttv[v...]" where ss and tt are * ascii numbers that index the original collocated words * in saveusr.stems, and v... is the collocation value integer. 
* For example, "@03005" represents the collocation of stem * number 3 and stem number 0, with collocation value 5. * * Returns NULL and errmsg on msglist if any problems. */ TRUTHTAB *boolyac_COLLOC ( TRUTHTAB *word1tt, int colloc_val, TRUTHTAB *word2tt) { TRUTHTAB *newtt; char wordbuf [DtSrMAXWIDTH_HWORD]; if (word1tt->stemno < 0 || word2tt->stemno < 0) { /* Message #3 is called in two places */ sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 3, "%s COLLOCATION operator (@) may\n" "only be positioned between two words."), PROGNAME"371"); DtSearchAddMessage (msgbuf); return NULL; } if (word1tt->stemno == word2tt->stemno) { sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 12, "%s Collocation operator is not\n" "permitted between identical words."), PROGNAME"377"); DtSearchAddMessage (msgbuf); return NULL; } sprintf (wordbuf, COLLOC_STEM_FORMAT, word1tt->stemno, word2tt->stemno, colloc_val); if ((newtt = get_stem_truthtab (wordbuf, wordbuf)) == NULL) return NULL; freett (word1tt); freett (word2tt); if (debugging_boolpars) { fprintf (aa_stderr, " COLLOC: exprs=%p,%p-->expr=%p pmsz=%d\n", (void *) word1tt, (void *) word2tt, (void *) newtt, newtt->pmsz); fflush (aa_stderr); } return newtt; } /* boolyac_COLLOC() */ /****************************************/ /* */ /* yyerror */ /* */ /****************************************/ /* Replaces standard yacc error routine. */ void yyerror (char *msg) { if (strcmp (msg, "syntax error") == 0) { if (DtSearchHasMessages()) return; else if (parser_invalid_wordcount > 0) add_syntax_errmsg(6); else { sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 1, "%s Your search string is an invalid\n" "boolean query. Please reformulate and try again."), PROGNAME"001"); DtSearchAddMessage (msgbuf); } } else DtSearchAddMessage (msg); return; } /* yyerror() */ /****************************************/ /* */ /* copy_token */ /* */ /****************************************/ /* Subroutine of yylex(). 
Copies passed substring * Into a zero-terminated buffer of its own. * Static buffer good until next call. */ static char *copy_token (UCHAR *tokenp, size_t toklen) { static char *buf = NULL; static size_t bufsz = 0; if (toklen > bufsz) { if (buf) free (buf); bufsz = toklen + (toklen >> 1); /* 1.5 times size needed */ buf = austext_malloc (bufsz + 4, PROGNAME"182", NULL); } strncpy (buf, (char *) tokenp, toklen); buf [toklen] = 0; return buf; } /* copy_token() */ /****************************************/ /* */ /* yylex */ /* */ /****************************************/ /* Delivers tokens to yyparse() from usrblk.query */ int yylex (void) { int retn_token; PARG parg; char *stembufp; char mystembuf [DtSrMAXWIDTH_HWORD + 4]; GET_ANOTHER_TOKEN: /* Skip white space */ while (ascii_charmap[*next_lex_char] & WHITESPACE) next_lex_char++; /* Terminating zero indicates end of query and end of parse. * Automatically close unbalanced parentheses. */ if (*next_lex_char == 0) { if (paren_count > 0) { paren_count--; retn_token = ')'; yytext = ")"; yyleng = 1; goto DELIVER_TOKEN; } retn_token = 0; yytext = ""; yyleng = 0; goto DELIVER_TOKEN; } switch (*next_lex_char) { case '|': /* OR operator */ last_token_was_boolop = TRUE; retn_token = '|'; yytext = "|"; yyleng = 1; next_lex_char++; break; case '~': /* NOT operator */ if (!last_token_was_boolop) { /* Generate implied AND between words * and parenthesized expressions. * A NOT is not itself boolean; it must * precede the next word or expression. */ last_token_was_boolop = TRUE; retn_token = '&'; yytext = "&"; yyleng = 1; break; } last_token_was_boolop = TRUE; retn_token = '~'; yytext = "~"; yyleng = 1; next_lex_char++; break; case '&': /* AND operator */ if (last_token_was_boolop && qry_is_all_ANDs) { /* Ignore multiple AND operators. * These might occur if we silently * discarded some invalid words. 
*/ next_lex_char++; goto GET_ANOTHER_TOKEN; } last_token_was_boolop = TRUE; retn_token = '&'; yytext = "&"; yyleng = 1; next_lex_char++; break; case '(': /* OPEN parentheses */ if (!last_token_was_boolop) { /* Generate implied AND between words * and parenthesized expressions. */ last_token_was_boolop = TRUE; retn_token = '&'; yytext = "&"; yyleng = 1; break; } paren_count++; retn_token = '('; yytext = "("; yyleng = 1; next_lex_char++; break; case ')': /* CLOSE parentheses */ /* Just discard excessive right parentheses */ if (--paren_count < 0) { paren_count = 0; next_lex_char++; goto GET_ANOTHER_TOKEN; } last_token_was_boolop = FALSE; retn_token = ')'; yytext = ")"; yyleng = 1; next_lex_char++; break; case '@': /* COLLOCATION operator */ /* Collocation token: * Token is defined as the collocation char followed * by one or more numeric digits: "@#[#...]". * Syntactically it's a kind of an AND operator. * Semantically it's a pseudo word token * (it will occupy a slot in the stems array). * The yylval is the integer value following * the collocation character. 
*/ yyleng = strcspn ((char *) next_lex_char + 1, WORD_ENDERS) + 1; yytext = copy_token (next_lex_char, yyleng); next_lex_char += yyleng; if ((usrblk.dblk->dbrec.or_dbaccess & ORA_BLOB) == 0) { retn_token = ERROR_TOKEN; sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 10, "%s Collocation searches not available for database '%s'."), PROGNAME"2567", usrblk.dblk->label); DtSearchAddMessage (msgbuf); break; } yylval.int_val = atoi (yytext + 1); if (yylval.int_val <= 0) { retn_token = ERROR_TOKEN; sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 11, "%s Collocation operator '%.*s' is invalid.\n" "Correct format is '@n' where n is greater than zero.") , PROGNAME"294", DtSrMAXWIDTH_HWORD, yytext); DtSearchAddMessage (msgbuf); break; } last_token_was_boolop = TRUE; retn_token = COLLOC_TOKEN; break; default: /* Presumed word token: * Token is all text chars until next whitespace, * next lex token, or end of string. * Linguistically parse it and optionally stem it. * The token value is the truth table for one * word: all permutes with only that word's * bits turned on. If the word is already * in the stems array, then the permutes * position is the word's index in the array. * If the word is not in the array, it's added. * If the array is full, then an error is reported. */ if (!last_token_was_boolop) { /* Generate implied AND between words * and parenthesized expressions. */ last_token_was_boolop = TRUE; retn_token = '&'; yytext = "&"; yyleng = 1; break; } yyleng = strcspn ((char *) next_lex_char, WORD_ENDERS); yytext = copy_token (next_lex_char, yyleng); next_lex_char += yyleng; /* * Linguistically parse the token. * Failure can occur because word is too short * or too long, it's on the stoplist, etc. * Setting PA_MSGS causes parser to explain * invalid words with a msg. 
*/ memset (&parg, 0, sizeof(PARG)); parg.dblk = usrblk.dblk; parg.string = yytext; /*****if (!qry_is_all_ANDs)********/ parg.flags = PA_MSGS; stembufp = usrblk.dblk->parser (&parg); if (debugging_boolpars) { fprintf (aa_stderr, " lang: '%s' -> '%s'\n", yytext, (stembufp)? stembufp : ""); fflush (aa_stderr); } /* * If token is not a linguistically valid word, * one of two things can happen. If the query * is all_ANDs (most common type) we silently * ignore the token. * Otherwise report error and quit now. */ if (stembufp == NULL) { parser_invalid_wordcount++; if (qry_is_all_ANDs) goto GET_ANOTHER_TOKEN; retn_token = ERROR_TOKEN; if (!DtSearchHasMessages()) { sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 13, "%s Word '%.*s' is invalid.") , PROGNAME"315", DtSrMAXWIDTH_HWORD, yytext); DtSearchAddMessage (msgbuf); } break; } if (strlen(stembufp) != strlen(yytext)) { retn_token = ERROR_TOKEN; sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 14, "%s String '%.*s' is not a single word.") , PROGNAME"634", DtSrMAXWIDTH_HWORD, yytext); DtSearchAddMessage (msgbuf); break; } /* * If stemming, we must prefix term with * special stem char in the stems array. */ if (usrblk.request == OE_SRCH_STEMS) { stembufp = usrblk.dblk->stemmer (stembufp, usrblk.dblk); if (debugging_boolpars) { fprintf (aa_stderr, " stemer: -> '%s'\n", stembufp); fflush (aa_stderr); } mystembuf[0] = STEM_CH; strncpy (mystembuf + 1, stembufp, DtSrMAXWIDTH_HWORD); mystembuf [DtSrMAXWIDTH_HWORD - 1] = 0; stembufp = mystembuf; } /* Load stem into stems arrays and return it's truth table. */ if ((yylval.truthtab = get_stem_truthtab (stembufp, yytext))) { retn_token = WORD_TOKEN; last_token_was_boolop = FALSE; } else retn_token = ERROR_TOKEN; break; } /* switch on *next_lex_char */ DELIVER_TOKEN: if (debugging_boolpars) { fprintf (aa_stderr, " yylex: op?=%d parct=%d tok#=%d lval=%p%sYYTEXT='%s'\n", last_token_was_boolop, paren_count, retn_token, (void *) yylval.truthtab, (retn_token == COLLOC_TOKEN)? 
"\t\t" : "\t", yytext); fflush (aa_stderr); } return retn_token; } /* yylex() */ /****************************************/ /* */ /* boolean_parse */ /* */ /****************************************/ /* Called from Opera_Engine for boolean searches. * Driver for yyparse(). * Expects usrblk.request == OE_SRCH_STEMS or OE_SRCH_WORDS. * If parse is completely successful (query is valid), outputs * saveusr.stemcount, * saveusr.stems (stemmed if necessary with STEM_CH as first char, * and phony colloc words with '@' as first char), * usrblk.stems (original unstemmed query terms for err msgs), * final_truthtab, * qry_has_no_NOTs, * qry_is_all_ANDs, * and returns TRUE. Truthtab allocation good until next call. * If parse fails, returns FALSE and err msg(s) on msglist. */ int boolean_parse (void) { int i; char *cptr; TRUTHTAB *tt, *ttnext; debugging_boolpars = (usrblk.debug & USRDBG_BOOL); if (!msgbuf) msgbuf = austext_malloc (300 + DtSrMAXWIDTH_HWORD, PROGNAME"255", NULL); /* Test for empty query */ if (usrblk.query == NULL) { EMPTY_QUERY: /* Message #2 is called in two places */ sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 2, "%s Query is empty."), PROGNAME"289"); DtSearchAddMessage (msgbuf); return FALSE; } for (cptr = usrblk.query; *cptr; cptr++) { if ((ascii_charmap[*cptr] & WHITESPACE) == 0) break; } if (*cptr == 0) goto EMPTY_QUERY; /* Init globals for yylex and yyparse */ next_lex_char = (UCHAR *) usrblk.query; paren_count = 0; yyerror_count = 0; last_token_was_boolop = TRUE; saveusr.stemcount = 0; parser_invalid_wordcount = 0; /* Query "is all ANDS" if it has no ORs, NOTs, or COLLOCs. * Missing or linguistically invalid words will be silently * discarded for all_ANDs queries. * Query "has no NOTs" if it has no NOTs. * Results from queries without NOTs can be statistically sorted. 
*/ qry_has_no_NOTs = !strchr (usrblk.query, '~'); qry_is_all_ANDs = !strpbrk (usrblk.query, "|~@"); if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) { fprintf (aa_stderr, "start boolean_parse: stem?=%d allANDs?=%d noNOTs?=%d\n" " query: '%s'\n", (usrblk.request == OE_SRCH_STEMS), qry_is_all_ANDs, qry_has_no_NOTs, usrblk.query); fflush (aa_stderr); } if (yyparse() != 0) return FALSE; /* Free entire remaining ttlist. Only you * can prevent forest fires and memory leaks. */ tt = ttlist; while (tt) { ttnext = tt->next; free (tt); tt = ttnext; } ttlist = NULL; if (debugging_boolpars || (usrblk.debug & USRDBG_SRCHCMPL)) { print_stems (saveusr.stemcount, saveusr.stems, PROGNAME"815 end boolean_parse, syntax ok,"); fprintf (aa_stderr, " permutes=%d:", final_truthtab.pmsz); for (i=0; i<16; i++) { if (i >= final_truthtab.pmsz) break; fprintf (aa_stderr, " %02x", final_truthtab.permutes [i]); } fputc ('\n', aa_stderr); fflush (aa_stderr); } if (final_truthtab.pmsz <= 0) { sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 15, "%s Your query cannot logically return\n" "any records. Please reformulate and try again."), PROGNAME"334"); DtSearchAddMessage (msgbuf); return FALSE; } if (final_truthtab.pmsz >= 256) { sprintf (msgbuf, catgets(dtsearch_catd, MS_boolpars, 16, "%s Your query will return entire database\n" "'%s'. Please reformulate and try again.") , PROGNAME"341", usrblk.dblk->label); DtSearchAddMessage (msgbuf); return FALSE; } return TRUE; } /* boolean_parse() */ #ifdef TESTBOOL /*-----------------------------------------------*/ USRBLK usrblk = { 0 }; DBLK dblk; SAVEUSR saveusr = { 0 }; extern int debugging_teskey; extern int debugging_paice; extern int debugging_jpn; /****************************************/ /* */ /* process_user_args */ /* */ /****************************************/ /* Subroutine of main(). Validates and loads global * variables with values from command line arguments. 
*/ static void process_user_args (int argc, char *argv[]) { int i; char *argptr; char *cptr; char *src, *targ; int oops = FALSE; /* Each pass grabs new parm of "-xxx" format */ argc--, argv++; while (argc > 0) { argptr = argv[0]; if (*argptr != '-') break; switch (argptr[1]) { case 'm': if (argptr[2] == 'x') dblk.dbrec.or_maxwordsz = atoi (argptr + 3); else if (argptr[2] == 'n') dblk.dbrec.or_minwordsz = atoi (argptr + 3); else goto BAD_ARG; break; case 'l': dblk.dbrec.or_language = atoi (argptr + 2); break; case 'd': for (cptr = argptr+2; *cptr != 0; cptr++) { switch (*cptr) { case 't': debugging_teskey = TRUE; break; case 'p': debugging_paice = TRUE; break; case 'j': debugging_jpn = TRUE; break; default: oops = TRUE; fprintf (aa_stderr, "%s Invalid debug option %c.\a\n", PROGNAME"049", *cptr); break; } } break; BAD_ARG: default: oops = TRUE; fprintf (aa_stderr, "%s Invalid command line argument '%s'.\a\n", PROGNAME"059", argptr); break; } /* end switch */ argc--, argv++; } /* main loop on each arg */ if (oops) { fprintf (aa_stderr, "\nUSAGE: %s [options]\n" " -mx# maximum word size.\n" " -mn# minimum word size.\n" " -dtpj Debug: Teskey, Paice, Japanese.\n" " -l# language number. Default 0.\n", aa_argv0); exit(2); } return; } /* process_user_args() */ /****************************************/ /* */ /* main */ /* */ /****************************************/ int main (int argc, char *argv[]) { int i; int valid_boolpars; char *cptr; char linebuf [1024]; /* Init global variables */ aa_argv0 = argv[0]; memset (&usrblk, 0, sizeof(USRBLK)); usrblk.dblk = &dblk; usrblk.debug |= USRDBG_BOOL; /* set debugging_boolpars */ memset (&dblk, 0, sizeof(DBLK)); strcpy (dblk.name, "testbool"); dblk.label = dblk.name; dblk.dbrec.or_dbaccess |= ORA_BLOB; /* enable collocations */ /* Read command line args */ process_user_args (argc, argv); if (!load_language (&dblk, NULL)) { fprintf (aa_stderr, PROGNAME"140 load_language() failed. 
Msgs:\n%s\n", DtSearchGetMessages()); return 2; } fprintf (aa_stderr, " lang=%d minwdsz=%d maxwdsz=%d.\n", dblk.dbrec.or_language, dblk.dbrec.or_minwordsz, dblk.dbrec.or_maxwordsz); /* Main loop. Each line is a boolean query. */ printf ("Enter an AusText boolean query. 'q' or '.' to quit.\n" "If first char is '$', words will be stemmed:\n> "); fflush (stdout); while (fgets (linebuf, sizeof(linebuf), stdin) != NULL) { linebuf [sizeof(linebuf) - 1] = 0; if (strcmp (linebuf, ".\n") == 0) break; if (strcmp (linebuf, "q\n") == 0) break; if (linebuf[0] == '\n') break; linebuf [strlen(linebuf) - 1] = 0; /* overlay \n */ if (linebuf[0] == '$') { usrblk.query = linebuf + 1; usrblk.request = OE_SRCH_STEMS; } else { usrblk.query = linebuf; usrblk.request = OE_SRCH_WORDS; } if (!boolean_parse()) puts (PROGNAME"707 boolean_parse() returned FALSE (OE_BAD_QUERY)."); if (DtSearchHasMessages()) { printf ("mmmmm Messages returned to user mmmmmmmmmmmmmmmmmm\n" "%s\nmmmmm End of messages to user mmmmmmmmmmmmmmmmmmmm\n", DtSearchGetMessages()); DtSearchFreeMessages(); } printf ("--------------------------------\n> "); fflush (stdout); } /* main read loop for each query line */ return 0; } /* main() */ #endif /* TESTBOOL */ /********************* BOOLPARS.C ********************/