/*
 * CDE - Common Desktop Environment
 *
 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
 *
 * These libraries and programs are free software; you can
 * redistribute them and/or modify them under the terms of the GNU
 * Lesser General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * These libraries and programs are distributed in the hope that
 * they will be useful, but WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with these libraries and programs; if not, write
 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */
/*
 *   COMPONENT_NAME: austext
 *
 *   FUNCTIONS: discard_to_ETX
 *              readchar_ftext
 *              readchar_string
 *
 *   ORIGINS: 27
 *
 *
 *   (C) COPYRIGHT International Business Machines Corp. 1996
 *   All Rights Reserved
 *   Licensed Materials - Property of IBM
 *   US Government Users Restricted Rights - Use, duplication or
 *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
 */
/************************ READCHAR.C *******************************
 * $XConsortium: readchar.c /main/3 1996/05/07 13:47:58 drk $
 * January 1996.
 * Character reading cofunctions for language parsers.
 *
 * $Log$
 * Revision 1.5  1996/03/25  17:01:19  miker
 * Clean up compiler warning.
 *
 * Revision 1.4  1996/03/13  22:59:39  miker
 * Added prolog.  Changed char to UCHAR several places.
 *
 * Revision 1.3  1996/03/05  18:39:34  miker
 * Make *all* char ptrs unsigned.
 *
 * Revision 1.2  1996/03/05  18:08:03  miker
 * Readchar functions return unsigned chars for compatibility
 * with compilers whose default char type is signed.
 *
 * Revision 1.1  1996/02/01  19:20:39  miker
 * Initial revision
 */
#include "SearchP.h"
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#define PROGNAME "READCHAR"
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* readchar_string */
|
|
/* */
|
|
/************************************************/
|
|
/* Generic readchar cofunction for parsers when the
|
|
* text block is a string. Used for example when
|
|
* parsing queries.
|
|
*/
|
|
UCHAR readchar_string (UCHAR *the_string)
|
|
{
|
|
static UCHAR *strp = (UCHAR *) "";
|
|
if (the_string)
|
|
strp = the_string;
|
|
return ((*strp)? *strp++ : 0);
|
|
}
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* discard_to_ETX */
|
|
/* */
|
|
/************************************************/
|
|
/* Called when dtsrload or dtsrindex wants to skip to next
|
|
* .fzk record by reading and discarding all text to either
|
|
* end of record marker or end of file.
|
|
* Usually called after some error condition in the .fzk file,
|
|
* such as recid not found in database, or when blobs not
|
|
* used in dtsrload so they can be discarded.
|
|
* Wraps around readchar_ftext(), which does the actual read
|
|
* and checks for ETX with a read-ahead buffer.
|
|
*/
|
|
void discard_to_ETX (PARG *parg)
|
|
{
|
|
if (!readchar_ftext (parg))
|
|
return;
|
|
while (readchar_ftext (NULL))
|
|
;
|
|
return;
|
|
} /* discard_to_ETX() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* readchar_ftext */
|
|
/* */
|
|
/************************************************/
|
|
/* Returns next char in a text file. Called in 2 different situations:
|
|
* It's a character reader cofunction called from
|
|
* linguistic parser functions for supported languages.
|
|
* It's also called directly from discard_to_ETX() in offline
|
|
* build tools for *all* languages when for some reason the
|
|
* current record being indexed must be discarded all the way
|
|
* to end of text block (ETX).
|
|
* ETX is when etxdelim string detected, or at end-of-file.
|
|
*
|
|
* The first call, which passes parg, is a reset trigger
|
|
* to clear ETX. The globals are then set and used in
|
|
* subsequent calls (passing NULL). This technique is
|
|
* used because it will be called many times in a time
|
|
* critical loop while indexing.
|
|
*
|
|
* Returns '\0' at ETX, and keeps returning '\0'
|
|
* without further reads until the ETX flag is reset.
|
|
* Returns '\0' forever at end-of-file.
|
|
*/
|
|
UCHAR readchar_ftext (PARG *parg)
|
|
{
|
|
static FILE *ftext = NULL;
|
|
static UCHAR *etxdelim = NULL;
|
|
static UCHAR *rabuf = NULL;
|
|
static int ETX_flag = TRUE;
|
|
static int delimsz = 0;
|
|
static int bufcount = 0;
|
|
static int i;
|
|
static UCHAR *head, *tail, *cptr, *rabufend;
|
|
|
|
/* I'm always going to read ahead just enough chars
|
|
* to test the delim string. The string is expected
|
|
* to be small, typically just a few chars.
|
|
* (A single \0 char indicates there is no record
|
|
* delimiter--record ends only at end of file.)
|
|
* I use a circular read ahead buffer with head and tail ptrs.
|
|
* Bufcount contains current number of chars in buf.
|
|
* Head is next file read point, ahead of youngest char in buf.
|
|
* Tail is next char to return, ie oldest char in buf.
|
|
*/
|
|
if (parg) {
|
|
ftext = parg->ftext;
|
|
if (feof(ftext)) {
|
|
ETX_flag = TRUE;
|
|
return 0;
|
|
}
|
|
if (etxdelim) {
|
|
free (etxdelim);
|
|
etxdelim = NULL;
|
|
}
|
|
if (parg->etxdelim)
|
|
etxdelim = (UCHAR *) strdup (parg->etxdelim);
|
|
ETX_flag = FALSE;
|
|
delimsz = (etxdelim)? strlen((char*) etxdelim) : 0;
|
|
if (!rabuf) {
|
|
rabuf = austext_malloc (MAX_ETXDELIM + 2, PROGNAME"479", NULL);
|
|
rabufend = rabuf + MAX_ETXDELIM;
|
|
}
|
|
if (delimsz >= MAX_ETXDELIM) {
|
|
fprintf (aa_stderr, PROGNAME"505 Record delimiter too long.\n");
|
|
DtSearchExit (2);
|
|
}
|
|
head = tail = rabuf;
|
|
bufcount = 0;
|
|
}
|
|
|
|
if (ETX_flag)
|
|
return 0;
|
|
|
|
/* Read chars into read ahead buf until we
|
|
* have enough to compare for etxdelim.
|
|
* If possible, always read in at least one char.
|
|
*/
|
|
while (bufcount == 0 || bufcount < delimsz) {
|
|
if (feof(ftext))
|
|
break;
|
|
if ((i = fgetc (ftext)) == EOF)
|
|
break;
|
|
*head++ = i;
|
|
bufcount++;
|
|
if (head >= rabufend)
|
|
head = rabuf;
|
|
}
|
|
|
|
/* There are now 3 possibilities.
|
|
* (1) If bufcount == 0 we got EOF and there
|
|
* are no chars remaining in buffer, quit now.
|
|
* (2) Most likely bufcount is nonzero and equals delimsz.
|
|
* Do a wrap-around strcmp looking for delim string.
|
|
* (3) If bufcount is positive but less than delimsz,
|
|
* we got EOF before the last record ended.
|
|
* We'll fall through to the code that returns the next
|
|
* char in the buffer, returning the remaining chars one
|
|
* at a time until exhausted.
|
|
* Note this sequence also handles the case where delimsz == 0.
|
|
*/
|
|
if (bufcount <= 0) {
|
|
ETX_flag = TRUE;
|
|
return 0;
|
|
}
|
|
|
|
/* Compare chars in read ahead buf for delim string.
|
|
* (Note that if the compare succeeds, both bufcount
|
|
* and delimsz must be > 0).
|
|
*/
|
|
if (bufcount == delimsz) {
|
|
cptr = tail;
|
|
for (i = 0; i < delimsz; i++) {
|
|
if (etxdelim[i] != *cptr)
|
|
break;
|
|
cptr++;
|
|
if (cptr >= rabufend)
|
|
cptr = rabuf;
|
|
}
|
|
if (i == delimsz) {
|
|
ETX_flag = TRUE;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
/* No ETX yet. Return the oldest char in read ahead buffer. */
|
|
cptr = tail++;
|
|
if (tail >= rabufend)
|
|
tail = rabuf;
|
|
bufcount--;
|
|
return *cptr;
|
|
} /* readchar_ftext */
|
|
|
|
/*************************** READCHAR.C ****************************/
|
|
|