cdesktopenv/cde/lib/DtSearch/isduprec.c

/*
 * CDE - Common Desktop Environment
 *
 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
 *
 * These libraries and programs are free software; you can
 * redistribute them and/or modify them under the terms of the GNU
 * Lesser General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * These libraries and programs are distributed in the hope that
 * they will be useful, but WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with these libraries and programs; if not, write
 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */
/*
 *   COMPONENT_NAME: austext
 *
 *   FUNCTIONS: dump_hashtab
 *		is_duprec
 *		main
 *
 *   ORIGINS: 27
 *
 *
 *   (C) COPYRIGHT International Business Machines Corp. 1993,1995
 *   All Rights Reserved
 *   Licensed Materials - Property of IBM
 *   US Government Users Restricted Rights - Use, duplication or
 *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
 */
/******************* ISDUPREC.C *******************
 * $XConsortium: isduprec.c /main/5 1996/05/07 13:37:35 drk $
 * June 1993.
 * Is_duprec() returns 0 (FALSE) for every record id it is passed
 * unless one is passed that duplicates a previous one,
 * in which case it returns 1 (TRUE).
 * It ensures that duplicate record ids in an .fzk file
 * are not processed by either ravel or borodin.
 * It does it by storing each recid into a hash table and
 * searching the table before storing a new recid.
 * Returns 2 on errors (malloc out of space, etc);
 *
 * Global 'duprec_hashsize' can be changed to any rational value
 * for a hash table size (say 1000 to 30,000) prior to the first call
 * of is_duprec().  It should be roughly => to the total number of
 * different record ids expected to be passed to is_duprec().
 * If initialized to 0 before the first call, that will disable
 * duplicate checking, i.e. is_duprec() will allocate no memory
 * and always return 0.
 *
 * $Log$
 * Revision 2.2  1995/10/25  17:22:48  miker
 * Added prolog.
 *
 * Revision 2.1  1995/09/22  20:56:44  miker
 * Freeze DtSearch 0.1, AusText 2.1.8
 *
 * Revision 1.3  1995/09/05  18:11:45  miker
 * Minor changes so ansi c compilers won't whine.
 */
#include <stdlib.h>
#include <string.h>

#ifdef TEST
#include <stdio.h>
#include <errno.h>
#endif

#define PROGNAME	"ISDUPREC"
#define HASHSIZE	3000L
#define NOT_A_DUP	0
#define IS_A_DUP	1
#define OUT_OF_MEM	2

unsigned long   duprec_hashsize = HASHSIZE;

/************************************************/
/*						*/
/*		    HASHNODE			*/
/*						*/
/************************************************/
/* The hash table is a HASHSIZE array of pointers to these structures.
 * Each pointer is initialized to NULL.
 * Additions are handled by filling in a HASHNODE pointed to
 * by the table pointer.  The 'recid' is NOT a char array of length
 * 1, but a string whose length varies depending on the actual
 * length of the passed record id.  Each hashnode is malloced
 * for exactly the right length.  Collisions are handled by linking
 * additional nodes off of the original one.
 */
typedef struct hash_tag {
    struct hash_tag *link;
    char            recid[2];	/* actual array size varies */
}               HASHNODE;


#ifdef TEST
/************************************************/
/*						*/
/*		  dump_hashtab()		*/
/*						*/
/************************************************/
/* For debugging, prints out all recids in hashtab, skipping empty bkts */
static void     dump_hashtab (HASHNODE ** hashtab)
{
    HASHNODE       *hp, **hpp;
    int             i;
    printf (PROGNAME "67 dump_hashtab(%p):\n", hashtab);
    for (i = 0, hpp = hashtab; i < duprec_hashsize; i++, hpp++) {
	if (*hpp) {
	    printf ("  %4d:", i);
	    fflush (stdout);
	    for (hp = *hpp; hp != NULL; hp = hp->link)
		printf (" '%s'", hp->recid);
	    putchar ('\n');
	    fflush (stdout);
	}
    }
    return;
}  /* dump_hashtab() */

#endif	/* TEST */


/************************************************/
/*						*/
/*		   is_duprec()			*/
/*						*/
/************************************************/
/* Normal return is 0 indicating that passed record id is unique.
 * Also immediately returns 0 if duplicate checking has been
 * turned off by setting global 'duprec_hashsize' to zero.
 * Returns 1 if record id is a duplicate.
 * Returns 2 if out of memory.
 * First call uses 'duprec_hashsize' to create hash table.
 */
int             is_duprec (char *recid)
{
    static HASHNODE **hashtab = NULL;
    static unsigned long primes[10] =
    {1013, 1511, 2203, 3511, 5003, 10007, 15013, 20011, 25013, 30001};

    unsigned long   i;
    char           *cp;
    unsigned long   sum;
    HASHNODE       *hp, **hpp;

    if (duprec_hashsize == 0UL)
	return NOT_A_DUP;

/* Generate hash table at first call only */
    if (hashtab == NULL) {
	/*
	 * adjust table size upward to nearest preordained prime
	 * number
	 */
	for (i = 0; i < 9 && primes[i] < duprec_hashsize; i++);
	duprec_hashsize = primes[i];
#ifdef TEST
	printf (PROGNAME "117 Create hash table, duprec_hashsize set = %ld.\n",
	    duprec_hashsize);
#endif

	hashtab = malloc ((duprec_hashsize + 2L) * sizeof (HASHNODE *));
	if (hashtab == NULL)
	    return OUT_OF_MEM;

	/* init table to all NULL pointers. */
	hpp = hashtab;
	for (i = duprec_hashsize + 2L; i > 0L; i--)
	    *hpp++ = NULL;
    }

    /*****dump_hashtab(hashtab);******/

    /* HASH FUNCTION:  H(recid) = (SUM(i*recid[i])) mod M,
     * where M is table size (prime), and SUM is calculated
     * for i=1 to end of recid.  Multiplying the position by the character
     * value at that position minimizes the influence of identical
     * characters at the beginnings and ends of recids,
     * and also usually yields a number larger than M.
     * Not skipping over the first position (the keytype char) helps
     * efficiently catch recids that are blank after the keytype.
     */
    sum = 0UL;
    i = 1;
    cp = recid;
    while (*cp != 0)
	sum += i++ * (*cp++);
    hpp = &(hashtab[sum % duprec_hashsize]);	/* hpp = head of linked
						 * list */

#ifdef TEST
    printf (PROGNAME "150 is_duprec('%s')=hashtab[%lu]=%p: ",
	recid, sum % duprec_hashsize, *hpp);
    fflush (stdout);
    i = 0;
#endif

    /* Search linked list (if any) for hashnode containing recid */
    for (hp = *hpp; hp != NULL; hp = hp->link) {
#ifdef TEST
	i++;
#endif

	if (strcmp (hp->recid, recid) == 0) {
#ifdef TEST
	    printf ("DUP!@listpos=%d\n", i);
#endif
	    return IS_A_DUP;
	}
	hpp = &hp->link;	/* now hpp = tail of linked list */
    }
#ifdef TEST
    printf ("miss@listlen=%d\n", i);
#endif

    /* Not a duplicate.  Add current recid to hash table. */
    if ((hp = malloc (sizeof (HASHNODE) + strlen (recid) + 2)) == NULL)
	return OUT_OF_MEM;
    strcpy (hp->recid, recid);
    hp->link = NULL;
    /*****hp->link = *hpp;******/
    *hpp = hp;
    return NOT_A_DUP;
}  /* is_duprec() */


#ifdef MAIN
/************************************************/
/*						*/
/*		     main()			*/
/*						*/
/************************************************/
main (int argc, char *argv[])
{
    int             i;
    FILE           *f;
    char            buf[2048];

    if (argc < 2) {
	printf ("USAGE: %s <file> [n]\n"
	    "where file contains list of char strings\n"
	    "and optional n changes hash table size.\n",
	    argv[0]);
	return;
    }
    if ((f = fopen (argv[1], "r")) == NULL) {
	printf ("Can't open %s: %s\n", argv[1], strerror (errno));
	return;
    }
    if (argc >= 3)
	duprec_hashsize = atol (argv[2]);

    while (fgets (buf, sizeof (buf), f) != NULL) {
	buf[sizeof (buf) - 1] = 0;
	i = is_duprec (buf);
	printf ("%s", buf);	/* each buf should end in \n */
	if (i > 1)
	    break;
    }
    return;
}

#endif
/******************* ISDUPREC.C *******************/