cdesktopenv/cde/programs/dtsr/dtsrkdump.c

540 lines
15 KiB
C

/*
* CDE - Common Desktop Environment
*
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
*
* These libraries and programs are free software; you can
* redistribute them and/or modify them under the terms of the GNU
* Lesser General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* These libraries and programs are distributed in the hope that
* they will be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with these libraries and programs; if not, write
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
/* $XConsortium: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
*
* (c) Copyright 1996 Digital Equipment Corporation.
* (c) Copyright 1996 Hewlett-Packard Company.
* (c) Copyright 1996 International Business Machines Corp.
* (c) Copyright 1996 Sun Microsystems, Inc.
* (c) Copyright 1996 Novell, Inc.
* (c) Copyright 1996 FUJITSU LIMITED.
* (c) Copyright 1996 Hitachi.
*/
/*
* COMPONENT_NAME: austext
*
* FUNCTIONS: count_words
* main
*
* ORIGINS: 27
*
*
* (C) COPYRIGHT International Business Machines Corp. 1994,1996
* All Rights Reserved
* Licensed Materials - Property of IBM
* US Government Users Restricted Rights - Use, duplication or
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
*/
/*********************** DTSRKDUMP.C *************************
* $Id: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
* April 1994.
* Dumps a DtSearch/AusText keyfile to stdout.
* Renamed from auskdump for DtSearch.
*
* $Log$
* Revision 2.3 1996/04/10 21:19:28 miker
* Program renamed from auskdump with minor cleanup.
*
*
* *** Log: auskdump.c,v ***
* Revision 2.2 1995/10/19 20:29:37 miker
* Permit accessing of read-only databases.
* Revision 2.1 1995/09/22 18:55:59 miker
* Freeze DtSearch 0.1, AusText 2.1.8
* Revision 1.11 1995/09/19 21:47:26 miker
* Added explanation of '*' in report.
* Revision 1.10 1995/09/06 14:18:33 miker
* Fixed bug: -p value incorrectly converted to double because
* atof() function prototype was not provided from stdlib.h.
* Revision 1.9 1995/09/01 23:58:57 miker
* Minor name changes for DtSearch.
* Print err msgs when databases fail to open.
* Revision 1.8 1995/05/30 18:40:12 miker
* Print progress dots and some additional dbrec info.
*/
#include "SearchP.h"
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <fcntl.h>
#include <locale.h>
#include "vista.h"
/* Program identifier; concatenated with a numeric suffix to form the
 * location codes passed to the database macros and printed in error
 * messages (e.g. PROGNAME"030").
 */
#define PROGNAME "DTSRKDUMP"
/* Default value of min_threshold; also shown in the usage text. */
#define MIN_THRESHOLD 100L
/* In non-verbose mode one progress dot is printed per this many keys. */
#define KEYS_PER_DOT 1000
/* Message set number passed to CATGETS for every message in this file. */
#define MS_dtsrkdump 25
/*----------------- GLOBALS -------------------*/
/* Shared scratch buffer: receives each key read by KEYREAD and is
 * reused for the formatted timestamp and the " in <path>" text.
 */
char buf[2048];
/* Counter array allocated in main(): indexed by the first key byte for
 * the object report, or by stem/word category for the words report.
 */
static long *counters = NULL; /* allocated array */
/* TRUE when every key should be listed (set by -v, -t or -p). */
static int do_verbose = FALSE;
/* Database address of the current record, filled in by CRGET. */
static DB_ADDR dba;
/* Words are listed in verbose mode only if their addrs value is at
 * least this large (set by -t, or derived from -p in main()).
 */
static long min_threshold = MIN_THRESHOLD;
/* Initialized from dbrec.or_maxdba after the database is opened and
 * updated during the object key scan; reported as the largest object DBA.
 */
static long maxdba = 0L;
/* Database record filled in by austext_dopen() in main(). */
static struct or_dbrec
dbrec;
/****************************************/
/*                                      */
/*             count_words              */
/*                                      */
/****************************************/
/*
 * Walks every key of one word key field and tallies it in the global
 * 'counters' array: keys whose first byte is STEM_CH are counted in
 * counters[index], all other keys in counters[index + 1].
 *
 * index selects the key field and must be 0 (OR_SWORDKEY),
 * 2 (OR_LWORDKEY) or 4 (OR_HWORDKEY).  Any other value prints a
 * program error message and calls DtSearchExit(4).
 *
 * In verbose mode the offset, free and addrs fields of each word
 * record are read, and words whose addrs value is >= min_threshold are
 * printed; a '*' marks words whose addrs value is >= dbrec.or_reccount.
 * In non-verbose mode a progress dot is printed every KEYS_PER_DOT keys.
 */
void count_words (int index)
{
    long        vista_field = 0;
    UCHAR       *ptr;
    /* 'fre' rather than 'free' so the stdlib function is not shadowed. */
    DtSrINT32   offset = 0, fre = 0, addrs = 0;
    int         tabstop;
    long        keycount = 0;
    int         dotcount = 0;

    if (index == 0)
        vista_field = OR_SWORDKEY;
    else if (index == 2)
        vista_field = OR_LWORDKEY;
    else if (index == 4)
        vista_field = OR_HWORDKEY;
    else {
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 1,
                "%s Program Error Abort.\a\n"),
            PROGNAME"030");
        DtSearchExit (4);
    }

    KEYFRST (PROGNAME"36", vista_field, 0);
    while (db_status == S_OKAY) {
        KEYREAD (PROGNAME"48", buf);
        if (buf[0] == STEM_CH)
            (counters[index])++;
        else
            (counters[index + 1])++;

        if (do_verbose) {
            CRGET (PROGNAME"58", &dba, 0);
            switch (index) {
                case 0:
                    CRREAD (PROGNAME"66", OR_SWOFFSET, &offset, 0);
                    CRREAD (PROGNAME"67", OR_SWFREE, &fre, 0);
                    CRREAD (PROGNAME"68", OR_SWADDRS, &addrs, 0);
                    break;
                case 2:
                    CRREAD (PROGNAME"76", OR_LWOFFSET, &offset, 0);
                    CRREAD (PROGNAME"77", OR_LWFREE, &fre, 0);
                    CRREAD (PROGNAME"78", OR_LWADDRS, &addrs, 0);
                    break;
                case 4:
                    CRREAD (PROGNAME"86", OR_HWOFFSET, &offset, 0);
                    CRREAD (PROGNAME"87", OR_HWFREE, &fre, 0);
                    CRREAD (PROGNAME"88", OR_HWADDRS, &addrs, 0);
                    break;
            }
            NTOHL (offset);
            NTOHL (fre);
            NTOHL (addrs);

            if (addrs >= min_threshold) {
                /* Print the quoted key, masking control chars with '~',
                 * then pad so the numeric columns line up.
                 */
                printf (" \"");
                tabstop = 0;
                for (ptr = (UCHAR *) buf; *ptr != 0; ptr++) {
                    putchar ((*ptr >= 32) ? *ptr : '~');
                    tabstop++;
                }
                printf ("\" ");
                while (tabstop++ < 22)
                    putchar (' ');
                /* The casts make each argument match its conversion
                 * specifier regardless of how wide DB_ADDR and
                 * DtSrINT32 are on this platform.
                 */
                printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 2,
                        "%c dba=%d:%-7ld ofs=%-9ld adr=%-6ld fre=%ld\n"),
                    (addrs >= dbrec.or_reccount) ? '*' : ' ',
                    (int) (dba >> 24), (long) (dba & 0xffffff),
                    (long) offset, (long) addrs, (long) fre);
            }
        }       /* end verbose */
        else {  /* !verbose */
            if (++keycount % KEYS_PER_DOT == 0) {
                putchar ('.');
                if (++dotcount % 10 == 0)
                    putchar (' ');
                if (dotcount % 50 == 0) {
                    putchar ('\n');
                    dotcount = 0;
                }
                fflush (stdout);
            }
        }       /* end !verbose dot printing */

        KEYNEXT (PROGNAME"98", vista_field, 0);
    }   /* end word key read loop */

    /* Finish a partially filled line of progress dots. */
    if (dotcount)
        putchar ('\n');
    return;
}  /* count_words() */
/****************************************/
/*                                      */
/*                 main                 */
/*                                      */
/****************************************/
/*
 * Parses the command line, opens the named database read-only, and
 * prints an object key summary (-o) and/or a word key summary (-w).
 *
 * Exits through DtSearchExit() with 0 on success, 2 after printing the
 * usage text (bad or missing arguments), or 3 if the database cannot
 * be opened.
 */
int main (int argc, char *argv[])
{
    int         i;
    int         oops;
    int         dotcount;
    long        keycount;
    long        total;
    char        *ptr;
    int         do_objkeys = FALSE;
    int         do_wordkeys = FALSE;
    char        dbpath[2048];
    char        rcs_revision [8];
    char        dbname[12];
    time_t      now;
    double      percent = 0.0;
    int         listing_most_words = FALSE;
    static char *word_labels[6] =
    {
        "Short Stems = %8ld\n", "Short Words = %8ld\n",
        "Long Stems = %8ld\n", "Long Words = %8ld\n",
        "Huge Stems = %8ld\n", "Huge Words = %8ld\n"
    };

    aa_argv0 = argv[0];
    time (&now);
    /* Field width keeps the token within rcs_revision[8]. */
    sscanf ("$Revision: /main/3 $", "%*s %7s", rcs_revision);

    setlocale (LC_ALL, "");
    dtsearch_catd = CATOPEN(FNAME_DTSRCAT, 0);

    strftime (buf, sizeof (buf), "%m/%d/%Y, %I:%M %p",
        localtime (&now));
    printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 3,
            "%s %s, engine %s. %s.\n"),
        aa_argv0, rcs_revision, AUSAPI_VERSION, buf);

    if (argc <= 1) {
PRINT_USAGE:
        /* MIN_THRESHOLD is a long constant; cast to match "%d". */
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 4,
"\nUSAGE: %s -o|w|ow [-v] [-t<N> | -p<N>] dbname\n"
" Reads DtSearch key files and prints summary report.\n"
" -o Keys examined are OBJECT record keys.\n"
" -w Keys examined are inverted index WORDS.\n"
" -v VERBOSE mode, lists every key.\n"
" -t<N> Threshold. Sets w and v options, and lists only words\n"
" with >= <N> addresses. All words will be listed if <N> = 1.\n"
" -p<N> Another threshold. Same as -t except <N> is percent\n"
" of the entire database (<N> may include a decimal point).\n"
" For example -p99.9 prints out every word that occurs\n"
" in 99.9%% or more of the records--an excellent way to find\n"
" candidates for the stop list.\n"
" If w and v are set without threshold, default is -t%d.\n"
" <dbname> 1 - 8 character database name with optional path prefix.\n")
            ,aa_argv0
            ,(int) MIN_THRESHOLD
            );
        DtSearchExit (2);
    }
    /* parse options */
    else {      /* argc >= 2 */
        for (;;) {
            /* each pass grabs new token with "-xxx" format */
            --argc;
            ++argv;
            if (argc <= 0)
                break;  /* no more tokens of any kind */
            ptr = argv[0];
            if (*ptr != '-')
                break;  /* no more option tokens */

            /* examine each char in this -xxx token */
            while (*(++ptr) != 0) {
                switch (*ptr) {
                    case 'o':
                        do_objkeys = TRUE;
                        break;

                    case 'w':
                        do_wordkeys = TRUE;
                        break;

                    case 'v':
                        do_verbose = TRUE;
                        break;

                    case 'p':
                        do_verbose = TRUE;
                        do_wordkeys = TRUE;
                        percent = atof (ptr + 1);
                        if (percent <= 0.0 || percent > 100.0) {
                            fprintf (stderr,
                                CATGETS(dtsearch_catd, MS_dtsrkdump, 5,
                                    "%s Invalid percent value %lf.\a\n"),
                                PROGNAME"195", percent);
                            goto PRINT_USAGE;
                        }
                        /* The rest of the token was the number. */
                        ptr[1] = 0;     /* terminate parse */
                        break;

                    case 't':
                        do_verbose = TRUE;
                        do_wordkeys = TRUE;
                        if ((min_threshold = atol (ptr + 1)) <= 0L) {
                            fprintf (stderr,
                                CATGETS(dtsearch_catd, MS_dtsrkdump, 53,
                                    "%s Invalid threshold value.\a\n"),
                                PROGNAME"198");
                            goto PRINT_USAGE;
                        }
                        /* The rest of the token was the number. */
                        ptr[1] = 0;     /* terminate parse */
                        break;

                    default:
                        fprintf (stderr,
                            CATGETS(dtsearch_catd, MS_dtsrkdump, 55,
                                "%s Unknown command line argument '%c'.\a\n"),
                            PROGNAME"278", *ptr);
                        goto PRINT_USAGE;
                }       /* end switch */
            }   /* end while-loop for each char of -xxx token */
        }       /* end for-loop for each -xxx token */
    }   /* end of options parse altogether */

    /* Report every argument problem before showing the usage text. */
    oops = FALSE;
    if (argc <= 0) {
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 56,
                "%s Missing required database name.\a\n"),
            PROGNAME"267");
        oops = TRUE;
    }
    if (!do_wordkeys && !do_objkeys) {
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 57,
                "%s Either -o or -w must be specified.\a\n"),
            PROGNAME"271");
        oops = TRUE;
    }
    if (oops)
        goto PRINT_USAGE;

    /* Database name may have a long path prefix.
     * If so, we need to segregate the two.
     * Set 'ptr' to just the 8 char dictionary name by moving
     * it backwards until first non-alphanumeric character
     * (such as a ":" in the dos drive id or a slash between directories),
     * or to the beginning of string.
     */
    strncpy (dbpath, argv[0], sizeof (dbpath));
    dbpath[sizeof (dbpath) - 1] = 0;
    /* Scan with ptr[-1] so the pointer never moves before dbpath, and
     * cast to unsigned char because isalnum() is undefined for negative
     * char values.
     */
    ptr = dbpath + strlen (dbpath);
    while (ptr > dbpath && isalnum ((unsigned char) ptr[-1]))
        ptr--;

    /* test for valid database name */
    i = (int) strlen (ptr);
    if (i < 1 || i > 8) {
        fprintf (stderr, CATGETS(dtsearch_catd, MS_dtsrkdump, 58,
                "%s Invalid database name '%s'.\a\n"),
            PROGNAME"297", ptr);
        goto PRINT_USAGE;
    }
    strcpy (dbname, ptr);       /* fits: length verified to be <= 8 */
    *ptr = 0;   /* truncate dbname off of full path/dbname */

    /* Open database in read-only mode. */
    db_oflag = O_RDONLY;
    if (!austext_dopen (dbname, dbpath, NULL, 0, &dbrec)) {
        fprintf (stderr, "%s\n", DtSearchGetMessages());
        DtSearchExit (3);
    }
    maxdba = dbrec.or_maxdba;
    /* Casts make each argument match its conversion specifier. */
    printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 60,
            "%s: '%s' reccount=%ld maxdba=%ld recslots=%hd minw=%hd maxw=%hd\n"),
        aa_argv0, dbname, (long) dbrec.or_reccount,
        (long) dbrec.or_maxdba, (int) dbrec.or_recslots,
        (int) dbrec.or_minwordsz, (int) dbrec.or_maxwordsz);

    /* Adjust threshold if necessary */
    if (percent > 0.0)
        min_threshold = (long)
            ((float) percent * (float) dbrec.or_reccount / 100.0);
    if (min_threshold > dbrec.or_reccount)
        min_threshold = dbrec.or_reccount;

    if (do_wordkeys && do_verbose) {
        if (min_threshold > 1 && min_threshold < dbrec.or_reccount) {
            printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 70,
                    "%s Will only list words occurring "
                    "in %ld or more records.\n"),
                aa_argv0, min_threshold);
            listing_most_words =
                (float) min_threshold / (float) dbrec.or_reccount > .90;
        }
        else {
            printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 80,
                    "%s: Listing all words in database.\n"),
                aa_argv0);
            listing_most_words = TRUE;
        }
    }

    if (do_objkeys) {
        /*
         * Allocate and initialize an array of keytype counters, one for
         * each possible ascii keytype char (256).
         */
        counters = austext_malloc (258 * sizeof(long), PROGNAME"113", NULL);
        memset (counters, 0, 258 * sizeof(long));
        dotcount = 0;
        keycount = 0;

        KEYFRST (PROGNAME"111", OR_OBJKEY, 0);
        while (db_status == S_OKAY) {
            KEYREAD (PROGNAME"288", buf);
            /* Index as unsigned char: where plain char is signed, a
             * key byte >= 0x80 would otherwise be a negative index.
             */
            (counters[(unsigned char) buf[0]])++;
            CRGET (PROGNAME"251", &dba, 0);
            /* Keep only the low 24 bits so that the stored maximum is
             * comparable with the masked value tested on later passes.
             */
            if (maxdba < (dba & 0xffffff))
                maxdba = dba & 0xffffff;

            if (do_verbose) {
                /* Mark control and nonascii chars with a period. */
                i = 0;
                putchar ('\"');
                for (ptr = buf; *ptr != 0; ptr++) {
                    unsigned char keych = (unsigned char) *ptr;

                    putchar ((keych < 32 || keych >= 127) ? '.' : keych);
                    i++;
                }
                printf ("\" ");
                while (i++ < DtSrMAX_DB_KEYSIZE)
                    putchar (' ');
                printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 100,
                        "dba x%08lx, %6ld\n"),
                    (unsigned long) dba, (long) dba);
            }   /* end verbose */
            else {      /* !verbose */
                if (++keycount % KEYS_PER_DOT == 0) {
                    putchar ('.');
                    if (++dotcount % 10 == 0)
                        putchar (' ');
                    if (dotcount % 50 == 0) {
                        putchar ('\n');
                        dotcount = 0;
                    }
                    fflush (stdout);
                }
            }   /* end !verbose dot printing */

            KEYNEXT (PROGNAME"291", OR_OBJKEY, 0);
        }       /* end object key read loop */

        /* Print objkey summary report */
        if (dotcount)
            putchar ('\n');
        if (dbpath[0] == 0)
            buf[0] = 0;
        else
            /* dbpath is as large as buf, so bound the write. */
            snprintf (buf, sizeof (buf),
                CATGETS(dtsearch_catd, MS_dtsrkdump, 110,
                    " in %s"), dbpath);
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 120,
                "Object Summary for '%s'%s:\n"), dbname, buf);
        puts (CATGETS(dtsearch_catd, MS_dtsrkdump, 130,
                "Object Count by Keytypes:"));
        total = 0L;
        for (i = 0; i < 256; i++) {
            if (counters[i] > 0L) {
                total += counters[i];
                if (i > 32 && i < 127)
                    printf (" '%c' %6ld\n", i, counters[i]);
                else
                    printf (" x%02x %6ld\n", i, counters[i]);
            }
        }
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 160,
                "TOTAL Objects Count = %ld\n"), total);
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 170,
                "Largest Object DBA = %ld\n"), maxdba);
        free (counters);
    }   /* end do_objkeys */

    if (do_wordkeys) {
        if (listing_most_words)
            printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 180,
                    "%s: * Words marked with asterisk occur in every record.\n"),
                aa_argv0);
        /*
         * Allocate and initialize word and stem counters. First is for
         * short stems (those beginning with STEM_CH), next is for short
         * words (everything else). Next are for long stems, long words,
         * huge stems, and huge words (6 in all).
         */
        counters = austext_malloc (8 * sizeof (long), PROGNAME"113", NULL);
        memset (counters, 0, 8 * sizeof (long));  /* zero all allocated slots */
        count_words (0);        /* short */
        count_words (2);        /* long */
        count_words (4);        /* huge */

        /* print wordkey summary report */
        if (do_objkeys)
            putchar ('\n');     /* separate from last report */
        if (dbpath[0] == 0)
            buf[0] = 0;
        else
            /* dbpath is as large as buf, so bound the write. */
            snprintf (buf, sizeof (buf),
                CATGETS(dtsearch_catd, MS_dtsrkdump, 110,
                    " in %s"), dbpath);
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 200,
                "Words Summary for '%s'%s:\n"), dbname, buf);
        total = 0L;
        for (i = 0; i < 6; i++) {
            printf (word_labels[i], counters[i]);
            total += counters[i];
        }
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 210,
                "TOTAL Words Count = %ld\n"), total);
        free (counters);
    }   /* end do_wordkeys */

    DtSearchExit (0);
}  /* main() */
/*********************** DTSRKDUMP.C *************************/