/*
 * CDE - Common Desktop Environment
 *
 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
 *
 * These libraries and programs are free software; you can
 * redistribute them and/or modify them under the terms of the GNU
 * Lesser General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * These libraries and programs are distributed in the hope that
 * they will be useful, but WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with these libraries and programs; if not, write
 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */
/* $XConsortium: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
 *
 * (c) Copyright 1996 Digital Equipment Corporation.
 * (c) Copyright 1996 Hewlett-Packard Company.
 * (c) Copyright 1996 International Business Machines Corp.
 * (c) Copyright 1996 Sun Microsystems, Inc.
 * (c) Copyright 1996 Novell, Inc.
 * (c) Copyright 1996 FUJITSU LIMITED.
 * (c) Copyright 1996 Hitachi.
 */
/*
 * COMPONENT_NAME: austext
 *
 * FUNCTIONS: count_words
 *            main
 *
 * ORIGINS: 27
 *
 *
 * (C) COPYRIGHT International Business Machines Corp. 1994,1996
 * All Rights Reserved
 * Licensed Materials - Property of IBM
 * US Government Users Restricted Rights - Use, duplication or
 * disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
 */
/*********************** DTSRKDUMP.C *************************
 * $Id: dtsrkdump.c /main/3 1996/09/23 21:03:37 cde-ibm $
 * April 1994.
 * Dumps a DtSearch/AusText keyfile to stdout.
 * Renamed from auskdump for DtSearch.
 *
 * $Log$
 * Revision 2.3 1996/04/10 21:19:28 miker
 * Program renamed from auskdump with minor cleanup.
 *
 *
 * *** Log: auskdump.c,v ***
 * Revision 2.2 1995/10/19 20:29:37 miker
 * Permit accessing of read-only databases.
 * Revision 2.1 1995/09/22 18:55:59 miker
 * Freeze DtSearch 0.1, AusText 2.1.8
 * Revision 1.11 1995/09/19 21:47:26 miker
 * Added explanation of '*' in report.
 * Revision 1.10 1995/09/06 14:18:33 miker
 * Fixed bug: -p value incorrectly converted to double because
 * atof() function prototype was not provided from stdlib.h.
 * Revision 1.9 1995/09/01 23:58:57 miker
 * Minor name changes for DtSearch.
 * Print err msgs when databases fail to open.
 * Revision 1.8 1995/05/30 18:40:12 miker
 * Print progress dots and some additional dbrec info.
 */
#include "SearchP.h"
/* Standard headers for the library calls used below: isalnum (ctype),
 * O_RDONLY (fcntl), setlocale (locale), printf/snprintf (stdio),
 * atof/atol/free (stdlib), strlen/memset (string), time/strftime (time).
 */
#include <ctype.h>
#include <fcntl.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "vista.h"

#define PROGNAME        "DTSRKDUMP"
#define MIN_THRESHOLD   100L
#define KEYS_PER_DOT    1000
#define MS_dtsrkdump    25

/*----------------- GLOBALS -------------------*/
char            buf[2048];
static long    *counters = NULL;        /* allocated array */
static int      do_verbose = FALSE;
static DB_ADDR  dba;
static long     min_threshold = MIN_THRESHOLD;
static long     maxdba = 0L;
static struct or_dbrec dbrec;

/****************************************/
/*                                      */
/*          print_progress_dot          */
/*                                      */
/****************************************/
/* Counts one more key in *keycount and, on every KEYS_PER_DOT'th key,
 * prints a progress dot to stdout. A space is added after every 10th
 * dot and a newline after every 50th, at which point *dotcount is
 * reset to zero. Callers use a nonzero *dotcount afterwards to know
 * that a partial line of dots still needs a terminating newline.
 */
static void     print_progress_dot (long *keycount, int *dotcount)
{
    if (++(*keycount) % KEYS_PER_DOT != 0)
        return;

    putchar ('.');
    if (++(*dotcount) % 10 == 0)
        putchar (' ');
    if (*dotcount % 50 == 0) {
        putchar ('\n');
        *dotcount = 0;
    }
    fflush (stdout);
}

/****************************************/
/*                                      */
/*              count_words             */
/*                                      */
/****************************************/
/* Walks every key of one word key field and tallies it in the global
 * 'counters' array, which the caller must have allocated and zeroed.
 *
 * index selects the key field and the pair of counters used:
 *      0 = OR_SWORDKEY, 2 = OR_LWORDKEY, 4 = OR_HWORDKEY.
 * Keys whose first byte is STEM_CH are counted in counters[index],
 * all other keys in counters[index + 1]. Any other index value prints
 * a program error message and calls DtSearchExit(4).
 *
 * In verbose mode each key whose address count is >= min_threshold is
 * printed with its dba, offset, address count and free count; a '*'
 * marks keys whose address count is >= dbrec.or_reccount.
 * In non-verbose mode progress dots are printed instead.
 */
void            count_words (int index)
{
    long            vista_field = 0;
    UCHAR          *ptr;
    DtSrINT32       offset = 0;
    DtSrINT32       nfree = 0;  /* was named 'free', shadowing free() */
    DtSrINT32       addrs = 0;
    int             tabstop;
    long            keycount = 0;
    int             dotcount = 0;

    if (index == 0)
        vista_field = OR_SWORDKEY;
    else if (index == 2)
        vista_field = OR_LWORDKEY;
    else if (index == 4)
        vista_field = OR_HWORDKEY;
    else {
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 1,
                "%s Program Error Abort.\a\n"),
            PROGNAME"030");
        DtSearchExit (4);
    }

    KEYFRST (PROGNAME"36", vista_field, 0);
    while (db_status == S_OKAY) {
        KEYREAD (PROGNAME"48", buf);
        if (buf[0] == STEM_CH)
            (counters[index])++;
        else
            (counters[index + 1])++;

        if (do_verbose) {
            CRGET (PROGNAME"58", &dba, 0);
            switch (index) {
                case 0:
                    CRREAD (PROGNAME"66", OR_SWOFFSET, &offset, 0);
                    CRREAD (PROGNAME"67", OR_SWFREE, &nfree, 0);
                    CRREAD (PROGNAME"68", OR_SWADDRS, &addrs, 0);
                    break;
                case 2:
                    CRREAD (PROGNAME"76", OR_LWOFFSET, &offset, 0);
                    CRREAD (PROGNAME"77", OR_LWFREE, &nfree, 0);
                    CRREAD (PROGNAME"78", OR_LWADDRS, &addrs, 0);
                    break;
                case 4:
                    CRREAD (PROGNAME"86", OR_HWOFFSET, &offset, 0);
                    CRREAD (PROGNAME"87", OR_HWFREE, &nfree, 0);
                    CRREAD (PROGNAME"88", OR_HWADDRS, &addrs, 0);
                    break;
                default:
                    /* unreachable: index was validated above */
                    break;
            }
            /* NOTE(review): NTOHL presumably converts each field from
             * network to host byte order in place -- confirm in SearchP.h.
             */
            NTOHL (offset);
            NTOHL (nfree);
            NTOHL (addrs);

            if (addrs >= min_threshold) {
                printf (" \"");
                tabstop = 0;
                /* control characters are shown as '~' */
                for (ptr = (UCHAR *) buf; *ptr != 0; ptr++) {
                    putchar ((*ptr >= 32) ? *ptr : '~');
                    tabstop++;
                }
                printf ("\" ");
                while (tabstop++ < 22)
                    putchar (' ');
                /* Casts make each argument match its conversion
                 * specifier regardless of how DB_ADDR and DtSrINT32
                 * are typedef'd on this platform.
                 */
                printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 2,
                        "%c dba=%d:%-7ld ofs=%-9ld adr=%-6ld fre=%ld\n"),
                    (addrs >= dbrec.or_reccount) ? '*' : ' ',
                    (int) (dba >> 24),
                    (long) (dba & 0xffffff),
                    (long) offset,
                    (long) addrs,
                    (long) nfree);
            }
        }       /* end verbose */
        else {  /* !verbose */
            print_progress_dot (&keycount, &dotcount);
        }       /* end !verbose dot printing */

        KEYNEXT (PROGNAME"98", vista_field, 0);
    }   /* end object key read loop */

    /* finish a partially filled line of progress dots */
    if (dotcount)
        putchar ('\n');
    return;
}       /* count_words() */

/****************************************/
/*                                      */
/*                 main                 */
/*                                      */
/****************************************/
/* Parses the command line, opens the named database read-only and
 * prints an object key summary (-o) and/or a word key summary (-w).
 *
 * Exit codes passed to DtSearchExit: 0 on success, 2 after printing
 * the usage text (missing or invalid arguments), 3 if the database
 * cannot be opened.
 * NOTE(review): DtSearchExit is assumed not to return -- confirm.
 */
int             main (int argc, char *argv[])
{
    int             i;
    int             oops;
    int             dotcount = 0;
    long            keycount = 0;
    long            total;
    char           *ptr;
    int             do_objkeys = FALSE;
    int             do_wordkeys = FALSE;
    char            dbpath[2048];
    char            rcs_revision[8] = "";
    char            dbname[12];
    time_t          now;
    double          percent = 0.0;
    int             listing_most_words = FALSE;
    static char    *word_labels[6] = {
        "Short Stems = %8ld\n",
        "Short Words = %8ld\n",
        "Long Stems = %8ld\n",
        "Long Words = %8ld\n",
        "Huge Stems = %8ld\n",
        "Huge Words = %8ld\n"
    };

    aa_argv0 = argv[0];
    time (&now);
    /* Field width keeps an expanded RCS keyword from overrunning
     * the 8 byte revision buffer.
     */
    sscanf ("$Revision: /main/3 $", "%*s %7s", rcs_revision);
    setlocale (LC_ALL, "");
    dtsearch_catd = CATOPEN(FNAME_DTSRCAT, 0);

    strftime (buf, sizeof (buf), "%m/%d/%Y, %I:%M %p", localtime (&now));
    printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 3,
            "%s %s, engine %s. %s.\n"),
        aa_argv0, rcs_revision, AUSAPI_VERSION, buf);

    if (argc <= 1) {
PRINT_USAGE:
        /* MIN_THRESHOLD is a long constant but the message uses %d,
         * so it is narrowed explicitly to match the specifier.
         */
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 4,
                "\nUSAGE: %s -o|w|ow [-v] [-t<N> | -p<N>] dbname\n"
                " Reads DtSearch key files and prints summary report.\n"
                " -o Keys examined are OBJECT record keys.\n"
                " -w Keys examined are inverted index WORDS.\n"
                " -v VERBOSE mode, lists every key.\n"
                " -t<N> Threshold. Sets w and v options, and lists only words\n"
                " with >= <N> addresses. All words will be listed if <N> = 1.\n"
                " -p<N> Another threshold. Same as -t except <N> is percent\n"
                " of the entire database (<N> may include a decimal point).\n"
                " For example -p99.9 prints out every word that occurs\n"
                " in 99.9%% or more of the records--an excellent way to find\n"
                " candidates for the stop list.\n"
                " If w and v are set without threshold, default is -t%d.\n"
                " <dbname> 1 - 8 character database name with optional path prefix.\n")
            ,aa_argv0
            ,(int) MIN_THRESHOLD
            );
        DtSearchExit (2);
    }

    /* parse options */
    else {      /* argc >= 2 */
        for (;;) {
            /* each pass grabs new token with "-xxx" format */
            --argc;
            ++argv;
            if (argc <= 0)
                break;          /* no more tokens of any kind */
            ptr = argv[0];
            if (*ptr != '-')
                break;          /* no more option tokens */

            /* examine each char in this -xxx token */
            while (*(++ptr) != 0) {
                switch (*ptr) {
                    case 'o':
                        do_objkeys = TRUE;
                        break;

                    case 'w':
                        do_wordkeys = TRUE;
                        break;

                    case 'v':
                        do_verbose = TRUE;
                        break;

                    case 'p':
                        do_verbose = TRUE;
                        do_wordkeys = TRUE;
                        percent = atof (ptr + 1);
                        if (percent <= 0.0 || percent > 100.0) {
                            fprintf (stderr,
                                CATGETS(dtsearch_catd, MS_dtsrkdump, 5,
                                    "%s Invalid percent value %lf.\a\n"),
                                PROGNAME"195", percent);
                            goto PRINT_USAGE;
                        }
                        /* the rest of the token was the number */
                        ptr[1] = 0;     /* terminate parse */
                        break;

                    case 't':
                        do_verbose = TRUE;
                        do_wordkeys = TRUE;
                        if ((min_threshold = atol (ptr + 1)) <= 0L) {
                            fprintf (stderr,
                                CATGETS(dtsearch_catd, MS_dtsrkdump, 53,
                                    "%s Invalid threshold value.\a\n"),
                                PROGNAME"198");
                            goto PRINT_USAGE;
                        }
                        /* the rest of the token was the number */
                        ptr[1] = 0;     /* terminate parse */
                        break;

                    default:
                        fprintf (stderr,
                            CATGETS(dtsearch_catd, MS_dtsrkdump, 55,
                                "%s Unknown command line argument '%c'.\a\n"),
                            PROGNAME"278", *ptr);
                        goto PRINT_USAGE;
                }       /* end switch */
            }   /* end while-loop for each char of -xxx token */
        }       /* end for-loop for each -xxx token */
    }   /* end of options parse altogether */

    oops = FALSE;
    if (argc <= 0) {
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 56,
                "%s Missing required database name.\a\n"),
            PROGNAME"267");
        oops = TRUE;
    }
    if (!do_wordkeys && !do_objkeys) {
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 57,
                "%s Either -o or -w must be specified.\a\n"),
            PROGNAME"271");
        oops = TRUE;
    }
    if (oops)
        goto PRINT_USAGE;

    /* Database name may have a long path prefix.
     * If so, we need to segregate the two.
     * Set 'ptr' to just the 8 char dictionary name by moving
     * it backwards until first non-alphanumeric character
     * (such as a ":" in the dos drive id or a slash between directories),
     * or to the beginning of string.
     */
    strncpy (dbpath, argv[0], sizeof (dbpath));
    dbpath[sizeof (dbpath) - 1] = 0;
    /* The scan never steps before dbpath[0], and the cast keeps
     * isalnum() defined for bytes >= 0x80 where char is signed.
     */
    ptr = dbpath + strlen (dbpath);
    while (ptr > dbpath && isalnum ((unsigned char) ptr[-1]))
        ptr--;

    /* test for valid database name */
    i = strlen (ptr);
    if (i < 1 || i > 8) {
        fprintf (stderr, CATGETS(dtsearch_catd, MS_dtsrkdump, 58,
                "%s Invalid database name '%s'.\a\n"),
            PROGNAME"297", ptr);
        goto PRINT_USAGE;
    }
    strcpy (dbname, ptr);
    *ptr = 0;   /* truncate dbname off of full path/dbname */

    /* Open database in read-only mode. */
    db_oflag = O_RDONLY;
    if (!austext_dopen (dbname, dbpath, NULL, 0, &dbrec)) {
        fprintf (stderr, "%s\n", DtSearchGetMessages());
        DtSearchExit (3);
    }
    maxdba = dbrec.or_maxdba;
    printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 60,
            "%s: '%s' reccount=%ld maxdba=%ld recslots=%hd minw=%hd maxw=%hd\n"),
        aa_argv0, dbname,
        (long) dbrec.or_reccount, (long) dbrec.or_maxdba,
        (int) dbrec.or_recslots,
        (int) dbrec.or_minwordsz, (int) dbrec.or_maxwordsz);

    /* Adjust threshold if necessary */
    if (percent > 0.0)
        min_threshold = (long)
            ((float) percent * (float) dbrec.or_reccount / 100.0);
    if (min_threshold > dbrec.or_reccount)
        min_threshold = dbrec.or_reccount;

    if (do_wordkeys && do_verbose) {
        if (min_threshold > 1 && min_threshold < dbrec.or_reccount) {
            printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 70,
                    "%s Will only list words occurring "
                    "in %ld or more records.\n"),
                aa_argv0, min_threshold);
            listing_most_words =
                (float) min_threshold / (float) dbrec.or_reccount > .90;
        }
        else {
            printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 80,
                    "%s: Listing all words in database.\n"),
                aa_argv0);
            listing_most_words = TRUE;
        }
    }

    if (do_objkeys) {
        /*
         * Allocate and initialize an array of keytype counters, one for
         * each possible ascii keytype char (256).
         */
        counters = austext_malloc (258 * sizeof(long), PROGNAME"113", NULL);
        memset (counters, 0, 258 * sizeof(long));

        dotcount = 0;
        keycount = 0;
        KEYFRST (PROGNAME"111", OR_OBJKEY, 0);
        while (db_status == S_OKAY) {
            KEYREAD (PROGNAME"288", buf);
            /* Index by unsigned byte value: a plain (possibly signed)
             * char >= 0x80 would otherwise be a negative index.
             */
            (counters[(unsigned char) buf[0]])++;
            CRGET (PROGNAME"251", &dba, 0);
            /* Store the same masked value that is compared, so the
             * running maximum keeps working when the high byte of
             * the dba is nonzero.
             */
            if (maxdba < (dba & 0xffffff))
                maxdba = dba & 0xffffff;

            if (do_verbose) {
                /* Mark control and nonascii chars with a period. */
                i = 0;
                putchar ('\"');
                for (ptr = buf; *ptr != 0; ptr++) {
                    unsigned char   ch = (unsigned char) *ptr;

                    putchar ((ch < 32 || ch >= 127) ? '.' : ch);
                    i++;
                }
                printf ("\" ");
                while (i++ < DtSrMAX_DB_KEYSIZE)
                    putchar (' ');
                printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 100,
                        "dba x%08lx, %6ld\n"),
                    (unsigned long) dba, (long) dba);
            }   /* end verbose */
            else {      /* !verbose */
                print_progress_dot (&keycount, &dotcount);
            }   /* end !verbose dot printing */

            KEYNEXT (PROGNAME"291", OR_OBJKEY, 0);
        }       /* end object key read loop */

        /* Print objkey summary report */
        if (dotcount)
            putchar ('\n');
        if (dbpath[0] == 0)
            buf[0] = 0;
        else
            /* bounded: dbpath is as large as buf itself */
            snprintf (buf, sizeof (buf),
                CATGETS(dtsearch_catd, MS_dtsrkdump, 110, " in %s"),
                dbpath);
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 120,
                "Object Summary for '%s'%s:\n"),
            dbname, buf);
        puts (CATGETS(dtsearch_catd, MS_dtsrkdump, 130,
                "Object Count by Keytypes:"));
        total = 0L;
        for (i = 0; i < 256; i++) {
            if (counters[i] > 0L) {
                total += counters[i];
                /* printable keytypes as chars, the rest in hex */
                if (i > 32 && i < 127)
                    printf (" '%c' %6ld\n", i, counters[i]);
                else
                    printf (" x%02x %6ld\n", i, counters[i]);
            }
        }
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 160,
                "TOTAL Objects Count = %ld\n"),
            total);
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 170,
                "Largest Object DBA = %ld\n"),
            maxdba);
        free (counters);
        counters = NULL;
    }   /* end do_objkeys */

    if (do_wordkeys) {
        if (listing_most_words)
            printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 180,
                    "%s: * Words marked with asterisk occur in every record.\n"),
                aa_argv0);
        /*
         * Allocate and initialize word and stem counters. First is for
         * short stems (those beginning with STEM_CH), next is for short
         * words (everything else). Next are for long stems, long words,
         * huge stems, and huge words (6 in all).
         */
        counters = austext_malloc (8 * sizeof (long), PROGNAME"113", NULL);
        memset (counters, 0, 8 * sizeof(long));

        count_words (0);        /* short */
        count_words (2);        /* long */
        count_words (4);        /* huge */

        /* print wordkey summary report */
        if (do_objkeys)
            putchar ('\n');     /* separate from last report */
        if (dbpath[0] == 0)
            buf[0] = 0;
        else
            /* bounded: dbpath is as large as buf itself */
            snprintf (buf, sizeof (buf),
                CATGETS(dtsearch_catd, MS_dtsrkdump, 110, " in %s"),
                dbpath);
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 200,
                "Words Summary for '%s'%s:\n"),
            dbname, buf);
        total = 0L;
        for (i = 0; i < 6; i++) {
            printf (word_labels[i], counters[i]);
            total += counters[i];
        }
        printf (CATGETS(dtsearch_catd, MS_dtsrkdump, 210,
                "TOTAL Words Count = %ld\n"),
            total);
        free (counters);
        counters = NULL;
    }   /* end do_wordkeys */

    DtSearchExit (0);
    return 0;   /* not reached if DtSearchExit exits; keeps main well-formed */
}       /* main() */

/*********************** DTSRKDUMP.C *************************/