static char sccsid
[] = "@(#)invert.c 2.7 5/27/93";
/* input: records of lines, separated by blank lines
output: key:file1 start/length ... start/length:file2 start/length ...
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c)
int max_kcnt
= 100; /* max number of keys */
int max_klen
= 6; /* max length of keys */
char *ignore
= /* string of line starts to ignore */
char *INDEX
= /* name of output file */
char *bibtmpfile
= /* name of temporary file */
int silent
= 0; /* 0 => statistics printed */
/* 1 => no statistics printed */
"sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
int bol
= 1; /* at beginning of line */
long int records
= 0; /* number of records read */
long int keys
= 0; /* number of keys read (occurrences) */
long int distinct
; /* number of distinct keys */
InitDirectory(BMACLIB
,N_BMACLIB
);
InitDirectory(COMFILE
,N_COMFILE
);
output
= fopen(bibtmpfile
,"w");
for ( flags() ; argc
>0 ; argc
--, argv
++ ,flags() )
input
= fopen(filename
,"r");
{ fprintf(stderr
,"invert: error in open of %s\n", filename
);
for(;;) /* each record */ {
/* find start of next record (exit if none) */
start
= nextrecord(input
,start
+length
);
length
= recsize(input
,start
);
sprintf(tag_line
, " %s %d %d\n", filename
, start
, length
);
while (ftell(input
) < start
+length
&& kcnt
< max_kcnt
) {
getword(input
,word
,ignore
,&bol
);
makekey(word
,max_klen
,COMFILE
);
fputs(word
,output
); fputs(tag_line
,output
);
sprintf(sortcmd
, sort_it
, bibtmpfile
, bibtmpfile
);
distinct
= shorten(bibtmpfile
,INDEX
);
"%d documents %d distinct keys %d key occurrences\n",
records
, distinct
, keys
);
-%str ignore lines that begin with %x CNOPVX
-cfile file contains Common words /usr/new/lib/bib/common
do not use common words as keys
-pfile name of output file INDEX
-s do not print statistics statistics printed
# define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
for (; argc
>0 && *argv
[0]=='-'; argc
--,argv
++)
{ case 'k': max_kcnt
= atoi(operand
);
case 'l': max_klen
= atoi(operand
);
case 'c': strcpy(COMFILE
,operand
);
case '%': ignore
= *argv
+2;
case 'p': INDEX
= operand
;
strreplace(COMFILE
, BMACLIB
, p
);
default: fprintf(stderr
,"unknown flag '%s'\n", *argv
);
/* shorten(inf,outf): file "inf" consists of lines of the form:
key file start length
sorted by key and file. replace lines with the same key
with one line of the form:
key:file1 start/length ... start/length:file2 start/length ...
returns number of lines in output
char key
[maxstr
], newkey
[maxstr
],
file
[maxstr
], newfile
[maxstr
];
if (in
==NULL
|| out
==NULL
)
{ fprintf(stderr
,"invert: error in opening file for compression\n");
sscanf(line
,"%s%s%d%d", key
, file
, &start
, &length
);
fprintf(out
, "%s :%s %d/%d", key
, file
, start
, length
);
for ( getline(in
, line
) ; !feof(in
); getline(in
, line
))
{ sscanf(line
,"%s%s%d%d", newkey
, newfile
, &start
, &length
);
if (strcmp(key
,newkey
)!=0)
fprintf(out
, "\n%s :%s %d/%d", key
, file
, start
, length
);
else if (strcmp(file
,newfile
)!=0)
fprintf(out
, ":%s %d/%d", file
, start
, length
);
fprintf(out
, " %d/%d", start
, length
);