1d07cbd7963790178a30e81b98bb8d61fa11fb41
[unix-history] / usr / src / contrib / bib / src / invert.c
#ifndef lint
static char sccsid[] = "@(#)invert.c 2.2 %G%";
#endif not lint
/* input: records of lines, separated by blank lines
output: key:file1 start/length ... start/length:file2 start/length ...
*/
# include "stdio.h"
# include "streams.h"
# include "bib.h"
# define isnull(x) (*(x) == NULL)
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : c)
int max_kcnt = 100; /* max number of keys */
int max_klen = 6; /* max length of keys */
char *ignore = /* string of line starts to ignore */
"CNOPVX";
char *common = /* name of file of common words */
COMFILE;
char *INDEX= /* name of output file */
INDXFILE;
char *tmpfile = /* name of temporary file */
INVTEMPFILE;
int silent = 0; /* 0 => statistics printed */
/* 1 => no statisitics printed */
char *sort_it =
"sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
char sortcmd[maxstr];
int argc;
char **argv;
main(argcount,arglist)
int argcount;
char **arglist;
{ char *filename;
FILE *input, *output;
long int start,length;
char word[maxstr];
int kcnt;
char tag_line[maxstr];
long int records = 0; /* number of records read */
long int keys = 0; /* number of keys read (occurences) */
long int distinct; /* number of distinct keys */
long int shorten();
argc= argcount-1;
argv= arglist+1;
mktemp(tmpfile);
output= fopen(tmpfile,"w");
for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
{ /* open input file */
filename= *argv;
input= fopen(filename,"r");
if (input==NULL)
{ fprintf(stderr, "invert: error in open of %s\n", filename);
continue;
}
start= 0L;
length= 0L;
for(;;) /* each record */
{ /* find start of next record (exit if none) */
start= nextrecord(input,start+length);
if (start==EOF) break;
records++;
kcnt= 0;
length= recsize(input,start);
sprintf(tag_line, " %s %ld %ld\n", filename, start, length);
while (ftell(input) < start+length && kcnt < max_kcnt)
{ getword(input,word,ignore);
makekey(word,max_klen,common);
if (!isnull(word))
{ fputs(word,output); fputs(tag_line,output);
kcnt++; keys++;
}
}
}
fclose(input);
}
fclose(output);
sprintf(sortcmd, sort_it, tmpfile, tmpfile);
system(sortcmd);
distinct = shorten(tmpfile,INDEX);
if( silent == 0 )
fprintf(stderr,
"%ld documents %ld distinct keys %ld key occurrences\n",
records, distinct, keys);
}
/* Flag Meaning Default
-ki Keys per record 100
-li max Length of keys 6
-%str ignore lines that begin with %x CNOPVX
where x is in str
str is a seq of chars
-cfile file contains Common words /usr/src/local/bib/common
do not use common words as keys
-pfile name of output file INDEX
-s do not print statistics statistics printed
*/
# define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
flags()
{ for (; argc>0 && *argv[0]=='-'; argc--,argv++)
{ switch ((*argv)[1])
{ case 'k': max_kcnt= atoi(operand);
break;
case 'l': max_klen= atoi(operand);
break;
case 'c': common= operand;
break;
case '%': ignore= *argv+2;
break;
case 'p': INDEX= operand;
break;
case 's': silent= 1;
break;
default: fprintf(stderr, "unknown flag '%s'\n", *argv);
}
}
}
/* shorten(inf,outf): file "inf" consists of lines of the form:
key file start length
sorted by key and file. replace lines with the same key
with one line of the form:
key:file1 start/length ... start/length:file2 start/length ...
rename as file "outf"
returns number of lines in output
*/
long shorten(inf,outf)
char *inf, *outf;
{ FILE *in, *out;
char line[maxstr];
char key[maxstr], newkey[maxstr],
file[maxstr], newfile[maxstr];
long int start, length;
long int lines = 0;
in= fopen(inf, "r");
out= fopen(outf, "w");
if (in==NULL || out==NULL)
{ fprintf(stderr, "invert: error in opening file for compression\n");
return(0);
}
getline(in,line);
sscanf(line,"%s%s%ld%ld", key, file, &start, &length);
fprintf(out, "%s :%s %ld/%ld", key, file, start, length);
for ( getline(in, line) ; !feof(in); getline(in, line))
{ sscanf(line,"%s%s%ld%ld", newkey, newfile, &start, &length);
if (strcmp(key,newkey)!=0)
{ strcpy(key, newkey);
strcpy(file, newfile);
fprintf(out, "\n%s :%s %ld/%ld", key, file, start, length);
lines++;
}
else if (strcmp(file,newfile)!=0)
{ strcpy(file,newfile);
fprintf(out, ":%s %ld/%ld", file, start, length);
}
else
fprintf(out, " %ld/%ld", start, length);
}
fprintf(out, "\n");
lines++;
fclose(in); fclose(out);
unlink(inf);
return (lines);
}