1d07cbd7963790178a30e81b98bb8d61fa11fb41
/* SCCS identification string for this source file */
static char sccsid[] = "@(#)invert.c 2.2 %G%";
/* input:  records of lines, separated by blank lines
   output: key:file1 start/length ... start/length:file2 start/length ...
*/
/* isnull(x): true when the string x is empty, i.e. its first character is
   the terminating null character.  The comparison is against '\0' rather
   than NULL: NULL is a null *pointer* constant and may be defined as
   ((void *)0), which makes a comparison with a char ill-typed. */
# define isnull(x) (*(x) == '\0')
/* makelow(c): if c is an upper-case letter 'A'..'Z', yield the matching
   lower-case letter; otherwise yield c unchanged.  Relies on the letters
   being contiguous in the character set (as in ASCII).
   Every use of the parameter is parenthesized so that an argument containing
   a low-precedence operator still parses as intended.  The argument is
   evaluated more than once, so it should not have side effects that must
   happen exactly once. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
/* key limits and their default values */
int max_kcnt = 100;     /* max number of keys */
int max_klen = 6;       /* max length of keys */
char *ignore
= /* string of line starts to ignore */
char *common
= /* name of file of common words */
char *INDEX
= /* name of output file */
char *tmpfile
= /* name of temporary file */
/* statistics switch:
       0 => statistics printed
       1 => no statistics printed */
int silent = 0;
"sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
/* counters reported in the closing statistics line */
long int records = 0;   /* number of records read */
long int keys = 0;      /* number of keys read (occurrences) */
long int distinct;      /* number of distinct keys */
output
= fopen(tmpfile
,"w");
for ( flags() ; argc
>0 ; argc
--, argv
++ ,flags() )
input
= fopen(filename
,"r");
{ fprintf(stderr
, "invert: error in open of %s\n", filename
);
for(;;) /* each record */
{ /* find start of next record (exit if none) */
start
= nextrecord(input
,start
+length
);
length
= recsize(input
,start
);
sprintf(tag_line
, " %s %ld %ld\n", filename
, start
, length
);
while (ftell(input
) < start
+length
&& kcnt
< max_kcnt
)
{ getword(input
,word
,ignore
);
makekey(word
,max_klen
,common
);
{ fputs(word
,output
); fputs(tag_line
,output
);
sprintf(sortcmd
, sort_it
, tmpfile
, tmpfile
);
distinct
= shorten(tmpfile
,INDEX
);
"%ld documents %ld distinct keys %ld key occurrences\n",
records
, distinct
, keys
);
-%str ignore lines that begin with %x CNOPVX
-cfile file contains Common words /usr/src/local/bib/common
do not use common words as keys
-pfile name of output file INDEX
-s do not print statistics statistics printed
/* operand: the value that belongs to the flag currently at *argv.
       "-kVALUE"  yields the text following the flag letter;
       "-k VALUE" (bare flag) steps argv forward, decrements argc, and
                  yields the next argument.
   Uses the argc and argv that are in scope at the point of expansion.
   NOTE(review): if a bare flag is the last argument, the value yielded is
   whatever follows it in argv (presumably a null pointer) -- confirm that
   callers cope with that. */
# define operand ((*argv)[2] == '\0' ? (argv++, argc--, *argv) : *argv + 2)
{ for (; argc
>0 && *argv
[0]=='-'; argc
--,argv
++)
{ case 'k': max_kcnt
= atoi(operand
);
case 'l': max_klen
= atoi(operand
);
case 'c': common
= operand
;
case '%': ignore
= *argv
+2;
case 'p': INDEX
= operand
;
default: fprintf(stderr
, "unknown flag '%s'\n", *argv
);
/* shorten(inf,outf): file "inf" consists of lines of the form:
        key file start length
   sorted by key and file. replace lines with the same key
   with one line of the form:
        key:file1 start/length ... start/length:file2 start/length ...
   returns number of lines in output
*/
char key
[maxstr
], newkey
[maxstr
],
file
[maxstr
], newfile
[maxstr
];
if (in
==NULL
|| out
==NULL
)
{ fprintf(stderr
, "invert: error in opening file for compression\n");
sscanf(line
,"%s%s%ld%ld", key
, file
, &start
, &length
);
fprintf(out
, "%s :%s %ld/%ld", key
, file
, start
, length
);
for ( getline(in
, line
) ; !feof(in
); getline(in
, line
))
{ sscanf(line
,"%s%s%ld%ld", newkey
, newfile
, &start
, &length
);
if (strcmp(key
,newkey
)!=0)
fprintf(out
, "\n%s :%s %ld/%ld", key
, file
, start
, length
);
else if (strcmp(file
,newfile
)!=0)
fprintf(out
, ":%s %ld/%ld", file
, start
, length
);
fprintf(out
, " %ld/%ld", start
, length
);