BSD 4_4 release
[unix-history] / usr / src / contrib / bib / src / invert.c
CommitLineData
/* SCCS identification string (the "@(#)" prefix is the what(1) marker);
   left out when the file is processed with "lint" defined. */
#ifndef lint
static char sccsid[] = "@(#)invert.c 2.7 5/27/93";
#endif /* not lint */
e473df34 4#
b529738d
GL
5/* input: records of lines, separated by blank lines
6 output: key:file1 start/length ... start/length:file2 start/length ...
7*/
8
9# include "stdio.h"
10# include "streams.h"
11# include "bib.h"
b529738d
GL
/* makelow(c): the lower-case form of c when c is in 'A'..'Z' (ASCII
   arithmetic), otherwise c unchanged.  The argument is evaluated more
   than once, so do not pass an expression with side effects.  Every use
   of c is parenthesized so that any expression may be passed safely. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
13
14int max_kcnt = 100; /* max number of keys */
15int max_klen = 6; /* max length of keys */
16char *ignore = /* string of line starts to ignore */
17 "CNOPVX";
b529738d
GL
18char *INDEX= /* name of output file */
19 INDXFILE;
20
5e871a33 21char *bibtmpfile = /* name of temporary file */
b529738d
GL
22 INVTEMPFILE;
23
24int silent = 0; /* 0 => statistics printed */
25 /* 1 => no statisitics printed */
26
27char *sort_it =
28 "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s";
29char sortcmd[maxstr];
30
31int argc;
32char **argv;
33
34main(argcount,arglist)
35int argcount;
36char **arglist;
37{ char *filename;
38 FILE *input, *output;
39 long int start,length;
40 char word[maxstr];
41 int kcnt;
42 char tag_line[maxstr];
dbc5fe45 43 int bol = 1; /* at beginning of line */
b529738d
GL
44
45 long int records = 0; /* number of records read */
46 long int keys = 0; /* number of keys read (occurences) */
47 long int distinct; /* number of distinct keys */
48 long int shorten();
49
dbc5fe45
KB
50 InitDirectory(BMACLIB,N_BMACLIB);
51 InitDirectory(COMFILE,N_COMFILE);
e9550f8a 52
b529738d
GL
53 argc= argcount-1;
54 argv= arglist+1;
5e871a33
GL
55 mktemp(bibtmpfile);
56 output= fopen(bibtmpfile,"w");
b529738d
GL
57
58 for ( flags() ; argc>0 ; argc--, argv++ ,flags() )
59 { /* open input file */
60 filename= *argv;
61 input= fopen(filename,"r");
62 if (input==NULL)
dbc5fe45 63 { fprintf(stderr,"invert: error in open of %s\n", filename);
b529738d
GL
64 continue;
65 }
dbc5fe45
KB
66 start= 0L;
67 length= 0L;
68
69 for(;;) /* each record */ {
70 /* find start of next record (exit if none) */
71 start= nextrecord(input,start+length);
72 if (start==EOF) break;
73 records++;
74 kcnt= 0;
75 length= recsize(input,start);
76 sprintf(tag_line, " %s %d %d\n", filename, start, length);
77
78 while (ftell(input) < start+length && kcnt < max_kcnt) {
79 getword(input,word,ignore,&bol);
80 makekey(word,max_klen,COMFILE);
81 if (*word != NULL) {
82 fputs(word,output); fputs(tag_line,output);
83 kcnt++; keys++;
84 }
85 }
86 }
87 fclose(input);
88 }
b529738d
GL
89 fclose(output);
90
5e871a33 91 sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile);
b529738d
GL
92 system(sortcmd);
93
5e871a33 94 distinct = shorten(bibtmpfile,INDEX);
b529738d
GL
95 if( silent == 0 )
96 fprintf(stderr,
e473df34 97 "%d documents %d distinct keys %d key occurrences\n",
b529738d 98 records, distinct, keys);
cc79be5d 99 exit(0);
b529738d
GL
100}
101
102
103
104/* Flag Meaning Default
105 -ki Keys per record 100
106 -li max Length of keys 6
107 -%str ignore lines that begin with %x CNOPVX
108 where x is in str
109 str is a seq of chars
98535d6d 110 -cfile file contains Common words /usr/new/lib/bib/common
b529738d
GL
111 do not use common words as keys
112 -pfile name of output file INDEX
113 -s do not print statistics statistics printed
114*/
115
116# define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2)
117
118flags()
e9550f8a
RH
119{
120 char *p;
121 for (; argc>0 && *argv[0]=='-'; argc--,argv++)
b529738d
GL
122 { switch ((*argv)[1])
123 { case 'k': max_kcnt= atoi(operand);
124 break;
125 case 'l': max_klen= atoi(operand);
126 break;
dbc5fe45 127 case 'c': strcpy(COMFILE,operand);
b529738d
GL
128 break;
129 case '%': ignore= *argv+2;
130 break;
131 case 'p': INDEX= operand;
132 break;
133 case 's': silent= 1;
134 break;
e9550f8a
RH
135 case 'd':
136 p = &argv[0][2];
137 if (!p) {
138 argv++;
139 p = &argv[0][0];
140 }
141 strreplace(COMFILE, BMACLIB, p);
142 strcpy(BMACLIB, p);
143 break;
dbc5fe45 144 default: fprintf(stderr,"unknown flag '%s'\n", *argv);
b529738d
GL
145 }
146 }
147}
148
149
150/* shorten(inf,outf): file "inf" consists of lines of the form:
151 key file start length
152 sorted by key and file. replace lines with the same key
153 with one line of the form:
154 key:file1 start/length ... start/length:file2 start/length ...
155 rename as file "outf"
156 returns number of lines in output
157*/
158long shorten(inf,outf)
159char *inf, *outf;
160{ FILE *in, *out;
161 char line[maxstr];
162 char key[maxstr], newkey[maxstr],
163 file[maxstr], newfile[maxstr];
164 long int start, length;
165 long int lines = 0;
166
167 in= fopen(inf, "r");
168 out= fopen(outf, "w");
169 if (in==NULL || out==NULL)
dbc5fe45 170 { fprintf(stderr,"invert: error in opening file for compression\n");
b529738d
GL
171 return(0);
172 }
173
174 getline(in,line);
e473df34
GL
175 sscanf(line,"%s%s%d%d", key, file, &start, &length);
176 fprintf(out, "%s :%s %d/%d", key, file, start, length);
b529738d 177 for ( getline(in, line) ; !feof(in); getline(in, line))
e473df34 178 { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length);
b529738d
GL
179 if (strcmp(key,newkey)!=0)
180 { strcpy(key, newkey);
181 strcpy(file, newfile);
e473df34 182 fprintf(out, "\n%s :%s %d/%d", key, file, start, length);
b529738d
GL
183 lines++;
184 }
185 else if (strcmp(file,newfile)!=0)
186 { strcpy(file,newfile);
e473df34 187 fprintf(out, ":%s %d/%d", file, start, length);
b529738d
GL
188 }
189 else
e473df34 190 fprintf(out, " %d/%d", start, length);
b529738d
GL
191 }
192 fprintf(out, "\n");
193 lines++;
194
195 fclose(in); fclose(out);
196 unlink(inf);
197 return (lines);
198}