Commit | Line | Data |
---|---|---|
d3ce25cd | 1 | #ifndef lint |
ad787160 | 2 | static char sccsid[] = "@(#)invert.c 2.7 5/27/93"; |
d3ce25cd | 3 | #endif not lint |
e473df34 | 4 | # |
b529738d GL |
5 | /* input: records of lines, separated by blank lines |
6 | output: key:file1 start/length ... start/length:file2 start/length ... | |
7 | */ | |
8 | ||
9 | # include "stdio.h" | |
10 | # include "streams.h" | |
11 | # include "bib.h" | |
b529738d GL |
/* makelow(c): the lower-case form of c when c is one of 'A'..'Z', otherwise
   c itself.  The argument is evaluated more than once, so it must not have
   side effects that change its value between evaluations.  Every use of
   the argument is parenthesized so that any expression may be passed. */
# define makelow(c) ('A'<=(c) && (c)<='Z' ? (c)-'A'+'a' : (c))
13 | ||
14 | int max_kcnt = 100; /* max number of keys */ | |
15 | int max_klen = 6; /* max length of keys */ | |
16 | char *ignore = /* string of line starts to ignore */ | |
17 | "CNOPVX"; | |
b529738d GL |
18 | char *INDEX= /* name of output file */ |
19 | INDXFILE; | |
20 | ||
5e871a33 | 21 | char *bibtmpfile = /* name of temporary file */ |
b529738d GL |
22 | INVTEMPFILE; |
23 | ||
24 | int silent = 0; /* 0 => statistics printed */ | |
25 | /* 1 => no statisitics printed */ | |
26 | ||
27 | char *sort_it = | |
28 | "sort -u +0 -1 +1 -2 +2n -3 +3n %s -o %s"; | |
29 | char sortcmd[maxstr]; | |
30 | ||
31 | int argc; | |
32 | char **argv; | |
33 | ||
34 | main(argcount,arglist) | |
35 | int argcount; | |
36 | char **arglist; | |
37 | { char *filename; | |
38 | FILE *input, *output; | |
39 | long int start,length; | |
40 | char word[maxstr]; | |
41 | int kcnt; | |
42 | char tag_line[maxstr]; | |
dbc5fe45 | 43 | int bol = 1; /* at beginning of line */ |
b529738d GL |
44 | |
45 | long int records = 0; /* number of records read */ | |
46 | long int keys = 0; /* number of keys read (occurences) */ | |
47 | long int distinct; /* number of distinct keys */ | |
48 | long int shorten(); | |
49 | ||
dbc5fe45 KB |
50 | InitDirectory(BMACLIB,N_BMACLIB); |
51 | InitDirectory(COMFILE,N_COMFILE); | |
e9550f8a | 52 | |
b529738d GL |
53 | argc= argcount-1; |
54 | argv= arglist+1; | |
5e871a33 GL |
55 | mktemp(bibtmpfile); |
56 | output= fopen(bibtmpfile,"w"); | |
b529738d GL |
57 | |
58 | for ( flags() ; argc>0 ; argc--, argv++ ,flags() ) | |
59 | { /* open input file */ | |
60 | filename= *argv; | |
61 | input= fopen(filename,"r"); | |
62 | if (input==NULL) | |
dbc5fe45 | 63 | { fprintf(stderr,"invert: error in open of %s\n", filename); |
b529738d GL |
64 | continue; |
65 | } | |
dbc5fe45 KB |
66 | start= 0L; |
67 | length= 0L; | |
68 | ||
69 | for(;;) /* each record */ { | |
70 | /* find start of next record (exit if none) */ | |
71 | start= nextrecord(input,start+length); | |
72 | if (start==EOF) break; | |
73 | records++; | |
74 | kcnt= 0; | |
75 | length= recsize(input,start); | |
76 | sprintf(tag_line, " %s %d %d\n", filename, start, length); | |
77 | ||
78 | while (ftell(input) < start+length && kcnt < max_kcnt) { | |
79 | getword(input,word,ignore,&bol); | |
80 | makekey(word,max_klen,COMFILE); | |
81 | if (*word != NULL) { | |
82 | fputs(word,output); fputs(tag_line,output); | |
83 | kcnt++; keys++; | |
84 | } | |
85 | } | |
86 | } | |
87 | fclose(input); | |
88 | } | |
b529738d GL |
89 | fclose(output); |
90 | ||
5e871a33 | 91 | sprintf(sortcmd, sort_it, bibtmpfile, bibtmpfile); |
b529738d GL |
92 | system(sortcmd); |
93 | ||
5e871a33 | 94 | distinct = shorten(bibtmpfile,INDEX); |
b529738d GL |
95 | if( silent == 0 ) |
96 | fprintf(stderr, | |
e473df34 | 97 | "%d documents %d distinct keys %d key occurrences\n", |
b529738d | 98 | records, distinct, keys); |
cc79be5d | 99 | exit(0); |
b529738d GL |
100 | } |
101 | ||
102 | ||
103 | ||
104 | /* Flag Meaning Default | |
105 | -ki Keys per record 100 | |
106 | -li max Length of keys 6 | |
107 | -%str ignore lines that begin with %x CNOPVX | |
108 | where x is in str | |
109 | str is a seq of chars | |
98535d6d | 110 | -cfile file contains Common words /usr/new/lib/bib/common |
b529738d GL |
111 | do not use common words as keys |
112 | -pfile name of output file INDEX | |
113 | -s do not print statistics statistics printed | |
114 | */ | |
115 | ||
116 | # define operand (strlen(*argv+2)==0 ? (argv++,argc--,*argv) : *argv+2) | |
117 | ||
118 | flags() | |
e9550f8a RH |
119 | { |
120 | char *p; | |
121 | for (; argc>0 && *argv[0]=='-'; argc--,argv++) | |
b529738d GL |
122 | { switch ((*argv)[1]) |
123 | { case 'k': max_kcnt= atoi(operand); | |
124 | break; | |
125 | case 'l': max_klen= atoi(operand); | |
126 | break; | |
dbc5fe45 | 127 | case 'c': strcpy(COMFILE,operand); |
b529738d GL |
128 | break; |
129 | case '%': ignore= *argv+2; | |
130 | break; | |
131 | case 'p': INDEX= operand; | |
132 | break; | |
133 | case 's': silent= 1; | |
134 | break; | |
e9550f8a RH |
135 | case 'd': |
136 | p = &argv[0][2]; | |
137 | if (!p) { | |
138 | argv++; | |
139 | p = &argv[0][0]; | |
140 | } | |
141 | strreplace(COMFILE, BMACLIB, p); | |
142 | strcpy(BMACLIB, p); | |
143 | break; | |
dbc5fe45 | 144 | default: fprintf(stderr,"unknown flag '%s'\n", *argv); |
b529738d GL |
145 | } |
146 | } | |
147 | } | |
148 | ||
149 | ||
150 | /* shorten(inf,outf): file "inf" consists of lines of the form: | |
151 | key file start length | |
152 | sorted by key and file. replace lines with the same key | |
153 | with one line of the form: | |
154 | key:file1 start/length ... start/length:file2 start/length ... | |
155 | rename as file "outf" | |
156 | returns number of lines in output | |
157 | */ | |
158 | long shorten(inf,outf) | |
159 | char *inf, *outf; | |
160 | { FILE *in, *out; | |
161 | char line[maxstr]; | |
162 | char key[maxstr], newkey[maxstr], | |
163 | file[maxstr], newfile[maxstr]; | |
164 | long int start, length; | |
165 | long int lines = 0; | |
166 | ||
167 | in= fopen(inf, "r"); | |
168 | out= fopen(outf, "w"); | |
169 | if (in==NULL || out==NULL) | |
dbc5fe45 | 170 | { fprintf(stderr,"invert: error in opening file for compression\n"); |
b529738d GL |
171 | return(0); |
172 | } | |
173 | ||
174 | getline(in,line); | |
e473df34 GL |
175 | sscanf(line,"%s%s%d%d", key, file, &start, &length); |
176 | fprintf(out, "%s :%s %d/%d", key, file, start, length); | |
b529738d | 177 | for ( getline(in, line) ; !feof(in); getline(in, line)) |
e473df34 | 178 | { sscanf(line,"%s%s%d%d", newkey, newfile, &start, &length); |
b529738d GL |
179 | if (strcmp(key,newkey)!=0) |
180 | { strcpy(key, newkey); | |
181 | strcpy(file, newfile); | |
e473df34 | 182 | fprintf(out, "\n%s :%s %d/%d", key, file, start, length); |
b529738d GL |
183 | lines++; |
184 | } | |
185 | else if (strcmp(file,newfile)!=0) | |
186 | { strcpy(file,newfile); | |
e473df34 | 187 | fprintf(out, ":%s %d/%d", file, start, length); |
b529738d GL |
188 | } |
189 | else | |
e473df34 | 190 | fprintf(out, " %d/%d", start, length); |
b529738d GL |
191 | } |
192 | fprintf(out, "\n"); | |
193 | lines++; | |
194 | ||
195 | fclose(in); fclose(out); | |
196 | unlink(inf); | |
197 | return (lines); | |
198 | } |