* Copyright (c) 1989 The Regents of the University of California.
* This code is derived from software contributed to Berkeley by
* Redistribution and use in source and binary forms are permitted
* provided that the above copyright notice and this paragraph are
* duplicated in all such forms and that any documentation,
* advertising materials, and other materials related to such
* distribution and use acknowledge that the software was developed
* by the University of California, Berkeley. The name of the
* University may not be used to endorse or promote products derived
* from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
"@(#) Copyright (c) 1989 The Regents of the University of California.\n\
static char sccsid
[] = "@(#)locate.code.c 4.5 (Berkeley) %G%";
* PURPOSE: sorted list compressor (works with a modified 'find'
* to encode/decode a filename database)
* USAGE: bigram < list > bigrams
* process bigrams (see updatedb) > common_bigrams
* code common_bigrams < list > squozen_list
* METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1
* February/March 1983, p. 8 ). Output format is, per line, an
* offset differential count byte followed by a partially bigram-
* encoded ascii residue. A bigram is a two-character sequence,
* the first 128 most common of which are encoded in one byte.
* EXAMPLE: For simple front compression with no bigram encoding,
* if the input is... then the output is...
* /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c
* /usr/src/cmd/armadillo.c 14 armadillo.c
* 0-28 likeliest differential counts + offset to make nonnegative
* 30 switch code for out-of-range count to follow in next word
* 128-255 bigram codes (128 most common, as determined by 'updatedb')
* 32-127 single character (printable) ascii residue (ie, literal)
* SEE ALSO: updatedb.csh, bigram.c, find.c
* AUTHOR: James A. Woods, Informatics General Corp.,
* NASA Ames Research Center, 10/82
#define BGBUFSIZE (NBG * 2) /* size of bigram buffer */
char buf1
[MAXPATHLEN
] = " ";
char bigrams
[BGBUFSIZE
+ 1] = { 0 };
register char *cp
, *oldpath
= buf1
, *path
= buf2
;
int code
, count
, diffcount
, oldcount
= 0;
if ((fp
= fopen(argv
[1], "r")) == NULL
) {
printf("Usage: code common_bigrams < list > squozen_list\n");
/* first copy bigram array to stdout */
fgets ( bigrams
, BGBUFSIZE
+ 1, fp
);
fwrite ( bigrams
, 1, BGBUFSIZE
, stdout
);
while ( fgets ( path
, sizeof(buf2
), stdin
) != NULL
) {
/* squelch characters that would botch the decoding */
for ( cp
= path
; *cp
!= NULL
; cp
++ ) {
else if ( *cp
<= SWITCH
)
/* skip longest common prefix */
for ( cp
= path
; *cp
== *oldpath
; cp
++, oldpath
++ )
diffcount
= count
- oldcount
+ OFFSET
;
if ( diffcount
< 0 || diffcount
> 2*OFFSET
) {
putw ( diffcount
, stdout
);
putc ( diffcount
, stdout
);
if ( *(cp
+ 1) == NULL
) {
if ( (code
= bgindex ( cp
)) < 0 ) {
else { /* found, so mark byte with parity bit */
putchar ( (code
/ 2) | PARITY
);
if ( path
== buf1
) /* swap pointers */
path
= buf2
, oldpath
= buf1
;
path
= buf1
, oldpath
= buf2
;
bgindex ( bg
) /* return location of bg in bigrams or -1 */
register char bg0
= bg
[0], bg1
= bg
[1];
for ( p
= bigrams
; *p
!= NULL
; p
++ )
if ( *p
++ == bg0
&& *p
== bg1
)
return ( *p
== NULL
? -1 : --p
- bigrams
);