* Copyright (c) 1990 The Regents of the University of California.
* This code is derived from software contributed to Berkeley by
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid
[] = "@(#)btree.c 5.9 (Berkeley) 5/7/91";
#endif /* LIBC_SCCS and not lint */
* btree.c -- implementation of btree access method for 4.4BSD.
* The design here is based on that of the btree access method used
* in the Postgres database system at UC Berkeley. The implementation
* is wholly independent of the Postgres code.
* This implementation supports btrees on disk (supply a filename) or
* in memory (don't). Public interfaces defined here are:
* btree_open() -- wrapper; returns a filled DB struct for
* bt_open() -- open a new or existing btree.
* bt_get() -- fetch data from a tree by key.
* bt_seq() -- do a sequential scan on a tree.
* bt_put() -- add data to a tree by key.
* bt_delete() -- remove data from a tree by key.
* bt_close() -- close a btree.
* bt_sync() -- force btree pages to disk (disk trees only).
BTREEINFO _DefaultBTInfo
= {
* BTREE_OPEN -- Wrapper routine to open a btree.
* Creates and fills a DB struct, and calls the routine that actually
* flags: flag bits passed to open
* mode: permission bits, used if O_CREAT specified
* Filled-in DBT on success; NULL on failure, with errno
* Allocates memory for the DB struct.
btree_open(f
, flags
, mode
, b
)
if ((db
= (DB
*) malloc((unsigned) sizeof(DB
))) == (DB
*) NULL
)
if ((t
= bt_open(f
, flags
, mode
, b
)) == (BTREE
) NULL
) {
(void) free ((char *) db
);
db
->internal
= (char *) t
;
* BT_OPEN -- Open a btree.
* This routine creates the correct kind (disk or in-memory) of
* btree and initializes it as required.
* f -- name of btree (NULL for in-memory btrees)
* flags -- flags passed to open()
* mode -- mode passed to open ()
* b -- BTREEINFO structure, describing btree
* (Opaque) pointer to the btree. On failure, returns NULL
* with errno set as appropriate.
* Allocates memory, opens files.
bt_open(f
, flags
, mode
, b
)
/* use the default info if none was provided */
if (b
== (BTREEINFO
*) NULL
)
if ((t
= (BTREE_P
) malloc((unsigned) sizeof *t
)) == (BTREE_P
) NULL
)
t
->bt_compare
= b
->compare
;
t
->bt_curpage
= (BTHEADER
*) NULL
;
c
->c_flags
= (u_char
) NULL
;
c
->c_key
= (char *) NULL
;
t
->bt_stack
= (BTSTACK
*) NULL
;
* If no file name was supplied, this is an in-memory btree.
* Otherwise, it's a disk-based btree.
if (f
== (char *) NULL
) {
if ((t
->bt_psize
= b
->psize
) < MINPSIZE
) {
(void) free ((char *) t
);
t
->bt_psize
= getpagesize();
nbytes
= HTSIZE
* sizeof(HTBUCKET
*);
if ((ht
= (HTABLE
) malloc((unsigned) nbytes
))
(void) bzero((char *) ht
, nbytes
);
t
->bt_lorder
= BYTE_ORDER
;
t
->bt_flags
|= BTF_NODUPS
;
if ((fd
= open(f
, O_RDONLY
, 0)) < 0) {
/* doesn't exist yet, be sure page is big enough */
if ((t
->bt_psize
= b
->psize
) < sizeof(BTHEADER
)
if (b
->lorder
!= BIG_ENDIAN
&& b
->lorder
!= LITTLE_ENDIAN
) {
t
->bt_lorder
= b
->lorder
;
t
->bt_flags
|= BTF_NODUPS
;
/* exists, get page size from file */
if (read(fd
, &m
, sizeof(m
)) < sizeof(m
)) {
/* lorder always stored in host-independent format */
if (m
.m_lorder
!= BIG_ENDIAN
&& m
.m_lorder
!= LITTLE_ENDIAN
) {
t
->bt_lorder
= m
.m_lorder
;
if (t
->bt_lorder
!= BYTE_ORDER
) {
if (m
.m_magic
!= BTREEMAGIC
|| m
.m_version
!= BTREEVERSION
|| m
.m_psize
< MINPSIZE
) {
t
->bt_flags
|= (m
.m_flags
& BTF_NODUPS
) | BTF_METAOK
;
/* now open the file the way the user wanted it */
if ((t
->bt_s
.bt_d
.d_fd
= open(f
, flags
, mode
)) < 0) {
(void) free ((char *) t
);
/* access method files are always close-on-exec */
if (fcntl(t
->bt_s
.bt_d
.d_fd
, F_SETFL
, 1) == -1) {
(void) close(t
->bt_s
.bt_d
.d_fd
);
(void) free ((char *) t
);
/* get number of pages, page size if necessary */
(void) fstat(t
->bt_s
.bt_d
.d_fd
, &buf
);
t
->bt_psize
= buf
.st_blksize
;
t
->bt_npages
= (pgno_t
) (buf
.st_size
/ t
->bt_psize
);
/* page zero is metadata, doesn't count */
/* get an lru buffer cache, if the user asked for one */
if ((b
->cachesize
/ t
->bt_psize
) > 0) {
BTDISK
*d
= &(t
->bt_s
.bt_d
);
d
->d_cache
= lruinit(d
->d_fd
,
(int) (b
->cachesize
/ t
->bt_psize
),
if (d
->d_cache
== (char *) NULL
) {
/* remember if tree was opened for write */
if (((flags
& O_WRONLY
) == O_WRONLY
)
|| ((flags
& O_RDWR
) == O_RDWR
))
t
->bt_flags
|= BTF_ISWRITE
;
* BT_GET -- Get an entry from a btree.
* Does a key lookup in the tree to find the specified key, and returns
* the key/data pair found.
* tree -- btree in which to do lookup
* data -- pointer to DBT in which to return data
* RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
* found. If key is not found, nothing is stored in the
* Return data is statically allocated, and will be overwritten
bt_get(dbp
, key
, data
, flag
)
BTREE_P t
= (BTREE_P
) (dbp
->internal
);
item
= _bt_search(t
, key
);
/* clear parent pointer stack */
while (_bt_pop(t
) != P_NONE
)
if (item
== (BTITEM
*) NULL
)
h
= (BTHEADER
*) t
->bt_curpage
;
data
->data
= (u_char
*) NULL
;
&& _bt_cmp(t
, key
->data
, item
->bti_index
) == 0) {
d
= (DATUM
*) GETDATUM(h
, item
->bti_index
);
return (_bt_buildret(t
, d
, data
, key
));
* BT_PUT -- Add an entry to a btree.
* The specified (key, data) pair is added to the tree. If the tree
* was created for unique keys only, then duplicates will not be
* entered. If the requested key exists in the tree, it will be over-
* written unless the flags parameter is R_NOOVERWRITE, in which case
* the update will not be done. If duplicate keys are permitted in the
* tree, duplicates will be inserted and will not overwrite existing
* keys. Nodes are split as required.
* tree -- btree in which to put the new entry
* key -- key component to add
* data -- data corresponding to key
* flag -- R_NOOVERWRITE or zero.
* RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
* NOOVERWRITE flag was set and the specified key exists
bt_put(dbp
, key
, data
, flag
)
t
= (BTREE_P
)dbp
->internal
;
/* look for this key in the tree */
item
= _bt_search(t
, key
);
* If this tree was originally created without R_DUP, then duplicate
* keys are not allowed. We need to check this at insertion time.
if (VALIDITEM(t
, item
) && _bt_cmp(t
, key
->data
, item
->bti_index
) == 0) {
if ((t
->bt_flags
& BTF_NODUPS
) && flag
== R_NOOVERWRITE
) {
if (_bt_delone(t
, item
->bti_index
) == RET_ERROR
) {
while (_bt_pop(t
) != P_NONE
)
return (_bt_insert(t
, item
, key
, data
, flag
));
* BT_DELETE -- delete a key from the tree.
* Deletes all keys (and their associated data items) matching the
* supplied key from the tree. If the flags entry is R_CURSOR, then
* the current item in the active scan is deleted.
* tree -- btree from which to delete key
* flags -- R_CURSOR or zero
* RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
* key was not in the tree.
bt_delete(dbp
, key
, flags
)
t
= (BTREE_P
)dbp
->internal
;
/* find the first matching key in the tree */
item
= _bt_first(t
, key
);
/* don't need the descent stack for deletes */
while (_bt_pop(t
) != P_NONE
)
/* delete all matching keys */
while (VALIDITEM(t
, item
)
&& (_bt_cmp(t
, key
->data
, item
->bti_index
) == 0)) {
if (_bt_delone(t
, item
->bti_index
) == RET_ERROR
)
if (VALIDITEM(t
, item
) || h
->h_nextpg
== P_NONE
)
/* next page, if necessary */
if (_bt_getpage(t
, h
->h_nextpg
) == RET_ERROR
)
} while (NEXTINDEX(h
) == 0 && h
->h_nextpg
!= P_NONE
);
item
->bti_pgno
= h
->h_pgno
;
|| _bt_cmp(t
, key
->data
, item
->bti_index
) != 0)
/* flush changes to disk */
if (h
->h_flags
& F_DIRTY
) {
if (_bt_write(t
, t
->bt_curpage
, NORELEASE
) == RET_ERROR
)
* BT_SYNC -- sync the btree to disk.
* RET_SUCCESS, RET_ERROR.
t
= (BTREE_P
)dbp
->internal
;
/* if this is an in-memory btree, syncing is a no-op */
h
= (BTHEADER
*) t
->bt_curpage
;
pgno
= t
->bt_curpage
->h_pgno
;
if (_bt_write(t
, h
, RELEASE
) == RET_ERROR
)
if (lrusync(t
->bt_s
.bt_d
.d_cache
) < RET_ERROR
)
if (_bt_getpage(t
, pgno
) == RET_ERROR
)
if (_bt_write(t
, h
, NORELEASE
) == RET_ERROR
)
return (fsync(t
->bt_s
.bt_d
.d_fd
));
* BT_SEQ -- Sequential scan interface.
* This routine supports sequential scans on the btree. If called with
* flags set to R_CURSOR, or if no seq scan has been initialized in the
* current tree, we initialize the scan. Otherwise, we advance the scan
* and return the next item.
* Scans can be either keyed or non-keyed. Keyed scans basically have
* a starting point somewhere in the middle of the tree. Non-keyed
* scans start at an endpoint. Also, scans can be backward (descending
* order), forward (ascending order), or no movement (keep returning
* Flags is checked every time we enter the routine, so the user can
* change directions on an active scan if desired. The key argument
* is examined only when we initialize the scan, in order to position
* Items are returned via the key and data arguments passed in.
* tree -- btree in which to do scan
* key -- key, used to position scan on initialization, and
* used to return key components to the user.
* data -- used to return data components to the user.
* flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
* RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
* exists in the tree in the specified direction.
* Changes the btree's notion of the current position in the
* The key and data items returned are static and will be
* overwritten by the next bt_get or bt_seq.
bt_seq(dbp
, key
, data
, flags
)
t
= (BTREE_P
)dbp
->internal
;
/* do we need to initialize the scan? */
if (flags
== R_CURSOR
|| flags
== R_LAST
|| flags
== R_FIRST
|| !(t
->bt_flags
& BTF_SEQINIT
)) {
status
= _bt_seqinit(t
, key
, flags
);
/* just advance the current scan pointer */
status
= _bt_seqadvance(t
, flags
);
key
->size
= data
->size
= 0;
key
->data
= data
->data
= (u_char
*) NULL
;
/* is there a valid item at the current scan location? */
if (status
== RET_SPECIAL
) {
if (t
->bt_cursor
.c_index
>= NEXTINDEX(h
)) {
t
->bt_cursor
.c_index
= NEXTINDEX(h
) - 1;
t
->bt_cursor
.c_index
= 0;
t
->bt_cursor
.c_index
= 0;
} else if (status
== RET_ERROR
)
/* okay, return the data */
d
= (DATUM
*) GETDATUM(h
, t
->bt_cursor
.c_index
);
return (_bt_buildret(t
, d
, data
, key
));
* BT_CLOSE -- Close a btree
* RET_SUCCESS, RET_ERROR.
* Frees space occupied by the tree.
t
= (BTREE_P
)dbp
->internal
;
if (t
->bt_cursor
.c_key
!= (char *) NULL
)
(void) free(t
->bt_cursor
.c_key
);
/* in-memory tree, release hash table memory */
for (i
= 0; i
< HTSIZE
; i
++) {
if ((b
= ht
[i
]) == (struct HTBUCKET
*) NULL
)
(void) free((char *) b
->ht_page
);
(void) free((char *) sb
);
} while (b
!= (struct HTBUCKET
*) NULL
);
(void) free ((char *) ht
);
(void) free ((char *) t
);
if ((t
->bt_flags
& BTF_ISWRITE
) && !(t
->bt_flags
& BTF_METAOK
)) {
if (_bt_wrtmeta(t
) == RET_ERROR
)
if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
if (h
->h_flags
& F_DIRTY
) {
if (_bt_write(t
, h
, RELEASE
) == RET_ERROR
)
if (_bt_release(t
, h
) == RET_ERROR
)
/* flush and free the cache, if there is one */
cache
= t
->bt_s
.bt_d
.d_cache
;
if (lrusync(cache
) == RET_ERROR
)
(void) free ((char *) h
);
(void) free ((char *) t
);