* Copyright (c) 1990 The Regents of the University of California.
* This code is derived from software contributed to Berkeley by
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid
[] = "@(#)storage.c 5.2 (Berkeley) 2/22/91";
#endif /* LIBC_SCCS and not lint */
* BT_GETPAGE -- Make pgno the current page of the btree.
* This routine is just a wrapper that decides whether to call the
* memory or disk-based routine to do the work.
* t -- btree in which to get page
* pgno -- page number to get
* RET_SUCCESS or RET_ERROR.
/* see if we can get away without doing any work */
if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
if (t
->bt_curpage
->h_pgno
== pgno
)
if (t
->bt_fname
== (char *) NULL
)
return (_bt_getmpage(t
, pgno
));
return (_bt_getdpage(t
, pgno
));
* _BT_GETMPAGE -- Make pgno the current page of the btree.
* This routine gets pages for in-memory btrees.
* t -- btree in which to get page
* pgno -- page number to get
* RET_SUCCESS or RET_ERROR.
if (t
->bt_curpage
== (BTHEADER
*) NULL
) {
h
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
);
if (h
== (BTHEADER
*) NULL
)
(((char *) &(h
->h_linp
[0])) - ((char *) h
));
h
->h_upper
= t
->bt_psize
;
h
->h_prevpg
= h
->h_nextpg
= P_NONE
;
/* get the root page into the hash table */
if (_bt_write(t
, h
, RELEASE
) == RET_ERROR
)
for (b
= t
->bt_s
.bt_ht
[htindex
];
if (b
->ht_pgno
== pgno
) {
t
->bt_curpage
= b
->ht_page
;
* _BT_GETDPAGE -- Make pgno the current page of the btree.
* This routine gets pages for disk btrees.
* Because disk btree pages must be readable across machine architectures,
* the btree code writes integers out in network format. This routine
* converts them back to host format before returning the page.
* t -- btree in which to get page
* pgno -- page number to get
* RET_SUCCESS, RET_ERROR.
/* if we have an lru cache, let the cache code do the work */
cache
= t
->bt_s
.bt_d
.d_cache
;
/* release the old page */
if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
pgno_t opgno
= t
->bt_curpage
->h_pgno
;
t
->bt_curpage
->h_flags
&= ~F_DIRTY
;
if (lruwrite(cache
, (int) opgno
) < 0)
if (lrurelease(cache
, (int) opgno
) < 0)
if (pgno
> t
->bt_npages
) {
if ((h
= (BTHEADER
*) lrugetnew(cache
, (int)pgno
, &nbytes
))
if ((h
= (BTHEADER
*) lruget(cache
, (int)pgno
, &nbytes
))
/* init this page, if necessary */
(((char *) &(h
->h_linp
[0])) - ((char *) h
));
h
->h_upper
= t
->bt_psize
;
h
->h_prevpg
= h
->h_nextpg
= P_NONE
;
/* sync the current page, if necessary */
if (t
->bt_curpage
!= (BTHEADER
*) NULL
) {
if (t
->bt_curpage
->h_flags
& F_DIRTY
)
if (_bt_write(t
, t
->bt_curpage
, RELEASE
) == RET_ERROR
)
/* if no current page, get space for one */
if ((t
->bt_curpage
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
))
/* seek to correct location in file */
if (lseek(t
->bt_s
.bt_d
.d_fd
, pos
, L_SET
) != pos
) {
if ((nbytes
= read(t
->bt_s
.bt_d
.d_fd
, t
->bt_curpage
, n
)) < n
) {
* If we didn't get a full page, we must have gotten no
* data at all -- in which case we're trying to read a
* root page that doesn't exist yet. This is the only
* case in which this is okay. If this happens, construct
* an empty root page by hand.
if (nbytes
!= 0 || pgno
!= P_ROOT
) {
h
= (BTHEADER
*) t
->bt_curpage
;
(((char *) &(h
->h_linp
[0])) - ((char *) h
));
h
->h_upper
= t
->bt_psize
;
h
->h_prevpg
= h
->h_nextpg
= P_NONE
;
(void) _bt_pgin(t
->bt_curpage
, (char *) t
->bt_lorder
);
* _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
* the host-independent format stored on disk.
* _lorder -- byte order for pages (stored as a char * in the
* cache, and passed around as a magic cookie).
* RET_SUCCESS (lru code requires a return value).
* Layout of tree metadata on the page is changed in place.
* Everywhere else in the code, the types pgno_t and index_t
* are opaque. These two routines know what they really
if (lorder
== BYTE_ORDER
)
if (h
->h_flags
& F_LEAF
) {
if (h
->h_flags
& F_CONT
) {
if (h
->h_prevpg
== P_NONE
) {
(void) bcopy((char *) &(h
->h_linp
[0]),
(void) bcopy((char *) &longsz
,
(char *) &(h
->h_linp
[0]),
for (i
= 0; i
< top
; i
++) {
d
= (DATUM
*) GETDATUM(h
, i
);
if (d
->d_flags
& D_BIGKEY
) {
(void) bcopy((char *) &(d
->d_bytes
[0]),
(void) bcopy((char *) &chain
,
(char *) &(d
->d_bytes
[0]),
if (d
->d_flags
& D_BIGDATA
) {
(void) bcopy((char *) &(d
->d_bytes
[d
->d_ksize
]),
(void) bcopy((char *) &chain
,
(char *) &(d
->d_bytes
[d
->d_ksize
]),
for (i
= 0; i
< top
; i
++) {
id
= (IDATUM
*) GETDATUM(h
, i
);
if (id
->i_flags
& D_BIGKEY
) {
(void) bcopy((char *) &(id
->i_bytes
[0]),
(void) bcopy((char *) &chain
,
(char *) &(id
->i_bytes
[0]),
* If btree pages are to be stored in the host byte order, don't
if (lorder
== BYTE_ORDER
)
if (h
->h_flags
& F_LEAF
) {
if (h
->h_flags
& F_CONT
) {
if (h
->h_prevpg
== P_NONE
) {
(void) bcopy((char *) &(h
->h_linp
[0]),
(void) bcopy((char *) &longsz
,
(char *) &(h
->h_linp
[0]),
for (i
= 0; i
< top
; i
++) {
d
= (DATUM
*) GETDATUM(h
, i
);
if (d
->d_flags
& D_BIGKEY
) {
(void) bcopy((char *) &(d
->d_bytes
[0]),
(void) bcopy((char *) &chain
,
(char *) &(d
->d_bytes
[0]),
if (d
->d_flags
& D_BIGDATA
) {
(void) bcopy((char *) &(d
->d_bytes
[d
->d_ksize
]),
(void) bcopy((char *) &chain
,
(char *) &(d
->d_bytes
[d
->d_ksize
]),
for (i
= 0; i
< top
; i
++) {
id
= (IDATUM
*) GETDATUM(h
, i
);
if (id
->i_flags
& D_BIGKEY
) {
(void) bcopy((char *) &(id
->i_bytes
[0]),
(void) bcopy((char *) &chain
,
(char *) &(id
->i_bytes
[0]),
* _BT_ALLOCPG -- allocate a new page in the btree.
* This is called when we split a page, to get space to do the split.
* For disk btrees, these pages are released when the split is done.
* For memory btrees, they are not.
* t -- tree in which to do split
* Pointer to the newly-allocated page
BTHEADER
*h
= t
->bt_curpage
;
/* if we have a cache, let the cache code do the work */
if (ISDISK(t
) && ISCACHE(t
)) {
nh
= (BTHEADER
*) lrugetnew(t
->bt_s
.bt_d
.d_cache
,
(int) (t
->bt_npages
+ 1),
nh
= (BTHEADER
*) malloc((unsigned) t
->bt_psize
);
if (nh
!= (BTHEADER
*) NULL
) {
nh
->h_pgno
= nh
->h_prevpg
= nh
->h_nextpg
= P_NONE
;
nh
->h_flags
= h
->h_flags
;
(((char *) &(nh
->h_linp
[0])) - ((char *) nh
));
nh
->h_upper
= t
->bt_psize
;
* _BT_WRITE -- Write a specific page to a btree file.
* t -- btree to get the page
* relflag -- (int) this page may/may not be released
* RET_SUCCESS, RET_ERROR.
* Writes a metadata page if none has been written yet.
/* if we haven't done so yet, write the metadata */
if (!(t
->bt_flags
& BTF_METAOK
)) {
if (_bt_wrtmeta(t
) == RET_ERROR
)
/* if we have a cache, let the cache code do the work */
if ((cache
= t
->bt_s
.bt_d
.d_cache
) != (char *) NULL
) {
if (lruwrite(cache
, (int) pgno
) < 0)
if (relflag
&& lrurelease(cache
, (int) pgno
) < 0)
(void) _bt_pgout(h
, (char *) t
->bt_lorder
);
/* now write the current page */
pos
= (long) (pgno
* t
->bt_psize
);
if (lseek(t
->bt_s
.bt_d
.d_fd
, pos
, L_SET
) != pos
)
if (write(t
->bt_s
.bt_d
.d_fd
, (char *) h
, (int) t
->bt_psize
)
(void) _bt_pgin(h
, (char *) t
->bt_lorder
);
htindex
= HASHKEY(h
->h_pgno
);
/* see if we need to overwrite existing entry */
for (b
= t
->bt_s
.bt_ht
[htindex
];
if (b
->ht_pgno
== h
->h_pgno
) {
/* new entry, write it */
b
= (HTBUCKET
*) malloc((unsigned) sizeof (HTBUCKET
));
if (b
== (HTBUCKET
*) NULL
)
b
->ht_next
= t
->bt_s
.bt_ht
[htindex
];
t
->bt_s
.bt_ht
[htindex
] = b
;
* _BT_RELEASE -- Release a locked-down cache page
* During page splits, we want to force pages out to the cache
* before we actually release them, in some cases. This routine
* releases such a page when it is no longer needed.
* t -- btree in which to release page
* RET_SUCCESS, RET_ERROR.
if (ISDISK(t
) && ISCACHE(t
)) {
if (lrurelease(t
->bt_s
.bt_d
.d_cache
, (int) (h
->h_pgno
)) < 0)
* _BT_WRTMETA -- Write metadata to the btree.
* t -- tree to which to write
* RET_SUCCESS, RET_ERROR.
if (lseek(t
->bt_s
.bt_d
.d_fd
, 0l, L_SET
) != 0l)
/* lorder has to be in host-independent format */
m
.m_lorder
= (u_long
) htonl((long) t
->bt_lorder
);
m
.m_version
= BTREEVERSION
;
m
.m_flags
= t
->bt_flags
& BTF_NODUPS
;
if (t
->bt_lorder
!= BYTE_ORDER
) {
if (write(t
->bt_s
.bt_d
.d_fd
, (char *) &m
, sizeof(m
))
t
->bt_flags
|= BTF_METAOK
;