* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
* This code is derived from software contributed to Berkeley by
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid
[] = "@(#)bt_split.c 8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */
#define __DBINTERFACE_PRIVATE
static int bt_broot
__P((BTREE
*, PAGE
*, PAGE
*, PAGE
*));
__P((BTREE
*, PAGE
*, PAGE
**, PAGE
**, u_int
*, size_t));
static int bt_preserve
__P((BTREE
*, pgno_t
));
__P((BTREE
*, PAGE
*, PAGE
*, PAGE
*, u_int
*, size_t));
__P((BTREE
*, PAGE
*, PAGE
**, PAGE
**, u_int
*, size_t));
static int bt_rroot
__P((BTREE
*, PAGE
*, PAGE
*, PAGE
*));
static recno_t rec_total
__P((PAGE
*));
u_long bt_rootsplit
, bt_split
, bt_sortsplit
, bt_pfxsaved
;
* __BT_SPLIT -- Split the tree.
* flags: BIGKEY/BIGDATA flags
* skip: index to leave open
__bt_split(t
, sp
, key
, data
, flags
, ilen
, skip
)
PAGE
*h
, *l
, *r
, *lchild
, *rchild
;
size_t n
, nbytes
, nksize
;
* Split the page into two pages, l and r. The split routines return
* a pointer to the page into which the key should be inserted and with
* skip set to the offset which should be used. Additionally, l and r
bt_root(t
, sp
, &l
, &r
, &skip
, ilen
) :
bt_page(t
, sp
, &l
, &r
, &skip
, ilen
);
* Insert the new key/data pair into the leaf page. (Key inserts
* always cause a leaf page to split first.)
h
->linp
[skip
] = h
->upper
-= ilen
;
dest
= (char *)h
+ h
->upper
;
WR_RLEAF(dest
, data
, flags
)
WR_BLEAF(dest
, key
, data
, flags
)
/* If the root page was split, make it look right. */
if (sp
->pgno
== P_ROOT
&&
bt_rroot(t
, sp
, l
, r
) : bt_broot(t
, sp
, l
, r
)) == RET_ERROR
)
* Now we walk the parent page stack -- a LIFO stack of the pages that
* were traversed when we searched for the page that split. Each stack
* entry is a page number and a page index offset. The offset is for
* the page traversed on the search. We've just split a page, so we
* have to insert a new key into the parent page.
* If the insert into the parent page causes it to split, may have to
* continue splitting all the way up the tree. We stop if the root
* splits or the page inserted into didn't have to split to hold the
* new key. Some algorithms replace the key for the old page as well
* as the new page. We don't, as there's no reason to believe that the
* first key on the old page is any better than the key we have, and,
* in the case of a key being placed at index 0 causing the split, the
* There are a maximum of 5 pages pinned at any time. We keep the left
* and right pages pinned while working on the parent. The 5 are the
* two children, left parent and right parent (when the parent splits)
* and the root page or the overflow key page when calling bt_preserve.
* This code must make sure that all pins are released other than the
* root page or overflow page which is unlocked elsewhere.
while ((parent
= BT_POP(t
)) != NULL
) {
/* Get the parent page. */
if ((h
= mpool_get(t
->bt_mp
, parent
->pgno
, 0)) == NULL
)
* The new key goes ONE AFTER the index, because the split
skip
= parent
->index
+ 1;
* Calculate the space needed on the parent page.
* Prefix trees: space hack when inserting into BINTERNAL
* pages. Retain only what's needed to distinguish between
* the new entry and the LAST entry on the page to its left.
* If the keys compare equal, retain the entire key. Note,
* we don't touch overflow keys, and the entire key must be
* retained for the next-to-left most key on the leftmost
* page of each level, or the search will fail. Applicable
* ONLY to internal pages that have leaf pages as children.
* Further reduction of the key between pairs of internal
* pages loses too much information.
switch (rchild
->flags
& P_TYPE
) {
bi
= GETBINTERNAL(rchild
, 0);
nbytes
= NBINTERNAL(bi
->ksize
);
bl
= GETBLEAF(rchild
, 0);
nbytes
= NBINTERNAL(bl
->ksize
);
if (t
->bt_pfx
&& !(bl
->flags
& P_BIGKEY
) &&
(h
->prevpg
!= P_INVALID
|| skip
> 1)) {
tbl
= GETBLEAF(lchild
, NEXTINDEX(lchild
) - 1);
nksize
= t
->bt_pfx(&a
, &b
);
bt_pfxsaved
+= nbytes
- n
;
/* Split the parent page if necessary or shift the indices. */
if (h
->upper
- h
->lower
< nbytes
+ sizeof(indx_t
)) {
bt_root(t
, h
, &l
, &r
, &skip
, nbytes
) :
bt_page(t
, h
, &l
, &r
, &skip
, nbytes
);
if (skip
< (nxtindex
= NEXTINDEX(h
)))
memmove(h
->linp
+ skip
+ 1, h
->linp
+ skip
,
(nxtindex
- skip
) * sizeof(indx_t
));
h
->lower
+= sizeof(indx_t
);
/* Insert the key into the parent page. */
switch(rchild
->flags
& P_TYPE
) {
h
->linp
[skip
] = h
->upper
-= nbytes
;
dest
= (char *)h
+ h
->linp
[skip
];
memmove(dest
, bi
, nbytes
);
((BINTERNAL
*)dest
)->pgno
= rchild
->pgno
;
h
->linp
[skip
] = h
->upper
-= nbytes
;
dest
= (char *)h
+ h
->linp
[skip
];
WR_BINTERNAL(dest
, nksize
? nksize
: bl
->ksize
,
rchild
->pgno
, bl
->flags
& P_BIGKEY
);
memmove(dest
, bl
->bytes
, nksize
? nksize
: bl
->ksize
);
if (bl
->flags
& P_BIGKEY
&&
bt_preserve(t
, *(pgno_t
*)bl
->bytes
) == RET_ERROR
)
* Update the left page count. If split
* added at index 0, fix the correct page.
dest
= (char *)h
+ h
->linp
[skip
- 1];
dest
= (char *)l
+ l
->linp
[NEXTINDEX(l
) - 1];
((RINTERNAL
*)dest
)->nrecs
= rec_total(lchild
);
((RINTERNAL
*)dest
)->pgno
= lchild
->pgno
;
/* Update the right page count. */
h
->linp
[skip
] = h
->upper
-= nbytes
;
dest
= (char *)h
+ h
->linp
[skip
];
((RINTERNAL
*)dest
)->nrecs
= rec_total(rchild
);
((RINTERNAL
*)dest
)->pgno
= rchild
->pgno
;
* Update the left page count. If split
* added at index 0, fix the correct page.
dest
= (char *)h
+ h
->linp
[skip
- 1];
dest
= (char *)l
+ l
->linp
[NEXTINDEX(l
) - 1];
((RINTERNAL
*)dest
)->nrecs
= NEXTINDEX(lchild
);
((RINTERNAL
*)dest
)->pgno
= lchild
->pgno
;
/* Update the right page count. */
h
->linp
[skip
] = h
->upper
-= nbytes
;
dest
= (char *)h
+ h
->linp
[skip
];
((RINTERNAL
*)dest
)->nrecs
= NEXTINDEX(rchild
);
((RINTERNAL
*)dest
)->pgno
= rchild
->pgno
;
/* Unpin the held pages. */
mpool_put(t
->bt_mp
, h
, MPOOL_DIRTY
);
/* If the root page was split, make it look right. */
if (sp
->pgno
== P_ROOT
&&
bt_rroot(t
, sp
, l
, r
) : bt_broot(t
, sp
, l
, r
)) == RET_ERROR
)
mpool_put(t
->bt_mp
, lchild
, MPOOL_DIRTY
);
mpool_put(t
->bt_mp
, rchild
, MPOOL_DIRTY
);
/* Unpin the held pages. */
mpool_put(t
->bt_mp
, l
, MPOOL_DIRTY
);
mpool_put(t
->bt_mp
, r
, MPOOL_DIRTY
);
/* Clear any pages left on the stack. */
* If something fails in the above loop we were already walking back
* up the tree and the tree is now inconsistent. Nothing much we can
* do about it but release any memory we're holding.
err1
: mpool_put(t
->bt_mp
, lchild
, MPOOL_DIRTY
);
mpool_put(t
->bt_mp
, rchild
, MPOOL_DIRTY
);
err2
: mpool_put(t
->bt_mp
, l
, 0);
mpool_put(t
->bt_mp
, r
, 0);
* BT_PAGE -- Split a non-root page of a btree.
* lp: pointer to left page pointer
* rp: pointer to right page pointer
* skip: pointer to index to leave open
* Pointer to page in which to insert or NULL on error.
bt_page(t
, h
, lp
, rp
, skip
, ilen
)
/* Put the new right page for the split into place. */
if ((r
= __bt_new(t
, &npg
)) == NULL
)
r
->flags
= h
->flags
& P_TYPE
;
* If we're splitting the last page on a level because we're appending
* a key to it (skip is NEXTINDEX()), it's likely that the data is
* sorted. Adding an empty page on the side of the level is less work
* and can push the fill factor much higher than normal. If we're
* wrong it's no big deal, we'll just do the split the right way next
* time. It may look like it's equally easy to do a similar hack for
* reverse sorted data, that is, split the tree left, but it's not.
if (h
->nextpg
== P_INVALID
&& *skip
== NEXTINDEX(h
)) {
r
->lower
= BTDATAOFF
+ sizeof(indx_t
);
/* Put the new left page for the split into place. */
if ((l
= malloc(t
->bt_psize
)) == NULL
) {
mpool_put(t
->bt_mp
, r
, 0);
l
->flags
= h
->flags
& P_TYPE
;
/* Fix up the previous pointer of the page after the split page. */
if (h
->nextpg
!= P_INVALID
) {
if ((tp
= mpool_get(t
->bt_mp
, h
->nextpg
, 0)) == NULL
) {
/* XXX mpool_free(t->bt_mp, r->pgno); */
mpool_put(t
->bt_mp
, tp
, 0);
* Split right. The key/data pairs aren't sorted in the btree page so
* it's simpler to copy the data from the split page onto two new pages
* instead of copying half the data to the right page and compacting
* the left page in place. Since the left page can't change, we have
* to swap the original and the allocated left page after the split.
tp
= bt_psplit(t
, h
, l
, r
, skip
, ilen
);
/* Move the new left page onto the old left page. */
memmove(h
, l
, t
->bt_psize
);
* BT_ROOT -- Split the root page of a btree.
* lp: pointer to left page pointer
* rp: pointer to right page pointer
* skip: pointer to index to leave open
* Pointer to page in which to insert or NULL on error.
bt_root(t
, h
, lp
, rp
, skip
, ilen
)
/* Put the new left and right pages for the split into place. */
if ((l
= __bt_new(t
, &lnpg
)) == NULL
||
(r
= __bt_new(t
, &rnpg
)) == NULL
)
l
->prevpg
= r
->nextpg
= P_INVALID
;
l
->lower
= r
->lower
= BTDATAOFF
;
l
->upper
= r
->upper
= t
->bt_psize
;
l
->flags
= r
->flags
= h
->flags
& P_TYPE
;
/* Split the root page. */
tp
= bt_psplit(t
, h
, l
, r
, skip
, ilen
);
* BT_RROOT -- Fix up the recno root page after it has been split.
/* Insert the left and right keys, set the header information. */
h
->linp
[0] = h
->upper
= t
->bt_psize
- NRINTERNAL
;
dest
= (char *)h
+ h
->upper
;
l
->flags
& P_RLEAF
? NEXTINDEX(l
) : rec_total(l
), l
->pgno
);
h
->linp
[1] = h
->upper
-= NRINTERNAL
;
dest
= (char *)h
+ h
->upper
;
r
->flags
& P_RLEAF
? NEXTINDEX(r
) : rec_total(r
), r
->pgno
);
h
->lower
= BTDATAOFF
+ 2 * sizeof(indx_t
);
/* Unpin the root page, set to recno internal page. */
mpool_put(t
->bt_mp
, h
, MPOOL_DIRTY
);
* BT_BROOT -- Fix up the btree root page after it has been split.
* If the root page was a leaf page, change it into an internal page.
* We copy the key we split on (but not the key's data, in the case of
* a leaf page) to the new root page.
* The btree comparison code guarantees that the left-most key on any
* level of the tree is never used, so it doesn't need to be filled in.
h
->linp
[0] = h
->upper
= t
->bt_psize
- nbytes
;
dest
= (char *)h
+ h
->upper
;
WR_BINTERNAL(dest
, 0, l
->pgno
, 0);
switch(h
->flags
& P_TYPE
) {
nbytes
= NBINTERNAL(bl
->ksize
);
h
->linp
[1] = h
->upper
-= nbytes
;
dest
= (char *)h
+ h
->upper
;
WR_BINTERNAL(dest
, bl
->ksize
, r
->pgno
, 0);
memmove(dest
, bl
->bytes
, bl
->ksize
);
* If the key is on an overflow page, mark the overflow chain
* so it isn't deleted when the leaf copy of the key is deleted.
if (bl
->flags
& P_BIGKEY
&&
bt_preserve(t
, *(pgno_t
*)bl
->bytes
) == RET_ERROR
)
nbytes
= NBINTERNAL(bi
->ksize
);
h
->linp
[1] = h
->upper
-= nbytes
;
dest
= (char *)h
+ h
->upper
;
memmove(dest
, bi
, nbytes
);
((BINTERNAL
*)dest
)->pgno
= r
->pgno
;
/* There are two keys on the page. */
h
->lower
= BTDATAOFF
+ 2 * sizeof(indx_t
);
/* Unpin the root page, set to btree internal page. */
mpool_put(t
->bt_mp
, h
, MPOOL_DIRTY
);
* BT_PSPLIT -- Do the real work of splitting the page.
* l: page to put lower half of data
* r: page to put upper half of data
* pskip: pointer to index to leave open
* Pointer to page in which to insert.
bt_psplit(t
, h
, l
, r
, pskip
, ilen
)
indx_t full
, half
, nxt
, off
, skip
, top
, used
;
* Split the data to the left and right pages. Leave the skip index
* open. Additionally, make some effort not to split on an overflow
* key. This makes internal page processing faster and can save
* space as overflow keys used by internal pages are never deleted.
full
= t
->bt_psize
- BTDATAOFF
;
for (nxt
= off
= 0, top
= NEXTINDEX(h
); nxt
< top
; ++off
) {
isbigkey
= 0; /* XXX: not really known. */
switch (h
->flags
& P_TYPE
) {
src
= bi
= GETBINTERNAL(h
, nxt
);
nbytes
= NBINTERNAL(bi
->ksize
);
isbigkey
= bi
->flags
& P_BIGKEY
;
src
= bl
= GETBLEAF(h
, nxt
);
isbigkey
= bl
->flags
& P_BIGKEY
;
src
= GETRINTERNAL(h
, nxt
);
src
= rl
= GETRLEAF(h
, nxt
);
* If the key/data pairs are substantial fractions of the max
* possible size for the page, it's possible to get situations
* where we decide to try and copy too much onto the left page.
* Make sure that doesn't happen.
if (skip
<= off
&& used
+ nbytes
>= full
) {
/* Copy the key/data pair, if not the skipped index. */
l
->linp
[off
] = l
->upper
-= nbytes
;
memmove((char *)l
+ l
->upper
, src
, nbytes
);
if (!isbigkey
|| bigkeycnt
== 3)
* Off is the last offset that's valid for the left page.
* Nxt is the first offset to be placed on the right page.
l
->lower
+= (off
+ 1) * sizeof(indx_t
);
* If splitting the page that the cursor was on, the cursor has to be
* adjusted to point to the same record as before the split. If the
* cursor is at or past the skipped slot, the cursor is incremented by
* one. If the cursor is on the right page, it is decremented by the
* number of records split to the left page.
* Don't bother checking for the B_SEQINIT flag, the page number will
if (c
->pgno
== h
->pgno
) {
if (c
->index
< nxt
) /* Left page. */
* If the skipped index was on the left page, just return that page.
* Otherwise, adjust the skip index to reflect the new position on
for (off
= 0; nxt
< top
; ++off
) {
switch (h
->flags
& P_TYPE
) {
src
= bi
= GETBINTERNAL(h
, nxt
);
nbytes
= NBINTERNAL(bi
->ksize
);
src
= bl
= GETBLEAF(h
, nxt
);
src
= GETRINTERNAL(h
, nxt
);
src
= rl
= GETRLEAF(h
, nxt
);
r
->linp
[off
] = r
->upper
-= nbytes
;
memmove((char *)r
+ r
->upper
, src
, nbytes
);
r
->lower
+= off
* sizeof(indx_t
);
/* If the key is being appended to the page, adjust the index. */
r
->lower
+= sizeof(indx_t
);
* BT_PRESERVE -- Mark a chain of pages as used by an internal node.
* Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
* record that references them gets deleted. Chains pointed to by internal
* pages never get deleted. This routine marks a chain as pointed to by an
* pg: page number of first page in the chain.
* RET_SUCCESS, RET_ERROR.
if ((h
= mpool_get(t
->bt_mp
, pg
, 0)) == NULL
)
mpool_put(t
->bt_mp
, h
, MPOOL_DIRTY
);
* REC_TOTAL -- Return the number of recno entries below a page.
* The number of recno entries below a page.
* These values could be set by the bt_psplit routine. The problem is that the
* entry has to be popped off of the stack etc. or the values have to be passed
* all the way back to bt_split/bt_rroot and it's not very clean.
for (recs
= 0, nxt
= 0, top
= NEXTINDEX(h
); nxt
< top
; ++nxt
)
recs
+= GETRINTERNAL(h
, nxt
)->nrecs
;