implement free page reuse
[unix-history] / usr / src / lib / libc / db / btree / bt_split.c
CommitLineData
876dbe65
MO
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * %sccs.include.redist.c%
9 */
10
11#if defined(LIBC_SCCS) && !defined(lint)
a5d29441 12static char sccsid[] = "@(#)bt_split.c 5.7 (Berkeley) %G%";
876dbe65
MO
13#endif /* LIBC_SCCS and not lint */
14
15#include <sys/types.h>
91b5c9e5 16#define __DBINTERFACE_PRIVATE
876dbe65 17#include <db.h>
91b5c9e5
KB
18#include <limits.h>
19#include <stdio.h>
fb91cf55
KB
20#include <stdlib.h>
21#include <string.h>
876dbe65
MO
22#include "btree.h"
23
91b5c9e5
KB
24static int bt_preserve __P((BTREE *, pgno_t));
25static PAGE *bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *));
26static PAGE *bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
27static PAGE *bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
28static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
29static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
30static recno_t rec_total __P((PAGE *));
31
32#ifdef STATISTICS
33u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
34#endif
35
876dbe65 36/*
91b5c9e5 37 * __BT_SPLIT -- Split the tree.
876dbe65 38 *
91b5c9e5
KB
39 * Parameters:
40 * t: tree
41 * h: page to split
42 * key: key to insert
43 * data: data to insert
44 * flags: BIGKEY/BIGDATA flags
45 * nbytes: length of insertion
46 * skip: index to leave open
876dbe65 47 *
91b5c9e5
KB
48 * Returns:
49 * RET_ERROR, RET_SUCCESS
876dbe65 50 */
876dbe65 51int
91b5c9e5
KB
52__bt_split(t, h, key, data, flags, nbytes, skip)
53 BTREE *t;
54 PAGE *h;
55 const DBT *key, *data;
56 u_long flags;
57 size_t nbytes;
58 int skip;
876dbe65 59{
91b5c9e5
KB
60 BINTERNAL *bi;
61 BLEAF *bl;
62 DBT a, b;
63 EPGNO *parent;
64 PAGE *l, *r, *lchild, *rchild;
65 index_t nxtindex;
66 size_t nksize;
67 int nosplit;
68 char *dest;
876dbe65
MO
69
70 /*
91b5c9e5
KB
71 * Split the page into two pages, l and r. The split routines return
72 * a pointer to the page into which the key should be inserted and skip
73 * set to the offset which should be used. Additionally, l and r are
74 * pinned.
876dbe65 75 */
91b5c9e5
KB
76 h = h->pgno == P_ROOT ?
77 bt_root(t, h, &l, &r, &skip) : bt_page(t, h, &l, &r, &skip);
78 if (h == NULL)
876dbe65
MO
79 return (RET_ERROR);
80
91b5c9e5
KB
81 /*
82 * Grab the space and insert the [rb]leaf structure. Always a [rb]leaf
83 * structure since key inserts always cause a leaf page to split first.
84 */
85 h->linp[skip] = h->upper -= nbytes;
86 dest = (char *)h + h->upper;
f1641417 87 if (ISSET(t, BTF_RECNO))
91b5c9e5 88 WR_RLEAF(dest, data, flags)
f1641417 89 else
91b5c9e5 90 WR_BLEAF(dest, key, data, flags)
876dbe65
MO
91
92 /*
91b5c9e5
KB
93 * Now we walk the parent page stack -- a LIFO stack of the pages that
94 * were traversed when we searched for the page that split. Each stack
95 * entry is a page number and a page index offset. The offset is for
96 * the page traversed on the search. We've just split a page, so we
97 * have to insert a new key into the parent page.
98 *
99 * If the insert into the parent page causes it to split, may have to
100 * continue splitting all the way up the tree. We stop if the root
101 * splits or the page inserted into didn't have to split to hold the
102 * new key. Some algorithms replace the key for the old page as well
103 * as the new page. We don't, as there's no reason to believe that the
104 * first key on the old page is any better than the key we have, and,
105 * in the case of a key being placed at index 0 causing the split, the
106 * key is unavailable.
107 *
108 * There are a maximum of 5 pages pinned at any time. We keep the left
109 * and right pages pinned while working on the parent. The 5 are the
110 * two children, left parent and right parent (when the parent splits)
111 * and the root page or the overflow key page when calling bt_preserve.
112 * This code must make sure that all pins are released other than the
113 * root page or overflow page which is unlocked elsewhere.
876dbe65 114 */
91b5c9e5
KB
115 for (nosplit = 0; (parent = BT_POP(t)) != NULL;) {
116 lchild = l;
117 rchild = r;
876dbe65 118
91b5c9e5
KB
119 /* Get the parent page. */
120 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
121 goto err2;
876dbe65 122
91b5c9e5
KB
123 /* The new key goes ONE AFTER the index. */
124 skip = parent->index + 1;
876dbe65 125
91b5c9e5
KB
126 /*
127 * Calculate the space needed on the parent page.
128 *
f1641417 129 * Space hack when inserting into BINTERNAL pages. Only need to
91b5c9e5 130 * retain the number of bytes that will distinguish between the
f1641417
KB
131 * new entry and the LAST entry on the page to its left. If the
132 * keys compare equal, retain the entire key. Note, we don't
133 * touch overflow keys and the entire key must be retained for
134 * the next-to-leftmost key on the leftmost page of each level,
135 * or the search will fail.
91b5c9e5
KB
136 */
137 switch (rchild->flags & P_TYPE) {
138 case P_BINTERNAL:
139 bi = GETBINTERNAL(rchild, 0);
140 nbytes = NBINTERNAL(bi->ksize);
141 if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
142 !(bi->flags & P_BIGKEY)) {
143 BINTERNAL *tbi;
144 tbi =
145 GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1);
146 a.size = tbi->ksize;
147 a.data = tbi->bytes;
148 b.size = bi->ksize;
149 b.data = bi->bytes;
150 goto prefix;
876dbe65 151 }
91b5c9e5
KB
152 break;
153 case P_BLEAF:
154 bl = GETBLEAF(rchild, 0);
f701460d 155 nbytes = NBINTERNAL(bl->ksize);
91b5c9e5
KB
156 if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
157 !(bl->flags & P_BIGKEY)) {
158 BLEAF *tbl;
159 size_t n;
160
161 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
162 a.size = tbl->ksize;
163 a.data = tbl->bytes;
164 b.size = bl->ksize;
165 b.data = bl->bytes;
166prefix: nksize = t->bt_pfx(&a, &b);
167 n = NBINTERNAL(nksize);
168 if (n < nbytes) {
169#ifdef STATISTICS
170 bt_pfxsaved += nbytes - n;
171#endif
172 nbytes = n;
173 } else
174 nksize = 0;
175 } else
176 nksize = 0;
177 break;
178 case P_RINTERNAL:
179 case P_RLEAF:
180 nbytes = NRINTERNAL;
181 break;
876dbe65 182 }
876dbe65 183
91b5c9e5
KB
184 /* Split the parent page if necessary or shift the indices. */
185 if (h->upper - h->lower < nbytes + sizeof(index_t)) {
186 h = h->pgno == P_ROOT ?
187 bt_root(t, h, &l, &r, &skip) :
188 bt_page(t, h, &l, &r, &skip);
189 if (h == NULL)
190 goto err1;
191 } else {
192 if (skip < (nxtindex = NEXTINDEX(h)))
193 bcopy(h->linp + skip, h->linp + skip + 1,
194 (nxtindex - skip) * sizeof(index_t));
195 h->lower += sizeof(index_t);
196 nosplit = 1;
197 }
876dbe65 198
91b5c9e5
KB
199 /* Insert the key into the parent page. */
200 switch(rchild->flags & P_TYPE) {
201 case P_BINTERNAL:
202 h->linp[skip] = h->upper -= nbytes;
203 dest = (char *)h + h->linp[skip];
204 bcopy(bi, dest, nbytes);
205 if (nksize)
206 ((BINTERNAL *)dest)->ksize = nksize;
207 ((BINTERNAL *)dest)->pgno = rchild->pgno;
208 break;
209 case P_BLEAF:
210 h->linp[skip] = h->upper -= nbytes;
211 dest = (char *)h + h->linp[skip];
212 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
f1641417 213 rchild->pgno, bl->flags & P_BIGKEY);
91b5c9e5
KB
214 bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize);
215 if (bl->flags & P_BIGKEY &&
216 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
217 goto err1;
218 break;
219 case P_RINTERNAL:
220 /* Update both left and right page counts. */
221 h->linp[skip] = h->upper -= nbytes;
222 dest = (char *)h + h->linp[skip];
223 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
224 ((RINTERNAL *)dest)->pgno = rchild->pgno;
225 dest = (char *)h + h->linp[skip - 1];
226 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
227 ((RINTERNAL *)dest)->pgno = lchild->pgno;
228 break;
229 case P_RLEAF:
230 /* Update both left and right page counts. */
231 h->linp[skip] = h->upper -= nbytes;
232 dest = (char *)h + h->linp[skip];
233 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
234 ((RINTERNAL *)dest)->pgno = rchild->pgno;
235 dest = (char *)h + h->linp[skip - 1];
236 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
237 ((RINTERNAL *)dest)->pgno = lchild->pgno;
238 break;
876dbe65
MO
239 }
240
91b5c9e5
KB
241 /* Unpin the held pages. */
242 if (nosplit) {
243 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
244 break;
245 }
246 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
247 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
876dbe65
MO
248 }
249
91b5c9e5
KB
250 /* Unpin the held pages. */
251 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
252 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
253
f1641417
KB
254 /* Clear any pages left on the stack. */
255 BT_CLR(t);
91b5c9e5 256 return (RET_SUCCESS);
876dbe65 257
91b5c9e5
KB
258 /*
259 * If something fails in the above loop we were already walking back
260 * up the tree and the tree is now inconsistent. Nothing much we can
261 * do about it but release any memory we're holding.
262 */
263err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
264 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
876dbe65 265
91b5c9e5
KB
266err2: mpool_put(t->bt_mp, l, 0);
267 mpool_put(t->bt_mp, r, 0);
268 __dbpanic(t->bt_dbp);
269 return (RET_ERROR);
876dbe65
MO
270}
271
272/*
91b5c9e5 273 * BT_PAGE -- Split a non-root page of a btree.
876dbe65 274 *
91b5c9e5
KB
275 * Parameters:
276 * t: tree
277 * h: root page
278 * lp: pointer to left page pointer
279 * rp: pointer to right page pointer
280 * skip: pointer to index to leave open
876dbe65 281 *
91b5c9e5
KB
282 * Returns:
283 * Pointer to page in which to insert or NULL on error.
876dbe65 284 */
91b5c9e5
KB
285static PAGE *
286bt_page(t, h, lp, rp, skip)
287 BTREE *t;
288 PAGE *h, **lp, **rp;
289 int *skip;
876dbe65 290{
91b5c9e5
KB
291 PAGE *l, *r, *tp;
292 pgno_t npg;
293
294#ifdef STATISTICS
295 ++bt_split;
296#endif
297 /* Put the new right page for the split into place. */
a5d29441 298 if ((r = __bt_new(t, &npg)) == NULL)
91b5c9e5
KB
299 return (NULL);
300 r->pgno = npg;
301 r->lower = BTDATAOFF;
302 r->upper = t->bt_psize;
303 r->nextpg = h->nextpg;
304 r->prevpg = h->pgno;
305 r->flags = h->flags & P_TYPE;
876dbe65 306
91b5c9e5
KB
307 /*
308 * If we're splitting the last page on a level because we're appending
309 * a key to it (skip is NEXTINDEX()), it's likely that the data is
310 * sorted. Adding an empty page on the side of the level is less work
311 * and can push the fill factor much higher than normal. If we're
312 * wrong it's no big deal, we'll just do the split the right way next
313 * time. It may look like it's equally easy to do a similar hack for
314 * reverse sorted data, that is, split the tree left, but it's not.
315 * Don't even try.
316 */
317 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
318#ifdef STATISTICS
319 ++bt_sortsplit;
320#endif
321 h->nextpg = r->pgno;
322 r->lower = BTDATAOFF + sizeof(index_t);
323 *skip = 0;
324 *lp = h;
325 *rp = r;
326 return (r);
327 }
876dbe65 328
91b5c9e5
KB
329 /* Put the new left page for the split into place. */
330 if ((l = malloc(t->bt_psize)) == NULL) {
331 mpool_put(t->bt_mp, r, 0);
332 return (NULL);
333 }
334 l->pgno = h->pgno;
335 l->nextpg = r->pgno;
336 l->prevpg = h->prevpg;
337 l->lower = BTDATAOFF;
338 l->upper = t->bt_psize;
339 l->flags = h->flags & P_TYPE;
340
341 /* Fix up the previous pointer of the page after the split page. */
342 if (h->nextpg != P_INVALID) {
343 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
344 free(l);
345 /* XXX mpool_free(t->bt_mp, r->pgno); */
346 return (NULL);
876dbe65 347 }
91b5c9e5
KB
348 tp->prevpg = r->pgno;
349 mpool_put(t->bt_mp, tp, 0);
350 }
876dbe65 351
91b5c9e5
KB
352 /*
353 * Split right. The key/data pairs aren't sorted in the btree page so
354 * it's simpler to copy the data from the split page onto two new pages
355 * instead of copying half the data to the right page and compacting
356 * the left page in place. Since the left page can't change, we have
357 * to swap the original and the allocated left page after the split.
358 */
359 tp = bt_psplit(t, h, l, r, skip);
876dbe65 360
91b5c9e5
KB
361 /* Move the new left page onto the old left page. */
362 bcopy(l, h, t->bt_psize);
363 if (tp == l)
364 tp = h;
365 free(l);
366
367 *lp = h;
368 *rp = r;
369 return (tp);
876dbe65
MO
370}
371
372/*
f1641417 373 * BT_ROOT -- Split the root page of a btree.
876dbe65 374 *
91b5c9e5
KB
375 * Parameters:
376 * t: tree
377 * h: root page
378 * lp: pointer to left page pointer
379 * rp: pointer to right page pointer
380 * skip: pointer to index to leave open
876dbe65 381 *
91b5c9e5
KB
382 * Returns:
383 * Pointer to page in which to insert or NULL on error.
876dbe65 384 */
91b5c9e5
KB
385static PAGE *
386bt_root(t, h, lp, rp, skip)
387 BTREE *t;
388 PAGE *h, **lp, **rp;
389 int *skip;
876dbe65 390{
91b5c9e5
KB
391 PAGE *l, *r, *tp;
392 pgno_t lnpg, rnpg;
393
394#ifdef STATISTICS
395 ++bt_split;
396 ++bt_rootsplit;
397#endif
398 /* Put the new left and right pages for the split into place. */
a5d29441
KB
399 if ((l = __bt_new(t, &lnpg)) == NULL ||
400 (r = __bt_new(t, &rnpg)) == NULL)
91b5c9e5
KB
401 return (NULL);
402 l->pgno = lnpg;
403 r->pgno = rnpg;
404 l->nextpg = r->pgno;
405 r->prevpg = l->pgno;
406 l->prevpg = r->nextpg = P_INVALID;
407 l->lower = r->lower = BTDATAOFF;
408 l->upper = r->upper = t->bt_psize;
409 l->flags = r->flags = h->flags & P_TYPE;
410
411 /* Split the root page. */
412 tp = bt_psplit(t, h, l, r, skip);
413
414 /* Make the root page look right. */
415 if ((ISSET(t, BTF_RECNO) ?
416 bt_rroot(t, h, l, r) : bt_broot(t, h, l, r)) == RET_ERROR)
417 return (NULL);
418
419 *lp = l;
420 *rp = r;
421 return (tp);
876dbe65
MO
422}
423
424/*
91b5c9e5 425 * BT_RROOT -- Fix up the recno root page after the split.
876dbe65 426 *
91b5c9e5
KB
427 * Parameters:
428 * t: tree
429 * h: root page
876dbe65 430 *
91b5c9e5
KB
431 * Returns:
432 * RET_ERROR, RET_SUCCESS
876dbe65 433 */
91b5c9e5
KB
434static int
435bt_rroot(t, h, l, r)
436 BTREE *t;
437 PAGE *h, *l, *r;
876dbe65 438{
91b5c9e5 439 char *dest;
876dbe65 440
91b5c9e5
KB
441 /* Insert the left and right keys, set the header information. */
442 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
443 dest = (char *)h + h->upper;
444 WR_RINTERNAL(dest,
445 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
876dbe65 446
91b5c9e5
KB
447 h->linp[1] = h->upper -= NRINTERNAL;
448 dest = (char *)h + h->upper;
449 WR_RINTERNAL(dest,
450 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
876dbe65 451
91b5c9e5 452 h->lower = BTDATAOFF + 2 * sizeof(index_t);
876dbe65 453
91b5c9e5
KB
454 /* Unpin the root page, set to recno internal page. */
455 h->flags &= ~P_TYPE;
456 h->flags |= P_RINTERNAL;
457 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
876dbe65 458
91b5c9e5
KB
459 return (RET_SUCCESS);
460}
876dbe65 461
91b5c9e5
KB
462/*
463 * BT_BROOT -- Fix up the btree root page after the split.
464 *
465 * Parameters:
466 * t: tree
467 * h: root page
468 *
469 * Returns:
470 * RET_ERROR, RET_SUCCESS
471 */
472static int
473bt_broot(t, h, l, r)
474 BTREE *t;
475 PAGE *h, *l, *r;
476{
477 BINTERNAL *bi;
478 BLEAF *bl;
479 size_t nbytes;
480 char *dest;
876dbe65
MO
481
482 /*
91b5c9e5
KB
483 * If the root page was a leaf page, change it into an internal page.
484 * We copy the key we split on (but not the key's data, in the case of
f701460d 485 * a leaf page) to the new root page.
91b5c9e5
KB
486 *
487 * The btree comparison code guarantees that the left-most key on any
488 * level of the tree is never used, so it doesn't need to be filled
489 * in. (This is not just convenience -- if the insert index is 0, we
490 * don't *have* a key to fill in.) The right key is available because
491 * the split code guarantees not to split on the skipped index.
876dbe65 492 */
f701460d 493 nbytes = NBINTERNAL(0);
91b5c9e5
KB
494 h->linp[0] = h->upper = t->bt_psize - nbytes;
495 dest = (char *)h + h->upper;
496 WR_BINTERNAL(dest, 0, l->pgno, 0);
497
498 switch(h->flags & P_TYPE) {
499 case P_BLEAF:
500 bl = GETBLEAF(r, 0);
501 nbytes = NBINTERNAL(bl->ksize);
502 h->linp[1] = h->upper -= nbytes;
503 dest = (char *)h + h->upper;
504 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
505 bcopy(bl->bytes, dest, bl->ksize);
506
f701460d
KB
507 /*
508 * If the key is on an overflow page, mark the overflow chain
509 * so it isn't deleted when the leaf copy of the key is deleted.
510 */
91b5c9e5
KB
511 if (bl->flags & P_BIGKEY &&
512 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
513 return (RET_ERROR);
514 break;
515 case P_BINTERNAL:
516 bi = GETBINTERNAL(r, 0);
517 nbytes = NBINTERNAL(bi->ksize);
518 h->linp[1] = h->upper -= nbytes;
519 dest = (char *)h + h->upper;
520 bcopy(bi, dest, nbytes);
521 ((BINTERNAL *)dest)->pgno = r->pgno;
522 break;
876dbe65 523 }
91b5c9e5 524 h->lower = BTDATAOFF + 2 * sizeof(index_t);
876dbe65 525
91b5c9e5
KB
526 /* Unpin the root page, set to btree internal page. */
527 h->flags &= ~P_TYPE;
528 h->flags |= P_BINTERNAL;
529 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
876dbe65
MO
530
531 return (RET_SUCCESS);
532}
533
534/*
91b5c9e5 535 * BT_PSPLIT -- Do the real work of splitting the page.
876dbe65 536 *
91b5c9e5
KB
537 * Parameters:
538 * t: tree
539 * h: page to be split
540 * l: page to put lower half of data
541 * r: page to put upper half of data
542 * skip: pointer to index to leave open
876dbe65 543 *
91b5c9e5
KB
544 * Returns:
545 * Pointer to page in which to insert.
876dbe65 546 */
91b5c9e5 547static PAGE *
f1641417 548bt_psplit(t, h, l, r, pskip)
91b5c9e5
KB
549 BTREE *t;
550 PAGE *h, *l, *r;
f1641417 551 int *pskip;
876dbe65 552{
91b5c9e5
KB
553 BINTERNAL *bi;
554 BLEAF *bl;
555 RLEAF *rl;
556 EPGNO *c;
557 PAGE *rval;
f1641417 558 index_t half, skip;
91b5c9e5
KB
559 size_t nbytes;
560 void *src;
561 int bigkeycnt, isbigkey, nxt, off, top;
876dbe65
MO
562
563 /*
91b5c9e5
KB
564 * Split the data to the left and right pages. Leave the skip index
565 * open and guarantee that the split doesn't happen on that index (the
566 * right key must be available for the parent page). Additionally,
567 * make some effort not to split on an overflow key. This makes it
568 * faster to process internal pages and can save space since overflow
569 * keys used by internal pages are never deleted.
876dbe65 570 */
91b5c9e5 571 bigkeycnt = 0;
f1641417 572 skip = *pskip;
91b5c9e5
KB
573 half = (t->bt_psize - BTDATAOFF) / 2;
574 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
f1641417 575 if (skip == off)
91b5c9e5
KB
576 continue;
577 switch (h->flags & P_TYPE) {
578 case P_BINTERNAL:
579 src = bi = GETBINTERNAL(h, nxt);
580 nbytes = NBINTERNAL(bi->ksize);
581 isbigkey = bi->flags & P_BIGKEY;
582 break;
583 case P_BLEAF:
584 src = bl = GETBLEAF(h, nxt);
585 nbytes = NBLEAF(bl);
586 isbigkey = bl->flags & P_BIGKEY;
587 break;
588 case P_RINTERNAL:
589 src = GETRINTERNAL(h, nxt);
590 nbytes = NRINTERNAL;
591 isbigkey = 0;
592 break;
593 case P_RLEAF:
594 src = rl = GETRLEAF(h, nxt);
595 nbytes = NRLEAF(rl);
596 isbigkey = 0;
597 break;
876dbe65 598 }
91b5c9e5
KB
599 ++nxt;
600 l->linp[off] = l->upper -= nbytes;
601 bcopy(src, (char *)l + l->upper, nbytes);
602
603 /* There's no empirical justification for the '3'. */
9f18f0ec
KB
604 if (half < nbytes) {
605 if (skip != off + 1)
606 if (!isbigkey || bigkeycnt == 3)
607 break;
608 else
609 ++bigkeycnt;
610 } else
91b5c9e5
KB
611 half -= nbytes;
612 }
613 l->lower += (off + 1) * sizeof(index_t);
876dbe65 614
91b5c9e5 615 /*
f1641417
KB
616 * If splitting the page that the cursor was on, the cursor has to be
617 * adjusted to point to the same record as before the split. If the
618 * skipped slot and the cursor are both on the left page and the cursor
619 * is on or past the skipped slot, the cursor is incremented by one.
620 * If the skipped slot and the cursor are both on the right page and
621 * the cursor is on or past the skipped slot, the cursor is incremented
622 * by one. If the skipped slot and the cursor aren't on the same page,
623 * the cursor isn't changed. Regardless of the relationship of the
624 * skipped slot and the cursor, if the cursor is on the right page it
625 * is decremented by the number of records split to the left page.
626 *
627 * Don't bother checking for the BTF_SEQINIT flag, the page number will
628 * be P_INVALID.
91b5c9e5
KB
629 */
630 c = &t->bt_bcursor;
631 if (c->pgno == h->pgno)
f1641417 632 if (c->index < off) { /* left page */
91b5c9e5 633 c->pgno = l->pgno;
f1641417
KB
634 if (c->index >= skip)
635 ++c->index;
636 } else { /* right page */
91b5c9e5 637 c->pgno = r->pgno;
f1641417
KB
638 if (c->index >= skip && skip > off)
639 ++c->index;
91b5c9e5 640 c->index -= off;
876dbe65
MO
641 }
642
91b5c9e5
KB
643 /*
644 * Decide which page to return, and adjust the skip index if the
645 * to-be-inserted-upon page has changed.
646 */
f1641417 647 if (skip > off) {
91b5c9e5 648 rval = r;
f1641417 649 *pskip -= off + 1;
91b5c9e5
KB
650 } else
651 rval = l;
652
653 for (off = 0; nxt < top; ++off) {
f1641417
KB
654 if (skip == nxt) {
655 skip = 0;
91b5c9e5
KB
656 continue;
657 }
658 switch (h->flags & P_TYPE) {
659 case P_BINTERNAL:
660 src = bi = GETBINTERNAL(h, nxt);
661 nbytes = NBINTERNAL(bi->ksize);
662 break;
663 case P_BLEAF:
664 src = bl = GETBLEAF(h, nxt);
665 nbytes = NBLEAF(bl);
666 break;
667 case P_RINTERNAL:
668 src = GETRINTERNAL(h, nxt);
669 nbytes = NRINTERNAL;
670 break;
671 case P_RLEAF:
672 src = rl = GETRLEAF(h, nxt);
673 nbytes = NRLEAF(rl);
674 break;
876dbe65 675 }
91b5c9e5
KB
676 ++nxt;
677 r->linp[off] = r->upper -= nbytes;
678 bcopy(src, (char *)r + r->upper, nbytes);
876dbe65 679 }
91b5c9e5 680 r->lower += off * sizeof(index_t);
876dbe65 681
91b5c9e5 682 /* If the key is being appended to the page, adjust the index. */
f1641417 683 if (skip == top)
91b5c9e5 684 r->lower += sizeof(index_t);
876dbe65 685
91b5c9e5
KB
686 return (rval);
687}
688
689/*
690 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
691 *
692 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
693 * record that references them gets deleted. Chains pointed to by internal
694 * pages never get deleted. This routine marks a chain as pointed to by an
695 * internal page.
696 *
697 * Parameters:
698 * t: tree
699 * pg: page number of first page in the chain.
700 *
701 * Returns:
702 * RET_SUCCESS, RET_ERROR.
703 */
704static int
705bt_preserve(t, pg)
706 BTREE *t;
707 pgno_t pg;
708{
709 PAGE *h;
710
711 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
712 return (RET_ERROR);
713 h->flags |= P_PRESERVE;
714 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
876dbe65
MO
715 return (RET_SUCCESS);
716}
91b5c9e5
KB
717
718/*
719 * REC_TOTAL -- Return the number of recno entries below a page.
720 *
721 * Parameters:
722 * h: page
723 *
724 * Returns:
725 * The number of recno entries below a page.
726 *
727 * XXX
728 * These values could be set by the bt_psplit routine. The problem is that the
729 * entry has to be popped off of the stack etc. or the values have to be passed
730 * all the way back to bt_split/bt_rroot and it's not very clean.
731 */
732static recno_t
733rec_total(h)
734 PAGE *h;
735{
736 recno_t recs;
737 index_t nxt, top;
738
739 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
740 recs += GETRINTERNAL(h, nxt)->nrecs;
741 return (recs);
742}