4.4BSD snapshot (revision 8.1)
[unix-history] / usr / src / lib / libc / db / btree / bt_split.c
CommitLineData
876dbe65
MO
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * %sccs.include.redist.c%
9 */
10
11#if defined(LIBC_SCCS) && !defined(lint)
c2787311 12static char sccsid[] = "@(#)bt_split.c 5.17 (Berkeley) %G%";
876dbe65
MO
13#endif /* LIBC_SCCS and not lint */
14
15#include <sys/types.h>
365a94a9 16
91b5c9e5 17#define __DBINTERFACE_PRIVATE
91b5c9e5
KB
18#include <limits.h>
19#include <stdio.h>
fb91cf55
KB
20#include <stdlib.h>
21#include <string.h>
365a94a9 22
94ac72c5 23#include <db.h>
876dbe65
MO
24#include "btree.h"
25
e6f9af54 26static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
4dde9e25 27static PAGE *bt_page
198068ab 28 __P((BTREE *, PAGE *, PAGE **, PAGE **, u_int *, size_t));
91b5c9e5 29static int bt_preserve __P((BTREE *, pgno_t));
4dde9e25 30static PAGE *bt_psplit
198068ab 31 __P((BTREE *, PAGE *, PAGE *, PAGE *, u_int *, size_t));
4dde9e25 32static PAGE *bt_root
198068ab 33 __P((BTREE *, PAGE *, PAGE **, PAGE **, u_int *, size_t));
91b5c9e5 34static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
91b5c9e5
KB
35static recno_t rec_total __P((PAGE *));
36
37#ifdef STATISTICS
38u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
39#endif
40
876dbe65 41/*
91b5c9e5 42 * __BT_SPLIT -- Split the tree.
876dbe65 43 *
91b5c9e5
KB
44 * Parameters:
45 * t: tree
e6f9af54 46 * sp: page to split
91b5c9e5
KB
47 * key: key to insert
48 * data: data to insert
49 * flags: BIGKEY/BIGDATA flags
4dde9e25 50 * ilen: insert length
91b5c9e5 51 * skip: index to leave open
876dbe65 52 *
91b5c9e5
KB
53 * Returns:
54 * RET_ERROR, RET_SUCCESS
876dbe65 55 */
876dbe65 56int
4dde9e25 57__bt_split(t, sp, key, data, flags, ilen, skip)
91b5c9e5 58 BTREE *t;
e6f9af54 59 PAGE *sp;
91b5c9e5
KB
60 const DBT *key, *data;
61 u_long flags;
4dde9e25 62 size_t ilen;
053d6b28 63 u_int skip;
876dbe65 64{
91b5c9e5 65 BINTERNAL *bi;
4dde9e25 66 BLEAF *bl, *tbl;
91b5c9e5
KB
67 DBT a, b;
68 EPGNO *parent;
e6f9af54 69 PAGE *h, *l, *r, *lchild, *rchild;
491176fb 70 indx_t nxtindex;
4dde9e25 71 size_t n, nbytes, nksize;
c2787311 72 int parentsplit;
91b5c9e5 73 char *dest;
876dbe65
MO
74
75 /*
91b5c9e5 76 * Split the page into two pages, l and r. The split routines return
e6f9af54
KB
77 * a pointer to the page into which the key should be inserted and with
78 * skip set to the offset which should be used. Additionally, l and r
79 * are pinned.
876dbe65 80 */
e6f9af54 81 h = sp->pgno == P_ROOT ?
4dde9e25
KB
82 bt_root(t, sp, &l, &r, &skip, ilen) :
83 bt_page(t, sp, &l, &r, &skip, ilen);
91b5c9e5 84 if (h == NULL)
876dbe65
MO
85 return (RET_ERROR);
86
91b5c9e5 87 /*
e6f9af54
KB
88 * Insert the new key/data pair into the leaf page. (Key inserts
89 * always cause a leaf page to split first.)
91b5c9e5 90 */
4dde9e25 91 h->linp[skip] = h->upper -= ilen;
91b5c9e5 92 dest = (char *)h + h->upper;
6236c130 93 if (ISSET(t, R_RECNO))
91b5c9e5 94 WR_RLEAF(dest, data, flags)
f1641417 95 else
91b5c9e5 96 WR_BLEAF(dest, key, data, flags)
876dbe65 97
e6f9af54
KB
98 /* If the root page was split, make it look right. */
99 if (sp->pgno == P_ROOT &&
6236c130 100 (ISSET(t, R_RECNO) ?
e6f9af54
KB
101 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
102 goto err2;
103
876dbe65 104 /*
91b5c9e5
KB
105 * Now we walk the parent page stack -- a LIFO stack of the pages that
106 * were traversed when we searched for the page that split. Each stack
107 * entry is a page number and a page index offset. The offset is for
108 * the page traversed on the search. We've just split a page, so we
109 * have to insert a new key into the parent page.
110 *
111 * If the insert into the parent page causes it to split, may have to
112 * continue splitting all the way up the tree. We stop if the root
113 * splits or the page inserted into didn't have to split to hold the
114 * new key. Some algorithms replace the key for the old page as well
115 * as the new page. We don't, as there's no reason to believe that the
116 * first key on the old page is any better than the key we have, and,
117 * in the case of a key being placed at index 0 causing the split, the
118 * key is unavailable.
119 *
120 * There are a maximum of 5 pages pinned at any time. We keep the left
121 * and right pages pinned while working on the parent. The 5 are the
122 * two children, left parent and right parent (when the parent splits)
123 * and the root page or the overflow key page when calling bt_preserve.
124 * This code must make sure that all pins are released other than the
125 * root page or overflow page which is unlocked elsewhere.
876dbe65 126 */
c2787311 127 while ((parent = BT_POP(t)) != NULL) {
91b5c9e5
KB
128 lchild = l;
129 rchild = r;
876dbe65 130
91b5c9e5
KB
131 /* Get the parent page. */
132 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
133 goto err2;
876dbe65 134
4dde9e25
KB
135 /*
136 * The new key goes ONE AFTER the index, because the split
137 * was to the right.
138 */
91b5c9e5 139 skip = parent->index + 1;
876dbe65 140
91b5c9e5
KB
141 /*
142 * Calculate the space needed on the parent page.
143 *
4dde9e25
KB
144 * Prefix trees: space hack when inserting into BINTERNAL
145 * pages. Retain only what's needed to distinguish between
146 * the new entry and the LAST entry on the page to its left.
147 * If the keys compare equal, retain the entire key. Note,
148 * we don't touch overflow keys, and the entire key must be
149 * retained for the next-to-left most key on the leftmost
150 * page of each level, or the search will fail. Applicable
151 * ONLY to internal pages that have leaf pages as children.
152 * Further reduction of the key between pairs of internal
153 * pages loses too much information.
91b5c9e5
KB
154 */
155 switch (rchild->flags & P_TYPE) {
156 case P_BINTERNAL:
157 bi = GETBINTERNAL(rchild, 0);
158 nbytes = NBINTERNAL(bi->ksize);
91b5c9e5
KB
159 break;
160 case P_BLEAF:
161 bl = GETBLEAF(rchild, 0);
f701460d 162 nbytes = NBINTERNAL(bl->ksize);
4dde9e25
KB
163 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
164 (h->prevpg != P_INVALID || skip > 1)) {
91b5c9e5
KB
165 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
166 a.size = tbl->ksize;
167 a.data = tbl->bytes;
168 b.size = bl->ksize;
169 b.data = bl->bytes;
4dde9e25 170 nksize = t->bt_pfx(&a, &b);
91b5c9e5
KB
171 n = NBINTERNAL(nksize);
172 if (n < nbytes) {
173#ifdef STATISTICS
174 bt_pfxsaved += nbytes - n;
175#endif
176 nbytes = n;
177 } else
178 nksize = 0;
179 } else
180 nksize = 0;
181 break;
182 case P_RINTERNAL:
183 case P_RLEAF:
184 nbytes = NRINTERNAL;
185 break;
365a94a9
KB
186 default:
187 abort();
876dbe65 188 }
876dbe65 189
91b5c9e5 190 /* Split the parent page if necessary or shift the indices. */
491176fb 191 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
e6f9af54 192 sp = h;
91b5c9e5 193 h = h->pgno == P_ROOT ?
4dde9e25
KB
194 bt_root(t, h, &l, &r, &skip, nbytes) :
195 bt_page(t, h, &l, &r, &skip, nbytes);
91b5c9e5
KB
196 if (h == NULL)
197 goto err1;
c2787311 198 parentsplit = 1;
91b5c9e5
KB
199 } else {
200 if (skip < (nxtindex = NEXTINDEX(h)))
1911b5bf 201 memmove(h->linp + skip + 1, h->linp + skip,
491176fb
KB
202 (nxtindex - skip) * sizeof(indx_t));
203 h->lower += sizeof(indx_t);
c2787311 204 parentsplit = 0;
91b5c9e5 205 }
876dbe65 206
91b5c9e5
KB
207 /* Insert the key into the parent page. */
208 switch(rchild->flags & P_TYPE) {
209 case P_BINTERNAL:
210 h->linp[skip] = h->upper -= nbytes;
211 dest = (char *)h + h->linp[skip];
1911b5bf 212 memmove(dest, bi, nbytes);
91b5c9e5
KB
213 ((BINTERNAL *)dest)->pgno = rchild->pgno;
214 break;
215 case P_BLEAF:
216 h->linp[skip] = h->upper -= nbytes;
217 dest = (char *)h + h->linp[skip];
218 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
f1641417 219 rchild->pgno, bl->flags & P_BIGKEY);
1911b5bf 220 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
91b5c9e5
KB
221 if (bl->flags & P_BIGKEY &&
222 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
223 goto err1;
224 break;
225 case P_RINTERNAL:
c2787311
KB
226 /*
227 * Update the left page count. If split
228 * added at index 0, fix the correct page.
229 */
230 if (skip > 0)
231 dest = (char *)h + h->linp[skip - 1];
232 else
233 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
234 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
235 ((RINTERNAL *)dest)->pgno = lchild->pgno;
236
237 /* Update the right page count. */
91b5c9e5
KB
238 h->linp[skip] = h->upper -= nbytes;
239 dest = (char *)h + h->linp[skip];
240 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
241 ((RINTERNAL *)dest)->pgno = rchild->pgno;
91b5c9e5
KB
242 break;
243 case P_RLEAF:
c2787311
KB
244 /*
245 * Update the left page count. If split
246 * added at index 0, fix the correct page.
247 */
248 if (skip > 0)
249 dest = (char *)h + h->linp[skip - 1];
250 else
251 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
252 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
253 ((RINTERNAL *)dest)->pgno = lchild->pgno;
254
255 /* Update the right page count. */
91b5c9e5
KB
256 h->linp[skip] = h->upper -= nbytes;
257 dest = (char *)h + h->linp[skip];
258 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
259 ((RINTERNAL *)dest)->pgno = rchild->pgno;
91b5c9e5 260 break;
365a94a9
KB
261 default:
262 abort();
876dbe65
MO
263 }
264
91b5c9e5 265 /* Unpin the held pages. */
c2787311 266 if (!parentsplit) {
91b5c9e5
KB
267 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
268 break;
269 }
e6f9af54
KB
270
271 /* If the root page was split, make it look right. */
272 if (sp->pgno == P_ROOT &&
6236c130 273 (ISSET(t, R_RECNO) ?
e6f9af54
KB
274 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
275 goto err1;
276
91b5c9e5
KB
277 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
278 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
876dbe65
MO
279 }
280
91b5c9e5
KB
281 /* Unpin the held pages. */
282 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
283 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
284
f1641417 285 /* Clear any pages left on the stack. */
91b5c9e5 286 return (RET_SUCCESS);
876dbe65 287
91b5c9e5
KB
288 /*
289 * If something fails in the above loop we were already walking back
290 * up the tree and the tree is now inconsistent. Nothing much we can
291 * do about it but release any memory we're holding.
292 */
293err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
294 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
876dbe65 295
91b5c9e5
KB
296err2: mpool_put(t->bt_mp, l, 0);
297 mpool_put(t->bt_mp, r, 0);
298 __dbpanic(t->bt_dbp);
299 return (RET_ERROR);
876dbe65
MO
300}
301
302/*
91b5c9e5 303 * BT_PAGE -- Split a non-root page of a btree.
876dbe65 304 *
91b5c9e5
KB
305 * Parameters:
306 * t: tree
307 * h: root page
308 * lp: pointer to left page pointer
309 * rp: pointer to right page pointer
310 * skip: pointer to index to leave open
4dde9e25 311 * ilen: insert length
876dbe65 312 *
91b5c9e5
KB
313 * Returns:
314 * Pointer to page in which to insert or NULL on error.
876dbe65 315 */
91b5c9e5 316static PAGE *
4dde9e25 317bt_page(t, h, lp, rp, skip, ilen)
91b5c9e5
KB
318 BTREE *t;
319 PAGE *h, **lp, **rp;
198068ab 320 u_int *skip;
4dde9e25 321 size_t ilen;
876dbe65 322{
91b5c9e5
KB
323 PAGE *l, *r, *tp;
324 pgno_t npg;
325
326#ifdef STATISTICS
327 ++bt_split;
328#endif
329 /* Put the new right page for the split into place. */
a5d29441 330 if ((r = __bt_new(t, &npg)) == NULL)
91b5c9e5
KB
331 return (NULL);
332 r->pgno = npg;
333 r->lower = BTDATAOFF;
334 r->upper = t->bt_psize;
335 r->nextpg = h->nextpg;
336 r->prevpg = h->pgno;
337 r->flags = h->flags & P_TYPE;
876dbe65 338
91b5c9e5
KB
339 /*
340 * If we're splitting the last page on a level because we're appending
341 * a key to it (skip is NEXTINDEX()), it's likely that the data is
342 * sorted. Adding an empty page on the side of the level is less work
343 * and can push the fill factor much higher than normal. If we're
344 * wrong it's no big deal, we'll just do the split the right way next
345 * time. It may look like it's equally easy to do a similar hack for
346 * reverse sorted data, that is, split the tree left, but it's not.
347 * Don't even try.
348 */
349 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
350#ifdef STATISTICS
351 ++bt_sortsplit;
352#endif
353 h->nextpg = r->pgno;
491176fb 354 r->lower = BTDATAOFF + sizeof(indx_t);
91b5c9e5
KB
355 *skip = 0;
356 *lp = h;
357 *rp = r;
358 return (r);
359 }
876dbe65 360
91b5c9e5
KB
361 /* Put the new left page for the split into place. */
362 if ((l = malloc(t->bt_psize)) == NULL) {
363 mpool_put(t->bt_mp, r, 0);
364 return (NULL);
365 }
366 l->pgno = h->pgno;
367 l->nextpg = r->pgno;
368 l->prevpg = h->prevpg;
369 l->lower = BTDATAOFF;
370 l->upper = t->bt_psize;
371 l->flags = h->flags & P_TYPE;
372
373 /* Fix up the previous pointer of the page after the split page. */
374 if (h->nextpg != P_INVALID) {
375 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
376 free(l);
377 /* XXX mpool_free(t->bt_mp, r->pgno); */
378 return (NULL);
876dbe65 379 }
91b5c9e5
KB
380 tp->prevpg = r->pgno;
381 mpool_put(t->bt_mp, tp, 0);
382 }
876dbe65 383
91b5c9e5
KB
384 /*
385 * Split right. The key/data pairs aren't sorted in the btree page so
386 * it's simpler to copy the data from the split page onto two new pages
387 * instead of copying half the data to the right page and compacting
388 * the left page in place. Since the left page can't change, we have
389 * to swap the original and the allocated left page after the split.
390 */
4dde9e25 391 tp = bt_psplit(t, h, l, r, skip, ilen);
876dbe65 392
91b5c9e5 393 /* Move the new left page onto the old left page. */
1911b5bf 394 memmove(h, l, t->bt_psize);
91b5c9e5
KB
395 if (tp == l)
396 tp = h;
397 free(l);
398
399 *lp = h;
400 *rp = r;
401 return (tp);
876dbe65
MO
402}
403
404/*
f1641417 405 * BT_ROOT -- Split the root page of a btree.
876dbe65 406 *
91b5c9e5
KB
407 * Parameters:
408 * t: tree
409 * h: root page
410 * lp: pointer to left page pointer
411 * rp: pointer to right page pointer
412 * skip: pointer to index to leave open
4dde9e25 413 * ilen: insert length
876dbe65 414 *
91b5c9e5
KB
415 * Returns:
416 * Pointer to page in which to insert or NULL on error.
876dbe65 417 */
91b5c9e5 418static PAGE *
4dde9e25 419bt_root(t, h, lp, rp, skip, ilen)
91b5c9e5
KB
420 BTREE *t;
421 PAGE *h, **lp, **rp;
198068ab 422 u_int *skip;
4dde9e25 423 size_t ilen;
876dbe65 424{
91b5c9e5
KB
425 PAGE *l, *r, *tp;
426 pgno_t lnpg, rnpg;
427
428#ifdef STATISTICS
429 ++bt_split;
430 ++bt_rootsplit;
431#endif
432 /* Put the new left and right pages for the split into place. */
a5d29441
KB
433 if ((l = __bt_new(t, &lnpg)) == NULL ||
434 (r = __bt_new(t, &rnpg)) == NULL)
91b5c9e5
KB
435 return (NULL);
436 l->pgno = lnpg;
437 r->pgno = rnpg;
438 l->nextpg = r->pgno;
439 r->prevpg = l->pgno;
440 l->prevpg = r->nextpg = P_INVALID;
441 l->lower = r->lower = BTDATAOFF;
442 l->upper = r->upper = t->bt_psize;
443 l->flags = r->flags = h->flags & P_TYPE;
444
445 /* Split the root page. */
4dde9e25 446 tp = bt_psplit(t, h, l, r, skip, ilen);
91b5c9e5 447
91b5c9e5
KB
448 *lp = l;
449 *rp = r;
450 return (tp);
876dbe65
MO
451}
452
453/*
e6f9af54 454 * BT_RROOT -- Fix up the recno root page after it has been split.
876dbe65 455 *
91b5c9e5
KB
456 * Parameters:
457 * t: tree
458 * h: root page
e6f9af54
KB
459 * l: left page
460 * r: right page
876dbe65 461 *
91b5c9e5
KB
462 * Returns:
463 * RET_ERROR, RET_SUCCESS
876dbe65 464 */
91b5c9e5
KB
465static int
466bt_rroot(t, h, l, r)
467 BTREE *t;
468 PAGE *h, *l, *r;
876dbe65 469{
91b5c9e5 470 char *dest;
876dbe65 471
91b5c9e5
KB
472 /* Insert the left and right keys, set the header information. */
473 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
474 dest = (char *)h + h->upper;
475 WR_RINTERNAL(dest,
476 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
876dbe65 477
91b5c9e5
KB
478 h->linp[1] = h->upper -= NRINTERNAL;
479 dest = (char *)h + h->upper;
480 WR_RINTERNAL(dest,
481 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
876dbe65 482
491176fb 483 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
876dbe65 484
91b5c9e5
KB
485 /* Unpin the root page, set to recno internal page. */
486 h->flags &= ~P_TYPE;
487 h->flags |= P_RINTERNAL;
488 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
876dbe65 489
91b5c9e5
KB
490 return (RET_SUCCESS);
491}
876dbe65 492
91b5c9e5 493/*
e6f9af54 494 * BT_BROOT -- Fix up the btree root page after it has been split.
91b5c9e5
KB
495 *
496 * Parameters:
497 * t: tree
498 * h: root page
e6f9af54
KB
499 * l: left page
500 * r: right page
91b5c9e5
KB
501 *
502 * Returns:
503 * RET_ERROR, RET_SUCCESS
504 */
505static int
506bt_broot(t, h, l, r)
507 BTREE *t;
508 PAGE *h, *l, *r;
509{
510 BINTERNAL *bi;
511 BLEAF *bl;
512 size_t nbytes;
513 char *dest;
876dbe65
MO
514
515 /*
91b5c9e5
KB
516 * If the root page was a leaf page, change it into an internal page.
517 * We copy the key we split on (but not the key's data, in the case of
f701460d 518 * a leaf page) to the new root page.
91b5c9e5
KB
519 *
520 * The btree comparison code guarantees that the left-most key on any
e6f9af54 521 * level of the tree is never used, so it doesn't need to be filled in.
876dbe65 522 */
f701460d 523 nbytes = NBINTERNAL(0);
91b5c9e5
KB
524 h->linp[0] = h->upper = t->bt_psize - nbytes;
525 dest = (char *)h + h->upper;
526 WR_BINTERNAL(dest, 0, l->pgno, 0);
527
528 switch(h->flags & P_TYPE) {
529 case P_BLEAF:
530 bl = GETBLEAF(r, 0);
531 nbytes = NBINTERNAL(bl->ksize);
532 h->linp[1] = h->upper -= nbytes;
533 dest = (char *)h + h->upper;
534 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
1911b5bf 535 memmove(dest, bl->bytes, bl->ksize);
91b5c9e5 536
f701460d
KB
537 /*
538 * If the key is on an overflow page, mark the overflow chain
539 * so it isn't deleted when the leaf copy of the key is deleted.
540 */
91b5c9e5
KB
541 if (bl->flags & P_BIGKEY &&
542 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
543 return (RET_ERROR);
544 break;
545 case P_BINTERNAL:
546 bi = GETBINTERNAL(r, 0);
547 nbytes = NBINTERNAL(bi->ksize);
548 h->linp[1] = h->upper -= nbytes;
549 dest = (char *)h + h->upper;
1911b5bf 550 memmove(dest, bi, nbytes);
91b5c9e5
KB
551 ((BINTERNAL *)dest)->pgno = r->pgno;
552 break;
365a94a9
KB
553 default:
554 abort();
876dbe65 555 }
e6f9af54
KB
556
557 /* There are two keys on the page. */
491176fb 558 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
876dbe65 559
91b5c9e5
KB
560 /* Unpin the root page, set to btree internal page. */
561 h->flags &= ~P_TYPE;
562 h->flags |= P_BINTERNAL;
563 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
876dbe65
MO
564
565 return (RET_SUCCESS);
566}
567
568/*
91b5c9e5 569 * BT_PSPLIT -- Do the real work of splitting the page.
876dbe65 570 *
91b5c9e5
KB
571 * Parameters:
572 * t: tree
573 * h: page to be split
574 * l: page to put lower half of data
575 * r: page to put upper half of data
e6f9af54 576 * pskip: pointer to index to leave open
4dde9e25 577 * ilen: insert length
876dbe65 578 *
91b5c9e5
KB
579 * Returns:
580 * Pointer to page in which to insert.
876dbe65 581 */
91b5c9e5 582static PAGE *
4dde9e25 583bt_psplit(t, h, l, r, pskip, ilen)
91b5c9e5
KB
584 BTREE *t;
585 PAGE *h, *l, *r;
198068ab 586 u_int *pskip;
4dde9e25 587 size_t ilen;
876dbe65 588{
91b5c9e5
KB
589 BINTERNAL *bi;
590 BLEAF *bl;
591 RLEAF *rl;
592 EPGNO *c;
593 PAGE *rval;
91b5c9e5 594 void *src;
053d6b28
KB
595 indx_t full, half, nxt, off, skip, top, used;
596 size_t nbytes;
597 int bigkeycnt, isbigkey;
876dbe65
MO
598
599 /*
4dde9e25
KB
600 * Split the data to the left and right pages. Leave the skip index
601 * open. Additionally, make some effort not to split on an overflow
602 * key. This makes internal page processing faster and can save
603 * space as overflow keys used by internal pages are never deleted.
876dbe65 604 */
91b5c9e5 605 bigkeycnt = 0;
f1641417 606 skip = *pskip;
4dde9e25
KB
607 full = t->bt_psize - BTDATAOFF;
608 half = full / 2;
609 used = 0;
91b5c9e5 610 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
4dde9e25
KB
611 if (skip == off) {
612 nbytes = ilen;
613 isbigkey = 0; /* XXX: not really known. */
614 } else
615 switch (h->flags & P_TYPE) {
616 case P_BINTERNAL:
617 src = bi = GETBINTERNAL(h, nxt);
618 nbytes = NBINTERNAL(bi->ksize);
619 isbigkey = bi->flags & P_BIGKEY;
620 break;
621 case P_BLEAF:
622 src = bl = GETBLEAF(h, nxt);
623 nbytes = NBLEAF(bl);
624 isbigkey = bl->flags & P_BIGKEY;
625 break;
626 case P_RINTERNAL:
627 src = GETRINTERNAL(h, nxt);
628 nbytes = NRINTERNAL;
629 isbigkey = 0;
630 break;
631 case P_RLEAF:
632 src = rl = GETRLEAF(h, nxt);
633 nbytes = NRLEAF(rl);
634 isbigkey = 0;
635 break;
636 default:
637 abort();
638 }
639
640 /*
641 * If the key/data pairs are substantial fractions of the max
642 * possible size for the page, it's possible to get situations
643 * where we decide to try and copy too much onto the left page.
644 * Make sure that doesn't happen.
645 */
646 if (skip <= off && used + nbytes >= full) {
647 --off;
91b5c9e5 648 break;
876dbe65 649 }
91b5c9e5 650
4dde9e25
KB
651 /* Copy the key/data pair, if not the skipped index. */
652 if (skip != off) {
653 ++nxt;
654
655 l->linp[off] = l->upper -= nbytes;
1911b5bf 656 memmove((char *)l + l->upper, src, nbytes);
4dde9e25
KB
657 }
658
659 used += nbytes;
660 if (used >= half) {
e6f9af54
KB
661 if (!isbigkey || bigkeycnt == 3)
662 break;
663 else
664 ++bigkeycnt;
4dde9e25 665 }
91b5c9e5 666 }
4dde9e25
KB
667
668 /*
669 * Off is the last offset that's valid for the left page.
670 * Nxt is the first offset to be placed on the right page.
671 */
491176fb 672 l->lower += (off + 1) * sizeof(indx_t);
876dbe65 673
91b5c9e5 674 /*
f1641417
KB
675 * If splitting the page that the cursor was on, the cursor has to be
676 * adjusted to point to the same record as before the split. If the
4dde9e25
KB
677 * cursor is at or past the skipped slot, the cursor is incremented by
678 * one. If the cursor is on the right page, it is decremented by the
679 * number of records split to the left page.
f1641417 680 *
6236c130 681 * Don't bother checking for the B_SEQINIT flag, the page number will
f1641417 682 * be P_INVALID.
91b5c9e5
KB
683 */
684 c = &t->bt_bcursor;
4dde9e25
KB
685 if (c->pgno == h->pgno) {
686 if (c->index >= skip)
687 ++c->index;
688 if (c->index < nxt) /* Left page. */
91b5c9e5 689 c->pgno = l->pgno;
4dde9e25 690 else { /* Right page. */
91b5c9e5 691 c->pgno = r->pgno;
4dde9e25 692 c->index -= nxt;
876dbe65 693 }
4dde9e25 694 }
876dbe65 695
91b5c9e5 696 /*
4dde9e25
KB
697 * If the skipped index was on the left page, just return that page.
698 * Otherwise, adjust the skip index to reflect the new position on
699 * the right page.
91b5c9e5 700 */
4dde9e25
KB
701 if (skip <= off) {
702 skip = 0;
91b5c9e5 703 rval = l;
4dde9e25
KB
704 } else {
705 rval = r;
706 *pskip -= nxt;
707 }
91b5c9e5
KB
708
709 for (off = 0; nxt < top; ++off) {
f1641417 710 if (skip == nxt) {
4dde9e25 711 ++off;
f1641417 712 skip = 0;
91b5c9e5
KB
713 }
714 switch (h->flags & P_TYPE) {
715 case P_BINTERNAL:
716 src = bi = GETBINTERNAL(h, nxt);
717 nbytes = NBINTERNAL(bi->ksize);
718 break;
719 case P_BLEAF:
720 src = bl = GETBLEAF(h, nxt);
721 nbytes = NBLEAF(bl);
722 break;
723 case P_RINTERNAL:
724 src = GETRINTERNAL(h, nxt);
725 nbytes = NRINTERNAL;
726 break;
727 case P_RLEAF:
728 src = rl = GETRLEAF(h, nxt);
729 nbytes = NRLEAF(rl);
730 break;
365a94a9
KB
731 default:
732 abort();
876dbe65 733 }
91b5c9e5
KB
734 ++nxt;
735 r->linp[off] = r->upper -= nbytes;
1911b5bf 736 memmove((char *)r + r->upper, src, nbytes);
876dbe65 737 }
491176fb 738 r->lower += off * sizeof(indx_t);
876dbe65 739
91b5c9e5 740 /* If the key is being appended to the page, adjust the index. */
f1641417 741 if (skip == top)
491176fb 742 r->lower += sizeof(indx_t);
876dbe65 743
91b5c9e5
KB
744 return (rval);
745}
746
747/*
748 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
749 *
750 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
751 * record that references them gets deleted. Chains pointed to by internal
752 * pages never get deleted. This routine marks a chain as pointed to by an
753 * internal page.
754 *
755 * Parameters:
756 * t: tree
757 * pg: page number of first page in the chain.
758 *
759 * Returns:
760 * RET_SUCCESS, RET_ERROR.
761 */
762static int
763bt_preserve(t, pg)
764 BTREE *t;
765 pgno_t pg;
766{
767 PAGE *h;
768
769 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
770 return (RET_ERROR);
771 h->flags |= P_PRESERVE;
772 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
876dbe65
MO
773 return (RET_SUCCESS);
774}
91b5c9e5
KB
775
776/*
777 * REC_TOTAL -- Return the number of recno entries below a page.
778 *
779 * Parameters:
780 * h: page
781 *
782 * Returns:
783 * The number of recno entries below a page.
784 *
785 * XXX
786 * These values could be set by the bt_psplit routine. The problem is that the
787 * entry has to be popped off of the stack etc. or the values have to be passed
788 * all the way back to bt_split/bt_rroot and it's not very clean.
789 */
790static recno_t
791rec_total(h)
792 PAGE *h;
793{
794 recno_t recs;
491176fb 795 indx_t nxt, top;
91b5c9e5
KB
796
797 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
798 recs += GETRINTERNAL(h, nxt)->nrecs;
799 return (recs);
800}