Commit | Line | Data |
---|---|---|
876dbe65 MO |
1 | /*- |
2 | * Copyright (c) 1990 The Regents of the University of California. | |
3 | * All rights reserved. | |
4 | * | |
5 | * This code is derived from software contributed to Berkeley by | |
6 | * Mike Olson. | |
7 | * | |
8 | * %sccs.include.redist.c% | |
9 | */ | |
10 | ||
11 | #if defined(LIBC_SCCS) && !defined(lint) | |
a5d29441 | 12 | static char sccsid[] = "@(#)bt_split.c 5.7 (Berkeley) %G%"; |
876dbe65 MO |
13 | #endif /* LIBC_SCCS and not lint */ |
14 | ||
15 | #include <sys/types.h> | |
91b5c9e5 | 16 | #define __DBINTERFACE_PRIVATE |
876dbe65 | 17 | #include <db.h> |
91b5c9e5 KB |
18 | #include <limits.h> |
19 | #include <stdio.h> | |
fb91cf55 KB |
20 | #include <stdlib.h> |
21 | #include <string.h> | |
876dbe65 MO |
22 | #include "btree.h" |
23 | ||
91b5c9e5 KB |
24 | static int bt_preserve __P((BTREE *, pgno_t)); |
25 | static PAGE *bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *)); | |
26 | static PAGE *bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *)); | |
27 | static PAGE *bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *)); | |
28 | static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *)); | |
29 | static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *)); | |
30 | static recno_t rec_total __P((PAGE *)); | |
31 | ||
32 | #ifdef STATISTICS | |
33 | u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; | |
34 | #endif | |
35 | ||
876dbe65 | 36 | /* |
91b5c9e5 | 37 | * __BT_SPLIT -- Split the tree. |
876dbe65 | 38 | * |
91b5c9e5 KB |
39 | * Parameters: |
40 | * t: tree | |
41 | * h: page to split | |
42 | * key: key to insert | |
43 | * data: data to insert | |
44 | * flags: BIGKEY/BIGDATA flags | |
45 | * nbytes: length of insertion | |
46 | * skip: index to leave open | |
876dbe65 | 47 | * |
91b5c9e5 KB |
48 | * Returns: |
49 | * RET_ERROR, RET_SUCCESS | |
876dbe65 | 50 | */ |
876dbe65 | 51 | int |
91b5c9e5 KB |
52 | __bt_split(t, h, key, data, flags, nbytes, skip) |
53 | BTREE *t; | |
54 | PAGE *h; | |
55 | const DBT *key, *data; | |
56 | u_long flags; | |
57 | size_t nbytes; | |
58 | int skip; | |
876dbe65 | 59 | { |
91b5c9e5 KB |
60 | BINTERNAL *bi; |
61 | BLEAF *bl; | |
62 | DBT a, b; | |
63 | EPGNO *parent; | |
64 | PAGE *l, *r, *lchild, *rchild; | |
65 | index_t nxtindex; | |
66 | size_t nksize; | |
67 | int nosplit; | |
68 | char *dest; | |
876dbe65 MO |
69 | |
70 | /* | |
91b5c9e5 KB |
71 | * Split the page into two pages, l and r. The split routines return |
72 | * a pointer to the page into which the key should be inserted and skip | |
73 | * set to the offset which should be used. Additionally, l and r are | |
74 | * pinned. | |
876dbe65 | 75 | */ |
91b5c9e5 KB |
76 | h = h->pgno == P_ROOT ? |
77 | bt_root(t, h, &l, &r, &skip) : bt_page(t, h, &l, &r, &skip); | |
78 | if (h == NULL) | |
876dbe65 MO |
79 | return (RET_ERROR); |
80 | ||
91b5c9e5 KB |
81 | /* |
82 | * Grab the space and insert the [rb]leaf structure. Always a [rb]leaf | |
83 | * structure since key inserts always cause a leaf page to split first. | |
84 | */ | |
85 | h->linp[skip] = h->upper -= nbytes; | |
86 | dest = (char *)h + h->upper; | |
f1641417 | 87 | if (ISSET(t, BTF_RECNO)) |
91b5c9e5 | 88 | WR_RLEAF(dest, data, flags) |
f1641417 | 89 | else |
91b5c9e5 | 90 | WR_BLEAF(dest, key, data, flags) |
876dbe65 MO |
91 | |
92 | /* | |
91b5c9e5 KB |
93 | * Now we walk the parent page stack -- a LIFO stack of the pages that |
94 | * were traversed when we searched for the page that split. Each stack | |
95 | * entry is a page number and a page index offset. The offset is for | |
96 | * the page traversed on the search. We've just split a page, so we | |
97 | * have to insert a new key into the parent page. | |
98 | * | |
99 | * If the insert into the parent page causes it to split, may have to | |
100 | * continue splitting all the way up the tree. We stop if the root | |
101 | * splits or the page inserted into didn't have to split to hold the | |
102 | * new key. Some algorithms replace the key for the old page as well | |
103 | * as the new page. We don't, as there's no reason to believe that the | |
104 | * first key on the old page is any better than the key we have, and, | |
105 | * in the case of a key being placed at index 0 causing the split, the | |
106 | * key is unavailable. | |
107 | * | |
108 | * There are a maximum of 5 pages pinned at any time. We keep the left | |
109 | * and right pages pinned while working on the parent. The 5 are the | |
110 | * two children, left parent and right parent (when the parent splits) | |
111 | * and the root page or the overflow key page when calling bt_preserve. | |
112 | * This code must make sure that all pins are released other than the | |
113 | * root page or overflow page which is unlocked elsewhere. | |
876dbe65 | 114 | */ |
91b5c9e5 KB |
115 | for (nosplit = 0; (parent = BT_POP(t)) != NULL;) { |
116 | lchild = l; | |
117 | rchild = r; | |
876dbe65 | 118 | |
91b5c9e5 KB |
119 | /* Get the parent page. */ |
120 | if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) | |
121 | goto err2; | |
876dbe65 | 122 | |
91b5c9e5 KB |
123 | /* The new key goes ONE AFTER the index. */ |
124 | skip = parent->index + 1; | |
876dbe65 | 125 | |
91b5c9e5 KB |
126 | /* |
127 | * Calculate the space needed on the parent page. | |
128 | * | |
f1641417 | 129 | * Space hack when inserting into BINTERNAL pages. Only need to |
91b5c9e5 | 130 | * retain the number of bytes that will distinguish between the |
f1641417 KB |
131 | * new entry and the LAST entry on the page to its left. If the |
132 | * keys compare equal, retain the entire key. Note, we don't | |
133 | * touch overflow keys and the entire key must be retained for | |
134 | * the next-to-leftmost key on the leftmost page of each level, | |
135 | * or the search will fail. | |
91b5c9e5 KB |
136 | */ |
137 | switch (rchild->flags & P_TYPE) { | |
138 | case P_BINTERNAL: | |
139 | bi = GETBINTERNAL(rchild, 0); | |
140 | nbytes = NBINTERNAL(bi->ksize); | |
141 | if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) && | |
142 | !(bi->flags & P_BIGKEY)) { | |
143 | BINTERNAL *tbi; | |
144 | tbi = | |
145 | GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1); | |
146 | a.size = tbi->ksize; | |
147 | a.data = tbi->bytes; | |
148 | b.size = bi->ksize; | |
149 | b.data = bi->bytes; | |
150 | goto prefix; | |
876dbe65 | 151 | } |
91b5c9e5 KB |
152 | break; |
153 | case P_BLEAF: | |
154 | bl = GETBLEAF(rchild, 0); | |
f701460d | 155 | nbytes = NBINTERNAL(bl->ksize); |
91b5c9e5 KB |
156 | if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) && |
157 | !(bl->flags & P_BIGKEY)) { | |
158 | BLEAF *tbl; | |
159 | size_t n; | |
160 | ||
161 | tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); | |
162 | a.size = tbl->ksize; | |
163 | a.data = tbl->bytes; | |
164 | b.size = bl->ksize; | |
165 | b.data = bl->bytes; | |
166 | prefix: nksize = t->bt_pfx(&a, &b); | |
167 | n = NBINTERNAL(nksize); | |
168 | if (n < nbytes) { | |
169 | #ifdef STATISTICS | |
170 | bt_pfxsaved += nbytes - n; | |
171 | #endif | |
172 | nbytes = n; | |
173 | } else | |
174 | nksize = 0; | |
175 | } else | |
176 | nksize = 0; | |
177 | break; | |
178 | case P_RINTERNAL: | |
179 | case P_RLEAF: | |
180 | nbytes = NRINTERNAL; | |
181 | break; | |
876dbe65 | 182 | } |
876dbe65 | 183 | |
91b5c9e5 KB |
184 | /* Split the parent page if necessary or shift the indices. */ |
185 | if (h->upper - h->lower < nbytes + sizeof(index_t)) { | |
186 | h = h->pgno == P_ROOT ? | |
187 | bt_root(t, h, &l, &r, &skip) : | |
188 | bt_page(t, h, &l, &r, &skip); | |
189 | if (h == NULL) | |
190 | goto err1; | |
191 | } else { | |
192 | if (skip < (nxtindex = NEXTINDEX(h))) | |
193 | bcopy(h->linp + skip, h->linp + skip + 1, | |
194 | (nxtindex - skip) * sizeof(index_t)); | |
195 | h->lower += sizeof(index_t); | |
196 | nosplit = 1; | |
197 | } | |
876dbe65 | 198 | |
91b5c9e5 KB |
199 | /* Insert the key into the parent page. */ |
200 | switch(rchild->flags & P_TYPE) { | |
201 | case P_BINTERNAL: | |
202 | h->linp[skip] = h->upper -= nbytes; | |
203 | dest = (char *)h + h->linp[skip]; | |
204 | bcopy(bi, dest, nbytes); | |
205 | if (nksize) | |
206 | ((BINTERNAL *)dest)->ksize = nksize; | |
207 | ((BINTERNAL *)dest)->pgno = rchild->pgno; | |
208 | break; | |
209 | case P_BLEAF: | |
210 | h->linp[skip] = h->upper -= nbytes; | |
211 | dest = (char *)h + h->linp[skip]; | |
212 | WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, | |
f1641417 | 213 | rchild->pgno, bl->flags & P_BIGKEY); |
91b5c9e5 KB |
214 | bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize); |
215 | if (bl->flags & P_BIGKEY && | |
216 | bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) | |
217 | goto err1; | |
218 | break; | |
219 | case P_RINTERNAL: | |
220 | /* Update both left and right page counts. */ | |
221 | h->linp[skip] = h->upper -= nbytes; | |
222 | dest = (char *)h + h->linp[skip]; | |
223 | ((RINTERNAL *)dest)->nrecs = rec_total(rchild); | |
224 | ((RINTERNAL *)dest)->pgno = rchild->pgno; | |
225 | dest = (char *)h + h->linp[skip - 1]; | |
226 | ((RINTERNAL *)dest)->nrecs = rec_total(lchild); | |
227 | ((RINTERNAL *)dest)->pgno = lchild->pgno; | |
228 | break; | |
229 | case P_RLEAF: | |
230 | /* Update both left and right page counts. */ | |
231 | h->linp[skip] = h->upper -= nbytes; | |
232 | dest = (char *)h + h->linp[skip]; | |
233 | ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); | |
234 | ((RINTERNAL *)dest)->pgno = rchild->pgno; | |
235 | dest = (char *)h + h->linp[skip - 1]; | |
236 | ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); | |
237 | ((RINTERNAL *)dest)->pgno = lchild->pgno; | |
238 | break; | |
876dbe65 MO |
239 | } |
240 | ||
91b5c9e5 KB |
241 | /* Unpin the held pages. */ |
242 | if (nosplit) { | |
243 | mpool_put(t->bt_mp, h, MPOOL_DIRTY); | |
244 | break; | |
245 | } | |
246 | mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); | |
247 | mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); | |
876dbe65 MO |
248 | } |
249 | ||
91b5c9e5 KB |
250 | /* Unpin the held pages. */ |
251 | mpool_put(t->bt_mp, l, MPOOL_DIRTY); | |
252 | mpool_put(t->bt_mp, r, MPOOL_DIRTY); | |
253 | ||
f1641417 KB |
254 | /* Clear any pages left on the stack. */ |
255 | BT_CLR(t); | |
91b5c9e5 | 256 | return (RET_SUCCESS); |
876dbe65 | 257 | |
91b5c9e5 KB |
258 | /* |
259 | * If something fails in the above loop we were already walking back | |
260 | * up the tree and the tree is now inconsistent. Nothing much we can | |
261 | * do about it but release any memory we're holding. | |
262 | */ | |
263 | err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); | |
264 | mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); | |
876dbe65 | 265 | |
91b5c9e5 KB |
266 | err2: mpool_put(t->bt_mp, l, 0); |
267 | mpool_put(t->bt_mp, r, 0); | |
268 | __dbpanic(t->bt_dbp); | |
269 | return (RET_ERROR); | |
876dbe65 MO |
270 | } |
271 | ||
272 | /* | |
91b5c9e5 | 273 | * BT_PAGE -- Split a non-root page of a btree. |
876dbe65 | 274 | * |
91b5c9e5 KB |
275 | * Parameters: |
276 | * t: tree | |
277 | * h: root page | |
278 | * lp: pointer to left page pointer | |
279 | * rp: pointer to right page pointer | |
280 | * skip: pointer to index to leave open | |
876dbe65 | 281 | * |
91b5c9e5 KB |
282 | * Returns: |
283 | * Pointer to page in which to insert or NULL on error. | |
876dbe65 | 284 | */ |
91b5c9e5 KB |
285 | static PAGE * |
286 | bt_page(t, h, lp, rp, skip) | |
287 | BTREE *t; | |
288 | PAGE *h, **lp, **rp; | |
289 | int *skip; | |
876dbe65 | 290 | { |
91b5c9e5 KB |
291 | PAGE *l, *r, *tp; |
292 | pgno_t npg; | |
293 | ||
294 | #ifdef STATISTICS | |
295 | ++bt_split; | |
296 | #endif | |
297 | /* Put the new right page for the split into place. */ | |
a5d29441 | 298 | if ((r = __bt_new(t, &npg)) == NULL) |
91b5c9e5 KB |
299 | return (NULL); |
300 | r->pgno = npg; | |
301 | r->lower = BTDATAOFF; | |
302 | r->upper = t->bt_psize; | |
303 | r->nextpg = h->nextpg; | |
304 | r->prevpg = h->pgno; | |
305 | r->flags = h->flags & P_TYPE; | |
876dbe65 | 306 | |
91b5c9e5 KB |
307 | /* |
308 | * If we're splitting the last page on a level because we're appending | |
309 | * a key to it (skip is NEXTINDEX()), it's likely that the data is | |
310 | * sorted. Adding an empty page on the side of the level is less work | |
311 | * and can push the fill factor much higher than normal. If we're | |
312 | * wrong it's no big deal, we'll just do the split the right way next | |
313 | * time. It may look like it's equally easy to do a similar hack for | |
314 | * reverse sorted data, that is, split the tree left, but it's not. | |
315 | * Don't even try. | |
316 | */ | |
317 | if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { | |
318 | #ifdef STATISTICS | |
319 | ++bt_sortsplit; | |
320 | #endif | |
321 | h->nextpg = r->pgno; | |
322 | r->lower = BTDATAOFF + sizeof(index_t); | |
323 | *skip = 0; | |
324 | *lp = h; | |
325 | *rp = r; | |
326 | return (r); | |
327 | } | |
876dbe65 | 328 | |
91b5c9e5 KB |
329 | /* Put the new left page for the split into place. */ |
330 | if ((l = malloc(t->bt_psize)) == NULL) { | |
331 | mpool_put(t->bt_mp, r, 0); | |
332 | return (NULL); | |
333 | } | |
334 | l->pgno = h->pgno; | |
335 | l->nextpg = r->pgno; | |
336 | l->prevpg = h->prevpg; | |
337 | l->lower = BTDATAOFF; | |
338 | l->upper = t->bt_psize; | |
339 | l->flags = h->flags & P_TYPE; | |
340 | ||
341 | /* Fix up the previous pointer of the page after the split page. */ | |
342 | if (h->nextpg != P_INVALID) { | |
343 | if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { | |
344 | free(l); | |
345 | /* XXX mpool_free(t->bt_mp, r->pgno); */ | |
346 | return (NULL); | |
876dbe65 | 347 | } |
91b5c9e5 KB |
348 | tp->prevpg = r->pgno; |
349 | mpool_put(t->bt_mp, tp, 0); | |
350 | } | |
876dbe65 | 351 | |
91b5c9e5 KB |
352 | /* |
353 | * Split right. The key/data pairs aren't sorted in the btree page so | |
354 | * it's simpler to copy the data from the split page onto two new pages | |
355 | * instead of copying half the data to the right page and compacting | |
356 | * the left page in place. Since the left page can't change, we have | |
357 | * to swap the original and the allocated left page after the split. | |
358 | */ | |
359 | tp = bt_psplit(t, h, l, r, skip); | |
876dbe65 | 360 | |
91b5c9e5 KB |
361 | /* Move the new left page onto the old left page. */ |
362 | bcopy(l, h, t->bt_psize); | |
363 | if (tp == l) | |
364 | tp = h; | |
365 | free(l); | |
366 | ||
367 | *lp = h; | |
368 | *rp = r; | |
369 | return (tp); | |
876dbe65 MO |
370 | } |
371 | ||
372 | /* | |
f1641417 | 373 | * BT_ROOT -- Split the root page of a btree. |
876dbe65 | 374 | * |
91b5c9e5 KB |
375 | * Parameters: |
376 | * t: tree | |
377 | * h: root page | |
378 | * lp: pointer to left page pointer | |
379 | * rp: pointer to right page pointer | |
380 | * skip: pointer to index to leave open | |
876dbe65 | 381 | * |
91b5c9e5 KB |
382 | * Returns: |
383 | * Pointer to page in which to insert or NULL on error. | |
876dbe65 | 384 | */ |
91b5c9e5 KB |
385 | static PAGE * |
386 | bt_root(t, h, lp, rp, skip) | |
387 | BTREE *t; | |
388 | PAGE *h, **lp, **rp; | |
389 | int *skip; | |
876dbe65 | 390 | { |
91b5c9e5 KB |
391 | PAGE *l, *r, *tp; |
392 | pgno_t lnpg, rnpg; | |
393 | ||
394 | #ifdef STATISTICS | |
395 | ++bt_split; | |
396 | ++bt_rootsplit; | |
397 | #endif | |
398 | /* Put the new left and right pages for the split into place. */ | |
a5d29441 KB |
399 | if ((l = __bt_new(t, &lnpg)) == NULL || |
400 | (r = __bt_new(t, &rnpg)) == NULL) | |
91b5c9e5 KB |
401 | return (NULL); |
402 | l->pgno = lnpg; | |
403 | r->pgno = rnpg; | |
404 | l->nextpg = r->pgno; | |
405 | r->prevpg = l->pgno; | |
406 | l->prevpg = r->nextpg = P_INVALID; | |
407 | l->lower = r->lower = BTDATAOFF; | |
408 | l->upper = r->upper = t->bt_psize; | |
409 | l->flags = r->flags = h->flags & P_TYPE; | |
410 | ||
411 | /* Split the root page. */ | |
412 | tp = bt_psplit(t, h, l, r, skip); | |
413 | ||
414 | /* Make the root page look right. */ | |
415 | if ((ISSET(t, BTF_RECNO) ? | |
416 | bt_rroot(t, h, l, r) : bt_broot(t, h, l, r)) == RET_ERROR) | |
417 | return (NULL); | |
418 | ||
419 | *lp = l; | |
420 | *rp = r; | |
421 | return (tp); | |
876dbe65 MO |
422 | } |
423 | ||
424 | /* | |
91b5c9e5 | 425 | * BT_RROOT -- Fix up the recno root page after the split. |
876dbe65 | 426 | * |
91b5c9e5 KB |
427 | * Parameters: |
428 | * t: tree | |
429 | * h: root page | |
876dbe65 | 430 | * |
91b5c9e5 KB |
431 | * Returns: |
432 | * RET_ERROR, RET_SUCCESS | |
876dbe65 | 433 | */ |
91b5c9e5 KB |
434 | static int |
435 | bt_rroot(t, h, l, r) | |
436 | BTREE *t; | |
437 | PAGE *h, *l, *r; | |
876dbe65 | 438 | { |
91b5c9e5 | 439 | char *dest; |
876dbe65 | 440 | |
91b5c9e5 KB |
441 | /* Insert the left and right keys, set the header information. */ |
442 | h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; | |
443 | dest = (char *)h + h->upper; | |
444 | WR_RINTERNAL(dest, | |
445 | l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); | |
876dbe65 | 446 | |
91b5c9e5 KB |
447 | h->linp[1] = h->upper -= NRINTERNAL; |
448 | dest = (char *)h + h->upper; | |
449 | WR_RINTERNAL(dest, | |
450 | r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); | |
876dbe65 | 451 | |
91b5c9e5 | 452 | h->lower = BTDATAOFF + 2 * sizeof(index_t); |
876dbe65 | 453 | |
91b5c9e5 KB |
454 | /* Unpin the root page, set to recno internal page. */ |
455 | h->flags &= ~P_TYPE; | |
456 | h->flags |= P_RINTERNAL; | |
457 | mpool_put(t->bt_mp, h, MPOOL_DIRTY); | |
876dbe65 | 458 | |
91b5c9e5 KB |
459 | return (RET_SUCCESS); |
460 | } | |
876dbe65 | 461 | |
91b5c9e5 KB |
462 | /* |
463 | * BT_BROOT -- Fix up the btree root page after the split. | |
464 | * | |
465 | * Parameters: | |
466 | * t: tree | |
467 | * h: root page | |
468 | * | |
469 | * Returns: | |
470 | * RET_ERROR, RET_SUCCESS | |
471 | */ | |
472 | static int | |
473 | bt_broot(t, h, l, r) | |
474 | BTREE *t; | |
475 | PAGE *h, *l, *r; | |
476 | { | |
477 | BINTERNAL *bi; | |
478 | BLEAF *bl; | |
479 | size_t nbytes; | |
480 | char *dest; | |
876dbe65 MO |
481 | |
482 | /* | |
91b5c9e5 KB |
483 | * If the root page was a leaf page, change it into an internal page. |
484 | * We copy the key we split on (but not the key's data, in the case of | |
f701460d | 485 | * a leaf page) to the new root page. |
91b5c9e5 KB |
486 | * |
487 | * The btree comparison code guarantees that the left-most key on any | |
488 | * level of the tree is never used, so it doesn't need to be filled | |
489 | * in. (This is not just convenience -- if the insert index is 0, we | |
490 | * don't *have* a key to fill in.) The right key is available because | |
491 | * the split code guarantees not to split on the skipped index. | |
876dbe65 | 492 | */ |
f701460d | 493 | nbytes = NBINTERNAL(0); |
91b5c9e5 KB |
494 | h->linp[0] = h->upper = t->bt_psize - nbytes; |
495 | dest = (char *)h + h->upper; | |
496 | WR_BINTERNAL(dest, 0, l->pgno, 0); | |
497 | ||
498 | switch(h->flags & P_TYPE) { | |
499 | case P_BLEAF: | |
500 | bl = GETBLEAF(r, 0); | |
501 | nbytes = NBINTERNAL(bl->ksize); | |
502 | h->linp[1] = h->upper -= nbytes; | |
503 | dest = (char *)h + h->upper; | |
504 | WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); | |
505 | bcopy(bl->bytes, dest, bl->ksize); | |
506 | ||
f701460d KB |
507 | /* |
508 | * If the key is on an overflow page, mark the overflow chain | |
509 | * so it isn't deleted when the leaf copy of the key is deleted. | |
510 | */ | |
91b5c9e5 KB |
511 | if (bl->flags & P_BIGKEY && |
512 | bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) | |
513 | return (RET_ERROR); | |
514 | break; | |
515 | case P_BINTERNAL: | |
516 | bi = GETBINTERNAL(r, 0); | |
517 | nbytes = NBINTERNAL(bi->ksize); | |
518 | h->linp[1] = h->upper -= nbytes; | |
519 | dest = (char *)h + h->upper; | |
520 | bcopy(bi, dest, nbytes); | |
521 | ((BINTERNAL *)dest)->pgno = r->pgno; | |
522 | break; | |
876dbe65 | 523 | } |
91b5c9e5 | 524 | h->lower = BTDATAOFF + 2 * sizeof(index_t); |
876dbe65 | 525 | |
91b5c9e5 KB |
526 | /* Unpin the root page, set to btree internal page. */ |
527 | h->flags &= ~P_TYPE; | |
528 | h->flags |= P_BINTERNAL; | |
529 | mpool_put(t->bt_mp, h, MPOOL_DIRTY); | |
876dbe65 MO |
530 | |
531 | return (RET_SUCCESS); | |
532 | } | |
533 | ||
534 | /* | |
91b5c9e5 | 535 | * BT_PSPLIT -- Do the real work of splitting the page. |
876dbe65 | 536 | * |
91b5c9e5 KB |
537 | * Parameters: |
538 | * t: tree | |
539 | * h: page to be split | |
540 | * l: page to put lower half of data | |
541 | * r: page to put upper half of data | |
542 | * skip: pointer to index to leave open | |
876dbe65 | 543 | * |
91b5c9e5 KB |
544 | * Returns: |
545 | * Pointer to page in which to insert. | |
876dbe65 | 546 | */ |
91b5c9e5 | 547 | static PAGE * |
f1641417 | 548 | bt_psplit(t, h, l, r, pskip) |
91b5c9e5 KB |
549 | BTREE *t; |
550 | PAGE *h, *l, *r; | |
f1641417 | 551 | int *pskip; |
876dbe65 | 552 | { |
91b5c9e5 KB |
553 | BINTERNAL *bi; |
554 | BLEAF *bl; | |
555 | RLEAF *rl; | |
556 | EPGNO *c; | |
557 | PAGE *rval; | |
f1641417 | 558 | index_t half, skip; |
91b5c9e5 KB |
559 | size_t nbytes; |
560 | void *src; | |
561 | int bigkeycnt, isbigkey, nxt, off, top; | |
876dbe65 MO |
562 | |
563 | /* | |
91b5c9e5 KB |
564 | * Split the data to the left and right pages. Leave the skip index |
565 | * open and guarantee that the split doesn't happen on that index (the | |
566 | * right key must be available for the parent page). Additionally, | |
567 | * make some effort not to split on an overflow key. This makes it | |
568 | * faster to process internal pages and can save space since overflow | |
569 | * keys used by internal pages are never deleted. | |
876dbe65 | 570 | */ |
91b5c9e5 | 571 | bigkeycnt = 0; |
f1641417 | 572 | skip = *pskip; |
91b5c9e5 KB |
573 | half = (t->bt_psize - BTDATAOFF) / 2; |
574 | for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { | |
f1641417 | 575 | if (skip == off) |
91b5c9e5 KB |
576 | continue; |
577 | switch (h->flags & P_TYPE) { | |
578 | case P_BINTERNAL: | |
579 | src = bi = GETBINTERNAL(h, nxt); | |
580 | nbytes = NBINTERNAL(bi->ksize); | |
581 | isbigkey = bi->flags & P_BIGKEY; | |
582 | break; | |
583 | case P_BLEAF: | |
584 | src = bl = GETBLEAF(h, nxt); | |
585 | nbytes = NBLEAF(bl); | |
586 | isbigkey = bl->flags & P_BIGKEY; | |
587 | break; | |
588 | case P_RINTERNAL: | |
589 | src = GETRINTERNAL(h, nxt); | |
590 | nbytes = NRINTERNAL; | |
591 | isbigkey = 0; | |
592 | break; | |
593 | case P_RLEAF: | |
594 | src = rl = GETRLEAF(h, nxt); | |
595 | nbytes = NRLEAF(rl); | |
596 | isbigkey = 0; | |
597 | break; | |
876dbe65 | 598 | } |
91b5c9e5 KB |
599 | ++nxt; |
600 | l->linp[off] = l->upper -= nbytes; | |
601 | bcopy(src, (char *)l + l->upper, nbytes); | |
602 | ||
603 | /* There's no empirical justification for the '3'. */ | |
9f18f0ec KB |
604 | if (half < nbytes) { |
605 | if (skip != off + 1) | |
606 | if (!isbigkey || bigkeycnt == 3) | |
607 | break; | |
608 | else | |
609 | ++bigkeycnt; | |
610 | } else | |
91b5c9e5 KB |
611 | half -= nbytes; |
612 | } | |
613 | l->lower += (off + 1) * sizeof(index_t); | |
876dbe65 | 614 | |
91b5c9e5 | 615 | /* |
f1641417 KB |
616 | * If splitting the page that the cursor was on, the cursor has to be |
617 | * adjusted to point to the same record as before the split. If the | |
618 | * skipped slot and the cursor are both on the left page and the cursor | |
619 | * is on or past the skipped slot, the cursor is incremented by one. | |
620 | * If the skipped slot and the cursor are both on the right page and | |
621 | * the cursor is on or past the skipped slot, the cursor is incremented | |
622 | * by one. If the skipped slot and the cursor aren't on the same page, | |
623 | * the cursor isn't changed. Regardless of the relationship of the | |
624 | * skipped slot and the cursor, if the cursor is on the right page it | |
625 | * is decremented by the number of records split to the left page. | |
626 | * | |
627 | * Don't bother checking for the BTF_SEQINIT flag, the page number will | |
628 | * be P_INVALID. | |
91b5c9e5 KB |
629 | */ |
630 | c = &t->bt_bcursor; | |
631 | if (c->pgno == h->pgno) | |
f1641417 | 632 | if (c->index < off) { /* left page */ |
91b5c9e5 | 633 | c->pgno = l->pgno; |
f1641417 KB |
634 | if (c->index >= skip) |
635 | ++c->index; | |
636 | } else { /* right page */ | |
91b5c9e5 | 637 | c->pgno = r->pgno; |
f1641417 KB |
638 | if (c->index >= skip && skip > off) |
639 | ++c->index; | |
91b5c9e5 | 640 | c->index -= off; |
876dbe65 MO |
641 | } |
642 | ||
91b5c9e5 KB |
643 | /* |
644 | * Decide which page to return, and adjust the skip index if the | |
645 | * to-be-inserted-upon page has changed. | |
646 | */ | |
f1641417 | 647 | if (skip > off) { |
91b5c9e5 | 648 | rval = r; |
f1641417 | 649 | *pskip -= off + 1; |
91b5c9e5 KB |
650 | } else |
651 | rval = l; | |
652 | ||
653 | for (off = 0; nxt < top; ++off) { | |
f1641417 KB |
654 | if (skip == nxt) { |
655 | skip = 0; | |
91b5c9e5 KB |
656 | continue; |
657 | } | |
658 | switch (h->flags & P_TYPE) { | |
659 | case P_BINTERNAL: | |
660 | src = bi = GETBINTERNAL(h, nxt); | |
661 | nbytes = NBINTERNAL(bi->ksize); | |
662 | break; | |
663 | case P_BLEAF: | |
664 | src = bl = GETBLEAF(h, nxt); | |
665 | nbytes = NBLEAF(bl); | |
666 | break; | |
667 | case P_RINTERNAL: | |
668 | src = GETRINTERNAL(h, nxt); | |
669 | nbytes = NRINTERNAL; | |
670 | break; | |
671 | case P_RLEAF: | |
672 | src = rl = GETRLEAF(h, nxt); | |
673 | nbytes = NRLEAF(rl); | |
674 | break; | |
876dbe65 | 675 | } |
91b5c9e5 KB |
676 | ++nxt; |
677 | r->linp[off] = r->upper -= nbytes; | |
678 | bcopy(src, (char *)r + r->upper, nbytes); | |
876dbe65 | 679 | } |
91b5c9e5 | 680 | r->lower += off * sizeof(index_t); |
876dbe65 | 681 | |
91b5c9e5 | 682 | /* If the key is being appended to the page, adjust the index. */ |
f1641417 | 683 | if (skip == top) |
91b5c9e5 | 684 | r->lower += sizeof(index_t); |
876dbe65 | 685 | |
91b5c9e5 KB |
686 | return (rval); |
687 | } | |
688 | ||
689 | /* | |
690 | * BT_PRESERVE -- Mark a chain of pages as used by an internal node. | |
691 | * | |
692 | * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the | |
693 | * record that references them gets deleted. Chains pointed to by internal | |
694 | * pages never get deleted. This routine marks a chain as pointed to by an | |
695 | * internal page. | |
696 | * | |
697 | * Parameters: | |
698 | * t: tree | |
699 | * pg: page number of first page in the chain. | |
700 | * | |
701 | * Returns: | |
702 | * RET_SUCCESS, RET_ERROR. | |
703 | */ | |
704 | static int | |
705 | bt_preserve(t, pg) | |
706 | BTREE *t; | |
707 | pgno_t pg; | |
708 | { | |
709 | PAGE *h; | |
710 | ||
711 | if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) | |
712 | return (RET_ERROR); | |
713 | h->flags |= P_PRESERVE; | |
714 | mpool_put(t->bt_mp, h, MPOOL_DIRTY); | |
876dbe65 MO |
715 | return (RET_SUCCESS); |
716 | } | |
91b5c9e5 KB |
717 | |
718 | /* | |
719 | * REC_TOTAL -- Return the number of recno entries below a page. | |
720 | * | |
721 | * Parameters: | |
722 | * h: page | |
723 | * | |
724 | * Returns: | |
725 | * The number of recno entries below a page. | |
726 | * | |
727 | * XXX | |
728 | * These values could be set by the bt_psplit routine. The problem is that the | |
729 | * entry has to be popped off of the stack etc. or the values have to be passed | |
730 | * all the way back to bt_split/bt_rroot and it's not very clean. | |
731 | */ | |
732 | static recno_t | |
733 | rec_total(h) | |
734 | PAGE *h; | |
735 | { | |
736 | recno_t recs; | |
737 | index_t nxt, top; | |
738 | ||
739 | for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) | |
740 | recs += GETRINTERNAL(h, nxt)->nrecs; | |
741 | return (recs); | |
742 | } |