Add diclaimer of copyright to _osname() manual page.
[unix-history] / lib / libc / db / btree / split.c
CommitLineData
15637ed4
RG
1/*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#if defined(LIBC_SCCS) && !defined(lint)
38static char sccsid[] = "@(#)split.c 5.2 (Berkeley) 2/22/91";
39#endif /* LIBC_SCCS and not lint */
40
41#include <sys/types.h>
42#include <db.h>
43#include <stdlib.h>
44#include <string.h>
45#include "btree.h"
46
47/*
48 * _BT_SPLIT -- Split a page into two pages.
49 *
50 * Splits are caused by insertions, and propogate up the tree in
51 * the usual way. The root page is always page 1 in the file on
52 * disk, so root splits are handled specially. On entry to this
53 * routine, t->bt_curpage is the page to be split.
54 *
55 * Parameters:
56 * t -- btree in which to do split.
57 *
58 * Returns:
59 * RET_SUCCESS, RET_ERROR.
60 *
61 * Side Effects:
62 * Changes the notion of the current page.
63 */
64
65int
66_bt_split(t)
67 BTREE_P t;
68{
69 BTHEADER *h;
70 BTHEADER *left, *right;
71 pgno_t nextpgno, parent;
72 int nbytes, len;
73 IDATUM *id;
74 DATUM *d;
75 char *src;
76 IDATUM *new;
77 pgno_t oldchain;
78 u_char flags;
79
80 h = (BTHEADER *) t->bt_curpage;
81
82 /* split root page specially, since it must remain page 1 */
83 if (h->h_pgno == P_ROOT) {
84 return (_bt_splitroot(t));
85 }
86
87 /*
88 * This is a little complicated. We go to some trouble to
89 * figure out which of the three possible cases -- in-memory tree,
90 * disk tree (no cache), and disk tree (cache) -- we have, in order
91 * to avoid unnecessary copying. If we have a disk cache, then we
92 * have to do some extra copying, though, since the cache code
93 * manages buffers externally to this code.
94 */
95
96 if (ISDISK(t) && ISCACHE(t)) {
97 if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
98 == (BTHEADER *) NULL)
99 return (RET_ERROR);
100 left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
101 left->h_flags = t->bt_curpage->h_flags;
102 left->h_lower = (index_t)
103 (((char *) &(left->h_linp[0])) - ((char *) left));
104 left->h_upper = t->bt_psize;
105
106 } else {
107 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
108 return (RET_ERROR);
109 }
110 left->h_pgno = h->h_pgno;
111
112 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
113 return (RET_ERROR);
114 right->h_pgno = ++(t->bt_npages);
115
116 /* now do the split */
117 if (_bt_dopsplit(t, left, right) == RET_ERROR)
118 return (RET_ERROR);
119
120 right->h_prevpg = left->h_pgno;
121 nextpgno = right->h_nextpg = h->h_nextpg;
122 left->h_nextpg = right->h_pgno;
123 left->h_prevpg = h->h_prevpg;
124
125 /* okay, now use the left half of the page as the new page */
126 if (ISDISK(t) && ISCACHE(t)) {
127 (void) bcopy((char *) left, (char *) t->bt_curpage,
128 (int) t->bt_psize);
129 (void) free ((char *) left);
130 left = t->bt_curpage;
131 } else {
132 (void) free((char *) t->bt_curpage);
133 t->bt_curpage = left;
134 }
135
136 /*
137 * Write the new pages out. We need them to stay where they are
138 * until we're done updating the parent pages.
139 */
140
141 if (_bt_write(t, left, NORELEASE) == RET_ERROR)
142 return (RET_ERROR);
143 if (_bt_write(t, right, NORELEASE) == RET_ERROR)
144 return (RET_ERROR);
145
146 /* update 'prev' pointer of old neighbor of left */
147 if (nextpgno != P_NONE) {
148 if (_bt_getpage(t, nextpgno) == RET_ERROR)
149 return (RET_ERROR);
150 h = t->bt_curpage;
151 h->h_prevpg = right->h_pgno;
152 h->h_flags |= F_DIRTY;
153 }
154
155 if ((parent = _bt_pop(t)) != P_NONE) {
156 if (right->h_flags & F_LEAF) {
157 d = (DATUM *) GETDATUM(right, 0);
158 len = d->d_ksize;
159 if (d->d_flags & D_BIGKEY) {
160 bcopy(&(d->d_bytes[0]),
161 (char *) &oldchain,
162 sizeof(oldchain));
163 if (_bt_markchain(t, oldchain) == RET_ERROR)
164 return (RET_ERROR);
165 src = (char *) &oldchain;
166 flags = D_BIGKEY;
167 } else {
168 src = (char *) &(d->d_bytes[0]);
169 flags = 0;
170 }
171 } else {
172 id = (IDATUM *) GETDATUM(right, 0);
173 len = id->i_size;
174 flags = id->i_flags;
175 src = (char *) &(id->i_bytes[0]);
176 }
177 nbytes = len + (sizeof(IDATUM) - sizeof(char));
178 new = (IDATUM *) malloc((unsigned) nbytes);
179 if (new == (IDATUM *) NULL)
180 return (RET_ERROR);
181 new->i_size = len;
182 new->i_pgno = right->h_pgno;
183 new->i_flags = flags;
184 if (len > 0)
185 (void) bcopy(src, (char *) &(new->i_bytes[0]), len);
186
187 nbytes = LONGALIGN(nbytes) + sizeof(index_t);
188 if (_bt_getpage(t, parent) == RET_ERROR)
189 return (RET_ERROR);
190
191 h = t->bt_curpage;
192
193 /*
194 * Split the parent if we need to, then reposition the
195 * tree's current page pointer for the new datum.
196 */
197 if ((h->h_upper - h->h_lower) < nbytes) {
198 if (_bt_split(t) == RET_ERROR)
199 return (RET_ERROR);
200 if (_bt_reposition(t, new, parent, right->h_prevpg)
201 == RET_ERROR)
202 return (RET_ERROR);
203 }
204
205 /* okay, now insert the new idatum */
206 if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
207 return (RET_ERROR);
208 }
209
210 /*
211 * Okay, split is done; don't need right page stapled down anymore.
212 * The page we call 'left' above is the new version of the old
213 * (split) page, so we can't release it.
214 */
215
216 if (_bt_release(t, right) == RET_ERROR)
217 return (RET_ERROR);
218 if (ISDISK(t) && !ISCACHE(t))
219 (void) free((char *) right);
220
221 return (RET_SUCCESS);
222}
223
224/*
225 * _BT_REPOSITION -- Reposition the current page pointer of a btree.
226 *
227 * After splitting a node in the tree in order to make room for
228 * an insertion, we need to figure out which page after the split
229 * should get the item we want to insert. This routine positions
230 * the tree's current page pointer appropriately.
231 *
232 * Parameters:
233 * t -- tree to position
234 * new -- the item we want to insert
235 * parent -- parent of the node that we just split
236 * prev -- page number of item directly to the left of
237 * new's position in the tree.
238 *
239 * Returns:
240 * RET_SUCCESS, RET_ERROR.
241 *
242 * Side Effects:
243 * None.
244 */
245
246int
247_bt_reposition(t, new, parent, prev)
248 BTREE_P t;
249 IDATUM *new;
250 pgno_t parent;
251 pgno_t prev;
252{
253 index_t i, next;
254 IDATUM *idx;
255
256 if (parent == P_ROOT) {
257
258 /*
259 * If we just split the root page, then there are guaranteed
260 * to be exactly two IDATUMs on it. Look at both of them
261 * to see if they point to the page that we want.
262 */
263
264 if (_bt_getpage(t, parent) == RET_ERROR)
265 return (RET_ERROR);
266
267 next = NEXTINDEX(t->bt_curpage);
268 for (i = 0; i < next; i++) {
269 idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
270 if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
271 return (RET_ERROR);
272 if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
273 return (RET_SUCCESS);
274 if (_bt_getpage(t, parent) == RET_ERROR)
275 return (RET_ERROR);
276 }
277 } else {
278
279 /*
280 * Get the parent page -- which is where the new item would
281 * have gone -- and figure out whether the new item now goes
282 * on the parent, or the page immediately to the right of
283 * the parent.
284 */
285
286 if (_bt_getpage(t, parent) == RET_ERROR)
287 return (RET_ERROR);
288 if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
289 return (RET_SUCCESS);
290 if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
291 return (RET_ERROR);
292 if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
293 return (RET_SUCCESS);
294 }
295 return (RET_ERROR);
296}
297
298/*
299 * _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
300 *
301 * This routine is used by _bt_reposition to decide whether the current
302 * page is the correct one on which to insert a new item.
303 *
304 * Parameters:
305 * t -- tree to check
306 * new -- the item that will be inserted (used for binary search)
307 * prev -- page number of page whose IDATUM is immediately to
308 * the left of new's position in the tree.
309 *
310 * Returns:
311 * RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
312 */
313
314int
315_bt_isonpage(t, new, prev)
316 BTREE_P t;
317 IDATUM *new;
318 pgno_t prev;
319{
320 BTHEADER *h = (BTHEADER *) t->bt_curpage;
321 index_t i, next;
322 IDATUM *idx;
323
324 i = _bt_binsrch(t, &(new->i_bytes[0]));
325 while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
326 --i;
327 next = NEXTINDEX(h);
328 idx = (IDATUM *) GETDATUM(h, i);
329 while (i < next && idx->i_pgno != prev) {
330 i++;
331 idx = (IDATUM *) GETDATUM(h, i);
332 }
333 if (idx->i_pgno == prev)
334 return (RET_SUCCESS);
335 else
336 return (RET_ERROR);
337}
338
339/*
340 * _BT_SPLITROOT -- Split the root of a btree.
341 *
342 * The root page for a btree is always page one. This means that in
343 * order to split the root, we need to do extra work.
344 *
345 * Parameters:
346 * t -- tree to split
347 *
348 * Returns:
349 * RET_SUCCESS, RET_ERROR.
350 *
351 * Side Effects:
352 * Splits root upward in the usual way, adding two new pages
353 * to the tree (rather than just one, as in usual splits).
354 */
355
356int
357_bt_splitroot(t)
358 BTREE_P t;
359{
360 BTHEADER *h = t->bt_curpage;
361 BTHEADER *left, *right;
362 IDATUM *id;
363 BTHEADER *where_h;
364 char *src, *dest;
365 int len, nbytes;
366 u_long was_leaf;
367 pgno_t oldchain;
368 u_char flags;
369
370 /* get two new pages for the split */
371 if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
372 return (RET_ERROR);
373 left->h_pgno = ++(t->bt_npages);
374 if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
375 return (RET_ERROR);
376 right->h_pgno = ++(t->bt_npages);
377
378 /* do the split */
379 if (_bt_dopsplit(t, left, right) == RET_ERROR)
380 return (RET_ERROR);
381
382 /* connect the new pages correctly */
383 right->h_prevpg = left->h_pgno;
384 left->h_nextpg = right->h_pgno;
385
386 /*
387 * Write the child pages out now. We need them to remain
388 * where they are until we finish updating parent pages,
389 * however.
390 */
391
392 if (_bt_write(t, left, NORELEASE) == RET_ERROR)
393 return (RET_ERROR);
394 if (_bt_write(t, right, NORELEASE) == RET_ERROR)
395 return (RET_ERROR);
396
397 /* now change the root page into an internal page */
398 was_leaf = (h->h_flags & F_LEAF);
399 h->h_flags &= ~F_LEAF;
400 h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
401 h->h_upper = (index_t) t->bt_psize;
402 (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
403
404 /* put two new keys on root page */
405 where_h = left;
406 while (where_h) {
407 DATUM *data;
408 IDATUM *idata;
409
410 if (was_leaf) {
411 data = (DATUM *) GETDATUM(where_h, 0);
412
413 if (where_h == left) {
414 len = 0; /* first key in tree is null */
415 } else {
416 if (data->d_flags & D_BIGKEY) {
417 bcopy(&(data->d_bytes[0]),
418 (char *) &oldchain,
419 sizeof(oldchain));
420 if (_bt_markchain(t, oldchain) == RET_ERROR)
421 return (RET_ERROR);
422 src = (char *) &oldchain;
423 flags = D_BIGKEY;
424 } else {
425 src = (char *) &(data->d_bytes[0]);
426 flags = 0;
427 }
428 len = data->d_ksize;
429 }
430 } else {
431 idata = (IDATUM *) GETDATUM(where_h, 0);
432 len = idata->i_size;
433 flags = idata->i_flags;
434 src = &(idata->i_bytes[0]);
435 }
436 dest = ((char *) h) + h->h_upper;
437 nbytes = len + (sizeof (IDATUM) - sizeof(char));
438 dest -= LONGALIGN(nbytes);
439 id = (IDATUM *) dest;
440 id->i_size = len;
441 id->i_pgno = where_h->h_pgno;
442 id->i_flags = flags;
443 if (len > 0)
444 (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
445 dest -= ((int) h);
446 h->h_linp[NEXTINDEX(h)] = (index_t) dest;
447 h->h_upper = (index_t) dest;
448 h->h_lower += sizeof(index_t);
449
450 /* next page */
451 if (where_h == left)
452 where_h = right;
453 else
454 where_h = (BTHEADER *) NULL;
455 }
456
457 if (_bt_release(t, left) == RET_ERROR)
458 return (RET_ERROR);
459 if (_bt_release(t, right) == RET_ERROR)
460 return (RET_ERROR);
461
462 /*
463 * That's it, split is done. If we're doing a non-cached disk
464 * btree, we can free up the pages we allocated, as they're on
465 * disk, now.
466 */
467
468 if (ISDISK(t) && !ISCACHE(t)) {
469 (void) free ((char *) left);
470 (void) free ((char *) right);
471 }
472
473 h->h_flags |= F_DIRTY;
474
475 return (RET_SUCCESS);
476}
477
478/*
479 * _BT_DOPSPLIT -- Do the work of splitting a page
480 *
481 * This routine takes two page pointers and splits the data on the
482 * current page of the btree between them.
483 *
484 * We do a lot of work here to handle duplicate keys on a page; we
485 * have to place these keys carefully to guarantee that later searches
486 * will find them correctly. See comments in the code below for details.
487 *
488 * Parameters:
489 * t -- tree to split
490 * left -- pointer to page to get left half of the data
491 * right -- pointer to page to get right half of the data
492 *
493 * Returns:
494 * None.
495 */
496
497int
498_bt_dopsplit(t, left, right)
499 BTREE_P t;
500 BTHEADER *left;
501 BTHEADER *right;
502{
503 BTHEADER *h = t->bt_curpage;
504 size_t psize;
505 char *where;
506 BTHEADER *where_h;
507 index_t where_i;
508 int nbytes, dsize, fixedsize, freespc;
509 index_t i;
510 index_t save_lower, save_upper, save_i;
511 index_t switch_i;
512 char *save_key;
513 DATUM *d;
514 CURSOR *c;
515 index_t top;
516 int free_save;
517 pgno_t chain;
518 int ignore;
519
520 /*
521 * Our strategy is to put half the bytes on each page. We figure
522 * out how many bytes we have total, and then move items until
523 * the last item moved put at least 50% of the data on the left
524 * page.
525 */
526 save_key = (char *) NULL;
527 psize = (int) t->bt_psize;
528 where = ((char *) left) + psize;
529 where_h = left;
530 where_i = 0;
531 nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
532 freespc = nbytes;
533
534 top = NEXTINDEX(h);
535 if (h->h_flags & F_LEAF)
536 fixedsize = (sizeof(DATUM) - sizeof(char));
537 else
538 fixedsize = (sizeof(IDATUM) - sizeof(char));
539
540 save_key = (char *) NULL;
541
542 /* move data */
543 for (i = 0; i < top; i++) {
544
545 /*
546 * Internal and leaf pages have different layouts for
547 * data items, but in both cases the first entry in the
548 * data item is a size_t.
549 */
550 d = (DATUM *) GETDATUM(h,i);
551 if (h->h_flags & F_LEAF) {
552 dsize = d->d_ksize + d->d_dsize + fixedsize;
553 } else {
554 dsize = d->d_ksize + fixedsize;
555 }
556
557 /*
558 * If a page contains duplicate keys, we have to be
559 * careful about splits. A sequence of duplicate keys
560 * may not begin in the middle of one page and end in
561 * the middle of another; it must begin on a page boundary,
562 * in order for searches on the internal nodes to work
563 * correctly.
564 */
565 if (where_h == left) {
566 if (save_key == (char *) NULL) {
567 if (h->h_flags & F_LEAF) {
568 if (d->d_flags & D_BIGKEY) {
569 free_save = TRUE;
570 bcopy(&(d->d_bytes[0]),
571 (char *) &chain,
572 sizeof(chain));
573 if (_bt_getbig(t, chain,
574 &save_key,
575 &ignore)
576 == RET_ERROR)
577 return (RET_ERROR);
578 } else {
579 free_save = FALSE;
580 save_key = (char *) &(d->d_bytes[0]);
581 }
582 } else {
583 IDATUM *id = (IDATUM *) d;
584
585 if (id->i_flags & D_BIGKEY) {
586 free_save = TRUE;
587 bcopy(&(id->i_bytes[0]),
588 (char *) &chain,
589 sizeof(chain));
590 if (_bt_getbig(t, chain,
591 &save_key,
592 &ignore)
593 == RET_ERROR)
594 return (RET_ERROR);
595 } else {
596 free_save = FALSE;
597 save_key = (char *)
598 &(id->i_bytes[0]);
599 }
600 }
601 save_i = 0;
602 save_lower = where_h->h_lower;
603 save_upper = where_h->h_upper;
604 } else {
605 if (_bt_cmp(t, save_key, i) != 0) {
606 save_lower = where_h->h_lower;
607 save_upper = where_h->h_upper;
608 save_i = i;
609 }
610 if (h->h_flags & F_LEAF) {
611 if (free_save)
612 (void) free(save_key);
613 if (d->d_flags & D_BIGKEY) {
614 free_save = TRUE;
615 bcopy(&(d->d_bytes[0]),
616 (char *) &chain,
617 sizeof(chain));
618 if (_bt_getbig(t, chain,
619 &save_key,
620 &ignore)
621 == RET_ERROR)
622 return (RET_ERROR);
623 } else {
624 free_save = FALSE;
625 save_key = (char *) &(d->d_bytes[0]);
626 }
627 } else {
628 IDATUM *id = (IDATUM *) d;
629
630 if (id->i_flags & D_BIGKEY) {
631 free_save = TRUE;
632 bcopy(&(id->i_bytes[0]),
633 (char *) &chain,
634 sizeof(chain));
635 if (_bt_getbig(t, chain,
636 &save_key,
637 &ignore)
638 == RET_ERROR)
639 return (RET_ERROR);
640 } else {
641 free_save = FALSE;
642 save_key = (char *)
643 &(id->i_bytes[0]);
644 }
645 }
646 }
647 }
648
649 /* copy data and update page state */
650 where -= LONGALIGN(dsize);
651 (void) bcopy((char *) d, (char *) where, dsize);
652 where_h->h_upper = where_h->h_linp[where_i] =
653 (index_t) (where - (int) where_h);
654 where_h->h_lower += sizeof(index_t);
655 where_i++;
656
657 /* if we've moved half, switch to the right-hand page */
658 nbytes -= LONGALIGN(dsize) + sizeof(index_t);
659 if ((freespc - nbytes) > nbytes) {
660 nbytes = 2 * freespc;
661
662 /* identical keys go on the same page */
663 if (save_i > 0) {
664 /* i gets incremented at loop bottom... */
665 i = save_i - 1;
666 where_h->h_lower = save_lower;
667 where_h->h_upper = save_upper;
668 }
669 where = ((char *) right) + psize;
670 where_h = right;
671 switch_i = where_i;
672 where_i = 0;
673 }
674 }
675
676 /*
677 * If there was an active scan on the database, and we just
678 * split the page that the cursor was on, we may need to
679 * adjust the cursor to point to the same entry as before the
680 * split.
681 */
682
683 c = &(t->bt_cursor);
684 if ((t->bt_flags & BTF_SEQINIT)
685 && (c->c_pgno == h->h_pgno)
686 && (c->c_index >= switch_i)) {
687 c->c_pgno = where_h->h_pgno;
688 c->c_index -= where_i;
689 }
690 return (RET_SUCCESS);
691}