Rev #2. Redo the btree code add recno's and checkpoint
author Keith Bostic <bostic@ucbvax.Berkeley.EDU>
Thu, 5 Sep 1991 03:54:11 +0000 (19:54 -0800)
committer Keith Bostic <bostic@ucbvax.Berkeley.EDU>
Thu, 5 Sep 1991 03:54:11 +0000 (19:54 -0800)
SCCS-vsn: lib/libc/db/btree/Makefile.inc 5.4
SCCS-vsn: lib/libc/db/btree/bt_delete.c 5.3
SCCS-vsn: lib/libc/db/btree/bt_get.c 5.3
SCCS-vsn: lib/libc/db/btree/bt_open.c 5.10
SCCS-vsn: lib/libc/db/btree/bt_overflow.c 5.3
SCCS-vsn: lib/libc/db/btree/bt_put.c 5.4
SCCS-vsn: lib/libc/db/btree/bt_search.c 5.3
SCCS-vsn: lib/libc/db/btree/bt_seq.c 5.5
SCCS-vsn: lib/libc/db/btree/bt_split.c 5.3
SCCS-vsn: lib/libc/db/btree/bt_utils.c 5.4
SCCS-vsn: lib/libc/db/btree/btree.h 5.3
SCCS-vsn: lib/libc/db/btree/bt_close.c 5.1
SCCS-vsn: lib/libc/db/btree/bt_conv.c 5.1
SCCS-vsn: lib/libc/db/btree/bt_debug.c 5.1
SCCS-vsn: lib/libc/db/btree/bt_stack.c 5.1
SCCS-vsn: lib/libc/db/btree/extern.h 5.1
SCCS-vsn: lib/libc/db/test/btree.tests/main.c 5.2
SCCS-vsn: lib/libc/db/db/db.c 5.1
SCCS-vsn: lib/libc/db/recno/extern.h 5.1
SCCS-vsn: lib/libc/db/recno/rec_close.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_delete.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_get.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_open.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_put.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_search.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_seq.c 5.1
SCCS-vsn: lib/libc/db/recno/rec_utils.c 5.1

27 files changed:
usr/src/lib/libc/db/btree/Makefile.inc
usr/src/lib/libc/db/btree/bt_close.c [new file with mode: 0644]
usr/src/lib/libc/db/btree/bt_conv.c [new file with mode: 0644]
usr/src/lib/libc/db/btree/bt_debug.c [new file with mode: 0644]
usr/src/lib/libc/db/btree/bt_delete.c
usr/src/lib/libc/db/btree/bt_get.c
usr/src/lib/libc/db/btree/bt_open.c
usr/src/lib/libc/db/btree/bt_overflow.c
usr/src/lib/libc/db/btree/bt_put.c
usr/src/lib/libc/db/btree/bt_search.c
usr/src/lib/libc/db/btree/bt_seq.c
usr/src/lib/libc/db/btree/bt_split.c
usr/src/lib/libc/db/btree/bt_stack.c [new file with mode: 0644]
usr/src/lib/libc/db/btree/bt_utils.c
usr/src/lib/libc/db/btree/btree.h
usr/src/lib/libc/db/btree/extern.h [new file with mode: 0644]
usr/src/lib/libc/db/db/db.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/extern.h [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_close.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_delete.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_get.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_open.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_put.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_search.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_seq.c [new file with mode: 0644]
usr/src/lib/libc/db/recno/rec_utils.c [new file with mode: 0644]
usr/src/lib/libc/db/test/btree.tests/main.c

index 99d7eee..33951e6 100644 (file)
@@ -1,8 +1,7 @@
-#      @(#)Makefile.inc        5.3 (Berkeley) %G%
-
-# btree sources
+#      @(#)Makefile.inc        5.4 (Berkeley) %G%
 
 .PATH: ${.CURDIR}/db/btree
 
 
 .PATH: ${.CURDIR}/db/btree
 
-SRCS+= big.c btree.c delete.c insert.c lrucache.c lruhash.c lrutils.c \
-       search.c seq.c split.c storage.c updutils.c utils.c
+SRCS+= bt_close.c bt_conv.c bt_debug.c bt_delete.c bt_get.c bt_open.c \
+       bt_overflow.c bt_put.c bt_search.c bt_seq.c bt_split.c bt_stack.c \
+       bt_utils.c
diff --git a/usr/src/lib/libc/db/btree/bt_close.c b/usr/src/lib/libc/db/btree/bt_close.c
new file mode 100644 (file)
index 0000000..5130dc8
--- /dev/null
@@ -0,0 +1,150 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_close.c 5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <errno.h>
+#include <db.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "btree.h"
+
+static int bt_meta __P((BTREE *));
+
+/*
+ * BT_CLOSE -- Close a btree.
+ *
+ * Completes any deferred cursor delete, flushes the tree, closes the
+ * memory pool, frees the tree's buffers and the DB handle, and closes
+ * the underlying file descriptor.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
+ *
+ * If the cursor delete, the sync or the pool close fails, the routine
+ * returns RET_ERROR before anything has been freed.
+ */
+int
+__bt_close(dbp)
+       DB *dbp;
+{
+       BTREE *t;
+       int fd;
+
+       t = dbp->internal;
+
+       /*
+        * Delete any already deleted record that we've been saving
+        * because the cursor pointed to it.
+        */
+       if (ISSET(t, BTF_DELCRSR) && __bt_crsrdel(t, &t->bt_bcursor))
+               return (RET_ERROR);
+
+       /*
+        * __bt_sync() fails with EPERM on a read-only tree.  A read-only
+        * tree has nothing to flush, so skip the sync rather than let
+        * every close of such a tree fail.
+        */
+       if (!ISSET(t, BTF_RDONLY) && __bt_sync(dbp) == RET_ERROR)
+               return (RET_ERROR);
+
+       if (mpool_close(t->bt_mp) == RET_ERROR)
+               return (RET_ERROR);
+
+       /* The scratch buffers are only freed if they were ever allocated. */
+       if (t->bt_stack)
+               free(t->bt_stack);
+       if (t->bt_kbuf)
+               free(t->bt_kbuf);
+       if (t->bt_dbuf)
+               free(t->bt_dbuf);
+
+       /* Save the descriptor before the structure holding it is freed. */
+       fd = t->bt_fd;
+       free(t);
+       free(dbp);
+       return (close(fd) ? RET_ERROR : RET_SUCCESS);
+}
+
+/*
+ * BT_SYNC -- flush the btree to disk.
+ *
+ * An in-memory tree has nothing to flush and reports success at once.
+ * A read-only tree is rejected with errno set to EPERM.  Otherwise the
+ * meta data page is rewritten if it is marked dirty and the memory pool
+ * is synced; on success the tree's modified flag is cleared.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.
+ *
+ * XXX
+ * Currently don't handle a key marked for deletion when the tree is synced.
+ * Should copy the page and write it out instead of the real page.
+ */
+int
+__bt_sync(dbp)
+       const DB *dbp;
+{
+       BTREE *t;
+       int rval;
+
+       t = dbp->internal;
+
+       /* Nothing backs an in-memory tree. */
+       if (ISSET(t, BTF_INMEM))
+               return (RET_SUCCESS);
+
+       /* Refuse to write a tree opened read-only. */
+       if (ISSET(t, BTF_RDONLY)) {
+               errno = EPERM;
+               return (RET_ERROR);
+       }
+
+       /* Rewrite the meta data page first if it is out of date. */
+       if (ISSET(t, BTF_METADIRTY) && bt_meta(t) == RET_ERROR)
+               return (RET_ERROR);
+
+       /* The tree only counts as clean once the pool sync succeeds. */
+       rval = mpool_sync(t->bt_mp);
+       if (rval == RET_SUCCESS)
+               UNSET(t, BTF_MODIFIED);
+       return (rval);
+}
+
+/*
+ * BT_META -- write the tree meta data to disk.
+ *
+ * Builds a BTMETA record from the in-core tree, converts it to the
+ * tree's byte order if that differs from the host's, copies it onto
+ * the meta data page and returns that page to the pool marked dirty.
+ *
+ * Parameters:
+ *     t:      tree
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
+ */
+static int
+bt_meta(t)
+       BTREE *t;
+{
+       BTMETA hdr;
+       void *metapg;
+
+       metapg = mpool_get(t->bt_mp, P_META, 0);
+       if (metapg == NULL)
+               return (RET_ERROR);
+
+       /* Fill in meta structure -- lorder MUST be host-independent. */
+       hdr.m_lorder = (u_long)htonl((long)t->bt_lorder);
+       hdr.m_magic = BTREEMAGIC;
+       hdr.m_version = BTREEVERSION;
+       hdr.m_psize = t->bt_psize;
+       hdr.m_free = 0;         /* XXX */
+       hdr.m_nrecs = t->bt_nrecs;
+       hdr.m_flags = t->bt_flags & SAVEMETA;
+
+       /* Every field except m_lorder is stored in the tree's byte order. */
+       if (t->bt_lorder != BYTE_ORDER) {
+               BLSWAP(hdr.m_magic);
+               BLSWAP(hdr.m_version);
+               BLSWAP(hdr.m_psize);
+               BLSWAP(hdr.m_free);
+               BLSWAP(hdr.m_nrecs);
+               BLSWAP(hdr.m_flags);
+       }
+
+       bcopy(&hdr, metapg, sizeof(BTMETA));
+       mpool_put(t->bt_mp, metapg, MPOOL_DIRTY);
+       return (RET_SUCCESS);
+}
diff --git a/usr/src/lib/libc/db/btree/bt_conv.c b/usr/src/lib/libc/db/btree/bt_conv.c
new file mode 100644 (file)
index 0000000..f17989f
--- /dev/null
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_conv.c  5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <db.h>
+#include <stdio.h>
+#include "btree.h"
+
+/*
+ * __BT_PGIN, __BT_PGOUT --
+ *     Convert a page between the byte order stored on disk and the
+ *     host's byte order.  Nothing is done when the tree's byte order
+ *     already matches the host's.
+ *
+ * Parameters:
+ *     t:      tree (passed as an untyped pointer)
+ *     pg:     page number (not referenced by these routines)
+ *     p:      page to convert
+ *
+ * Side Effects:
+ *     Layout of tree metadata on the page is changed in place.
+ *
+ * Warnings:
+ *     Everywhere else in the code, the types pgno_t and index_t are
+ *     opaque.  These two routines know what they really are.
+ */
+void
+__bt_pgin(t, pg, p)
+       void *t;
+       pgno_t pg;
+       void *p;
+{
+       register BINTERNAL *ip;
+       register BLEAF *lp;
+       register int n, nent;
+       PAGE *pagep;
+
+       if (((BTREE *)t)->bt_lorder == BYTE_ORDER)
+               return;
+
+       /* The header comes first: the entry count is derived from it. */
+       pagep = p;
+       BLSWAP(pagep->pgno);
+       BLSWAP(pagep->prevpg);
+       BLSWAP(pagep->nextpg);
+       BLSWAP(pagep->flags);
+       BLSWAP(pagep->lower);
+       BLSWAP(pagep->upper);
+
+       nent = NEXTINDEX(pagep);
+       if (pagep->flags & (P_BLEAF | P_RLEAF)) {
+               if (pagep->flags & P_OVERFLOW)
+                       return;
+               for (n = 0; n < nent; ++n) {
+                       /* Swap the index slot before using it as an offset. */
+                       BLSWAP(pagep->linp[n]);
+                       lp = GETBLEAF(pagep, n);
+                       BLSWAP(lp->dsize);
+                       BLSWAP(lp->ksize);
+                       BLSWAP(lp->flags);
+                       if (lp->flags & P_BIGKEY)
+                               BLSWAP(*(long *)lp->bytes);
+                       if (lp->flags & P_BIGDATA)
+                               BLSWAP(*(long *)(lp->bytes + lp->ksize));
+               }
+               return;
+       }
+
+       for (n = 0; n < nent; ++n) {
+               /* Swap the index slot before using it as an offset. */
+               BLSWAP(pagep->linp[n]);
+               ip = GETBINTERNAL(pagep, n);
+               BLSWAP(ip->ksize);
+               BLSWAP(ip->pgno);
+               BLSWAP(ip->flags);
+               if (ip->flags & P_BIGKEY)
+                       BLSWAP(*(long *)ip->bytes);
+       }
+}
+
+/*
+ * __BT_PGOUT -- Convert a page from host byte order to the tree's byte
+ *     order before it is written out.
+ *
+ * Parameters:
+ *     t:      tree (passed as an untyped pointer)
+ *     pg:     page number (not referenced)
+ *     p:      page to convert, modified in place
+ *
+ * Every value used to find something else on the page (the entry
+ * count, the index slots, the key size) is read while it is still in
+ * host order and swapped only afterwards.
+ *
+ * NOTE(review): __bt_pgin() swaps the per-entry flags field and this
+ * routine does not.  Whether that is a bug depends on the width of the
+ * field, which is declared in btree.h -- confirm there.
+ */
+void
+__bt_pgout(t, pg, p)
+       void *t;
+       pgno_t pg;
+       void *p;
+{
+       register BINTERNAL *bi;
+       register BLEAF *bl;
+       register int i, top;
+       PAGE *h;
+
+       if (((BTREE *)t)->bt_lorder == BYTE_ORDER)
+               return;
+
+       h = p;
+       top = NEXTINDEX(h);
+       if (!(h->flags & (P_BLEAF | P_RLEAF)))
+               for (i = 0; i < top; i++) {
+                       bi = GETBINTERNAL(h, i);
+                       if (bi->flags & P_BIGKEY)
+                               BLSWAP(*(long *)bi->bytes);
+                       BLSWAP(bi->ksize);
+                       BLSWAP(bi->pgno);
+                       BLSWAP(h->linp[i]);
+               }
+       else if (!(h->flags & P_OVERFLOW))
+               for (i = 0; i < top; i++) {
+                       bl = GETBLEAF(h, i);
+                       /*
+                        * The big-data reference is located through
+                        * ksize, so convert both overflow references
+                        * before ksize itself is swapped.
+                        */
+                       if (bl->flags & P_BIGKEY)
+                               BLSWAP(*(long *)bl->bytes);
+                       if (bl->flags & P_BIGDATA)
+                               BLSWAP(*(long *)(bl->bytes + bl->ksize));
+                       BLSWAP(bl->ksize);
+                       BLSWAP(bl->dsize);
+                       BLSWAP(h->linp[i]);
+               }
+       /* The page header goes last; h->flags was needed above. */
+       BLSWAP(h->pgno);
+       BLSWAP(h->prevpg);
+       BLSWAP(h->nextpg);
+       BLSWAP(h->flags);
+       BLSWAP(h->lower);
+       BLSWAP(h->upper);
+}
diff --git a/usr/src/lib/libc/db/btree/bt_debug.c b/usr/src/lib/libc/db/btree/bt_debug.c
new file mode 100644 (file)
index 0000000..a34d792
--- /dev/null
@@ -0,0 +1,245 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_debug.c 5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <db.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "btree.h"
+
+#ifdef DEBUG
+/*
+ * BT_DUMP -- Dump the tree
+ *
+ * Parameters:
+ *     dbp:    pointer to the DB
+ */
+void
+__bt_dump(dbp)
+       DB *dbp;
+{
+       BTREE *t;
+       PAGE *h;
+       pgno_t i;
+       char *sep;
+
+       t = dbp->internal;
+       (void)fprintf(stderr, "%s: pgsz %d", 
+           ISSET(t, BTF_INMEM) ? "memory" : "disk", t->bt_psize);
+       if (ISSET(t, BTF_RECNO))
+               (void)fprintf(stderr, " keys %lu", t->bt_nrecs);
+#define        X(flag, name) \
+       if (ISSET(t, flag)) { \
+               (void)fprintf(stderr, "%s%s", sep, name); \
+               sep = ", "; \
+       }
+       if (t->bt_flags) {
+               sep = " flags (";
+               X(BTF_DELCRSR,  "DELCRSR");
+               X(BTF_FIXEDLEN, "FIXEDLEN");
+               X(BTF_INMEM,    "INMEM");
+               X(BTF_NODUPS,   "NODUPS");
+               X(BTF_RDONLY,   "RDONLY");
+               X(BTF_RECNO,    "RECNO");
+               X(BTF_SEQINIT,  "SEQINIT");
+               X(BTF_METADIRTY,"METADIRTY");
+               (void)fprintf(stderr, ")\n");
+       }
+#undef X
+
+       for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
+               __bt_dpage(h);
+               (void)mpool_put(t->bt_mp, h, 0);
+       }
+}
+
+/*
+ * BT_DPAGE -- Dump the page
+ *
+ * Writes the page number, type flags and sibling links to stderr and,
+ * for pages that are not overflow pages, one line per entry.
+ *
+ * Parameters:
+ *     h:      pointer to the PAGE
+ *
+ * Keys and data are stored back to back on the page with no
+ * terminator, so they are printed with an explicit length taken from
+ * the entry's ksize/dsize rather than with a bare %s.
+ */
+void
+__bt_dpage(h)
+       PAGE *h;
+{
+       BINTERNAL *bi;
+       BLEAF *bl;
+       RINTERNAL *ri;
+       RLEAF *rl;
+       index_t cur, top;
+       char *sep;
+
+       (void)fprintf(stderr, "    page %d: (", h->pgno);
+#define        X(flag, name) \
+       if (h->flags & flag) { \
+               (void)fprintf(stderr, "%s%s", sep, name); \
+               sep = ", "; \
+       }
+       sep = "";
+       X(P_BINTERNAL,  "BINTERNAL")            /* types */
+       X(P_BLEAF,      "BLEAF")
+       X(P_RINTERNAL,  "RINTERNAL")            /* types */
+       X(P_RLEAF,      "RLEAF")
+       X(P_OVERFLOW,   "OVERFLOW")
+       X(P_PRESERVE,   "PRESERVE");
+       (void)fprintf(stderr, ")\n");
+#undef X
+
+       (void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg);
+       if (h->flags & P_OVERFLOW) {
+               /* Overflow pages have no entries; just end the line. */
+               (void)fprintf(stderr, "\n");
+               return;
+       }
+
+       top = NEXTINDEX(h);
+       (void)fprintf(stderr, " lower %3d upper %3d nextind %d\n",
+           h->lower, h->upper, top);
+       for (cur = 0; cur < top; cur++) {
+               (void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]);
+               switch(h->flags & P_TYPE) {
+               case P_BINTERNAL:
+                       bi = GETBINTERNAL(h, cur);
+                       (void)fprintf(stderr,
+                           "size %2d pgno %2d", bi->ksize, bi->pgno);
+                       if (bi->flags & P_BIGKEY)
+                               (void)fprintf(stderr, " (indirect)");
+                       else if (bi->ksize)
+                               (void)fprintf(stderr, " {%.*s}",
+                                   (int)bi->ksize, bi->bytes);
+                       break;
+               case P_RINTERNAL:
+                       ri = GETRINTERNAL(h, cur);
+                       (void)fprintf(stderr, "entries %2d pgno %2d",
+                               ri->nrecs, ri->pgno);
+                       break;
+               case P_BLEAF:
+                       bl = GETBLEAF(h, cur);
+                       if (bl->flags & P_BIGKEY)
+                               (void)fprintf(stderr,
+                                   "big key page %lu size %u/",
+                                   *(pgno_t *)bl->bytes,
+                                   *(size_t *)(bl->bytes + sizeof(pgno_t)));
+                       else if (bl->ksize)
+                               (void)fprintf(stderr, "%.*s/",
+                                   (int)bl->ksize, bl->bytes);
+                       if (bl->flags & P_BIGDATA)
+                               (void)fprintf(stderr,
+                                   "big data page %lu size %u",
+                                   *(pgno_t *)(bl->bytes + bl->ksize),
+                                   *(size_t *)(bl->bytes + bl->ksize +
+                                   sizeof(pgno_t)));
+                       else if (bl->dsize)
+                               (void)fprintf(stderr, "%.*s",
+                                   (int)bl->dsize, bl->bytes + bl->ksize);
+                       break;
+               case P_RLEAF:
+                       rl = GETRLEAF(h, cur);
+                       if (rl->flags & P_BIGDATA)
+                               (void)fprintf(stderr,
+                                   "big data page %lu size %u",
+                                   *(pgno_t *)rl->bytes,
+                                   *(size_t *)(rl->bytes + sizeof(pgno_t)));
+                       else if (rl->dsize)
+                               (void)fprintf(stderr, "%.*s",
+                                   (int)rl->dsize, rl->bytes);
+                       break;
+               }
+               (void)fprintf(stderr, "\n");
+       }
+}
+#endif
+
+#ifdef STATISTICS
+/*
+ * BT_STAT -- Gather/print the tree statistics
+ *
+ * Walks every page from P_ROOT upward until mpool_get() fails, counting
+ * internal, leaf and overflow pages, their free space and the number of
+ * leaf entries.  It then descends from the root through the first entry
+ * of each internal page to count the levels, and prints the results,
+ * together with the global cache/split counters, to stderr.
+ *
+ * Parameters:
+ *     dbp:    pointer to the DB
+ */
+void
+__bt_stat(dbp)
+       DB *dbp;
+{
+       extern u_long bt_cache_hit, bt_cache_miss;
+       extern u_long bt_rootsplit, bt_split, bt_sortsplit;
+       extern u_long bt_pfxsaved;
+       BTREE *t;
+       PAGE *h;
+       pgno_t i, pcont, pinternal, pleaf;
+       u_long ifree, lfree, nkeys;
+       int levels;
+
+       t = dbp->internal;
+       pcont = pinternal = pleaf = 0;
+       nkeys = ifree = lfree = 0;
+       for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
+               switch(h->flags & P_TYPE) {
+               case P_BINTERNAL:
+               case P_RINTERNAL:
+                       ++pinternal;
+                       ifree += h->upper - h->lower;
+                       break;
+               case P_BLEAF:
+               case P_RLEAF:
+                       ++pleaf;
+                       lfree += h->upper - h->lower;
+                       nkeys += NEXTINDEX(h);
+                       break;
+               case P_OVERFLOW:
+                       ++pcont;
+                       break;
+               }
+               (void)mpool_put(t->bt_mp, h, 0);
+       }
+
+       /* Count the levels of the tree. */
+       for (i = P_ROOT, levels = 0 ;; ++levels) {
+               /*
+                * Stop the descent if a page cannot be fetched rather
+                * than dereference a null page pointer; the levels
+                * counted so far are still reported.
+                */
+               if ((h = mpool_get(t->bt_mp, i, 0)) == NULL)
+                       break;
+               if (h->flags & (P_BLEAF|P_RLEAF)) {
+                       if (levels == 0)
+                               levels = 1;
+                       (void)mpool_put(t->bt_mp, h, 0);
+                       break;
+               }
+               i = ISSET(t, BTF_RECNO) ?
+                   GETRINTERNAL(h, 0)->pgno :
+                   GETBINTERNAL(h, 0)->pgno;
+               (void)mpool_put(t->bt_mp, h, 0);
+       }
+
+       (void)fprintf(stderr, "%d level%s with %ld keys",
+           levels, levels == 1 ? "" : "s", nkeys);
+       if (ISSET(t, BTF_RECNO))
+               (void)fprintf(stderr, " (%ld header count)", t->bt_nrecs);
+       (void)fprintf(stderr,
+           "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n",
+           pinternal + pleaf + pcont, pleaf, pinternal, pcont);
+       (void)fprintf(stderr, "%ld cache hits, %ld cache misses\n",
+           bt_cache_hit, bt_cache_miss);
+       (void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n",
+           bt_split, bt_rootsplit, bt_sortsplit);
+       /* From here on pleaf/pinternal hold byte capacities, not counts. */
+       pleaf *= t->bt_psize - BTDATAOFF;
+       if (pleaf)
+               (void)fprintf(stderr,
+                   "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n",
+                   ((double)(pleaf - lfree) / pleaf) * 100,
+                   pleaf - lfree, lfree);
+       pinternal *= t->bt_psize - BTDATAOFF;
+       if (pinternal)
+               (void)fprintf(stderr,
+                   "%.0f%% internal fill (%ld bytes used, %ld bytes free)\n",
+                   ((double)(pinternal - ifree) / pinternal) * 100,
+                   pinternal - ifree, ifree);
+       if (bt_pfxsaved)
+               (void)fprintf(stderr, "prefix checking removed %lu bytes.\n",
+                   bt_pfxsaved);
+}
+#endif
index 3161031..f51cac6 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_delete.c        5.2 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_delete.c        5.3 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
-#include <db.h>
 #include <errno.h>
 #include <errno.h>
+#include <db.h>
+#include <stdio.h>
 #include <string.h>
 #include "btree.h"
 
 #include <string.h>
 #include "btree.h"
 
+static int bt_bdelete __P((BTREE *, const DBT *));
+
 /*
 /*
- *  _BT_CRSRDEL -- Delete the item pointed to by the cursor.
- *
- *     This routine deletes the item most recently returned by a scan
- *     through the tree.  Since it only makes sense to delete the current
- *     record once, we make sure that we don't try to delete twice without
- *     advancing the scan.
- *
- *     Parameters:
- *             t -- tree in which to do deletion
+ * __BT_DELETE -- Delete the item(s) referenced by a key.
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key to delete
+ *     flags:  R_CURSOR if deleting what the cursor references
  *
  *
- *     Side Effects:
- *             The call to _bt_delone marks the cursor, so we can tell that
- *             the current record has been deleted.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
  */
  */
-
 int
 int
-_bt_crsrdel(t)
-       BTREE_P t;
+__bt_delete(dbp, key, flags)
+       const DB *dbp;
+       const DBT *key;
+       u_int flags;
 {
 {
-       CURSOR *c;
-
-       c = &(t->bt_cursor);
+       BTREE *t;
+       int status;
 
 
-       /* a cursor must exist, and can't have deleted the current key yet */
-       if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) {
-               errno = EINVAL;
+       t = dbp->internal;
+       if (ISSET(t, BTF_RDONLY)) {     /* no deletes from a read-only tree */
+               errno = EPERM;
                 return (RET_ERROR);
         }
                 return (RET_ERROR);
         }
-
-       if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
-               return (RET_ERROR);
-
-       if (c->c_index >= NEXTINDEX(t->bt_curpage)) {
+       switch(flags) {
+       case 0:                         /* delete every record matching key */
+               status = bt_bdelete(t, key);
+               break;
+       case R_CURSOR:
+               /*
+                * If flags is R_CURSOR, delete the cursor; must already have
+                * started a scan and not have already deleted the record.  For
+                * the delete cursor bit to have been set requires that the
+                * scan be initialized, so no reason to check.
+                */
+               status = ISSET(t, BTF_DELCRSR) ?
+                   RET_SPECIAL : __bt_crsrdel(t, &t->bt_bcursor);
+               break;
+       default:                        /* any other flag value is rejected */
                 errno = EINVAL;
                 return (RET_ERROR);
         }
                 errno = EINVAL;
                 return (RET_ERROR);
         }
-
-       return (_bt_delone(t, c->c_index));
+       if (status == RET_SUCCESS)      /* mark the tree modified only on success */
+               SET(t, BTF_MODIFIED);
+       return (status);
 }
 
 /*
 }
 
 /*
- *  _BT_DELONE -- Delete a single entry from a btree.
- *
- *     This routine physically removes a btree entry from a leaf page.
- *     IDATUM items are *never* removed from internal nodes, regardless
- *     of whether the entries that originally caused them to be added
- *     are removed from the tree or not.  In addition, pages made empty
- *     by element deletion are not actually reclaimed.  They are,
- *     however, made available for reuse.
- *
- *     To delete an item from a page, we pack the remaining items at
- *     the end of the page, overwriting the deleted item's entry.  We
- *     move the line pointers backward on the page, overwriting the
- *     original item's line pointer.  This guarantees that the space in
- *     the middle of the page is free -- a property that our insertion
- *     strategy relies on.
- *
- *     This routine doesn't reclaim pages all of whose entries have
- *     been deleted.  These pages are available for reuse, however.
- *     If an item is deleted that was too big to fit on a page, then
- *     the blocks that it occupies are put on a free list for reuse.
+ * BT_BDELETE -- Delete all key/data pairs matching the specified key.
  *
  *
- *     Parameters:
- *             t -- btree from which to delete item
- *             index -- index of entry on current page to delete
+ * Parameters:
+ *     tree:   tree
+ *     key:    key to delete
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             Physically changes page layout, adjusts internal page
- *             state to reflect the deletion of the item, and updates
- *             the list of free pages for this tree.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
  */
  */
-
-int
-_bt_delone(t, index)
-       BTREE_P t;
-       index_t index;
+static int
+bt_bdelete(t, key)
+       BTREE *t;
+       const DBT *key;
 {
 {
-       char *src, *dest;
-       int nbytes, nmoved;
-       index_t off;
-       index_t top;
-       index_t i;
-       pgno_t chain;
-       BTHEADER *h;
-       CURSOR *c;
-       DATUM *d;
-
-       /* deletion may confuse an active scan.  fix it.  */
-       c = &(t->bt_cursor);
-       if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
-               if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR)
-                       return (RET_ERROR);
+       EPG *e, save;
+       PAGE *h;
+       pgno_t cpgno, pg;
+       index_t cindex;
+       int deleted, exact;
+
+       /* Find any matching record; __bt_search pins the page. */
+       if ((e = __bt_search(t, key, &exact)) == NULL)
+               return (RET_ERROR);
+       if (!exact) {
+               mpool_put(t->bt_mp, e->page, 0);
+               return (RET_SPECIAL);
+       }
 
 
-       h = t->bt_curpage;
-       off = h->h_linp[index];
-       d = (DATUM *) GETDATUM(h, index);
+       /*
+        * Delete forward, then delete backward, from the found key.  The
+        * ordering is so that the deletions don't mess up the page refs.
+        * The first loop deletes the found key, the second unpins the found
+        * page.
+        *
+        * If the key referenced by the cursor is found, don't delete it,
+        * just flag it for future deletion.  The cursor page number is
+        * P_INVALID unless the sequential scan is initialized, so no reason
+        * to check.  A special case is when the already deleted cursor record
+        * was the only record found.  If so, then the delete operation fails
+        * as no records were deleted.
+        *
+        * Cycle in place in the current page until the current record doesn't
+        * match the key or the page is empty.  If the latter, walk forward,
+        * skipping empty pages and repeating until a record doesn't match
+        * the key or the end of the tree is reached.
+        */
+       cpgno = t->bt_bcursor.pgno;
+       cindex = t->bt_bcursor.index;
+       save = *e;
+       for (h = e->page, deleted = 0;;) {
+               do {
+                       if (h->pgno == cpgno && e->index == cindex) {
+                               if (NOTSET(t, BTF_DELCRSR)) {
+                                       SET(t, BTF_DELCRSR);
+                                       deleted = 1;
+                               }
+                               ++e->index;
+                       } else {
+                               if (__bt_dleaf(t, h, e->index))
+                                       goto err;
+                               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+                               deleted = 1;
+                       }
+               } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
 
 
-       /* if this is a big item, reclaim the space it occupies */
-       if (d->d_flags & D_BIGKEY) {
-               bcopy(&(d->d_bytes[0]),
-                     (char *) &chain,
-                     sizeof(chain));
-               if (_bt_delindir(t, chain) == RET_ERROR)
-                       return (RET_ERROR);
-               h = t->bt_curpage;
-               d = (DATUM *) GETDATUM(h, index);
+               /*
+                * Quit if didn't find a match, no next page, or first key on
+                * the next page doesn't match.  Make a special effort not to
+                * unpin the page the original match was on, but also make sure
+                * it's unpinned if an error occurs.
+                */
+               if (e->index < NEXTINDEX(h))
+                       break;
+               for (;;) {
+                       if ((pg = h->nextpg) == P_INVALID)
+                               goto done1;
+                       if (h->pgno != save.page->pgno)
+                               mpool_put(t->bt_mp, h, 0);
+                       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) {
+                               /*
+                                * h is NULL here and must not be looked at.
+                                * The page of the original match is never
+                                * released by the forward walk, so release
+                                * it unconditionally before failing.
+                                */
+                               mpool_put(t->bt_mp, save.page, 0);
+                               return (RET_ERROR);
+                       }
+                       if (NEXTINDEX(h) != 0) {
+                               e->page = h;
+                               e->index = 0;
+                               break;
+                       }
+               }
+
+               if (__bt_cmp(t, key, e) != 0)
+                       break;
         }
         }
-       if (d->d_flags & D_BIGDATA) {
-               bcopy(&(d->d_bytes[d->d_ksize]),
-                     (char *) &chain,
-                     sizeof(chain));
-               if (_bt_delindir(t, chain) == RET_ERROR)
+
+       /*
+        * Reach here with the last page that was looked at pinned, and it may
+        * or may not be the same as the page with the original match.  If it's
+        * not, release it.
+        */
+done1: if (h->pgno != save.page->pgno)
+               mpool_put(t->bt_mp, h, 0);
+
+       /*
+        * Walk backwards from the record previous to the record returned by
+        * __bt_search, skipping empty pages, until a current record doesn't
+        * match the key or reach the beginning of the tree.
+        *
+        * NOTE(review): the inner loop stops when e->index reaches 0, so the
+        * record at index 0 of each page is never compared -- confirm that
+        * is intended.
+        */
+       *e = save;
+       for (;;) {
+               if (e->index)
+                       --e->index;
+               for (h = e->page; e->index; --e->index) {
+                       if (__bt_cmp(t, key, e) != 0)
+                               goto done2;
+                       if (h->pgno == cpgno && e->index == cindex) {
+                               if (NOTSET(t, BTF_DELCRSR)) {
+                                       SET(t, BTF_DELCRSR);
+                                       deleted = 1;
+                               }
+                       } else {
+                               if (__bt_dleaf(t, h, e->index) == RET_ERROR)
+                                       goto err;
+                               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+                               deleted = 1;
+                       }
+               }
+
+               if ((pg = h->prevpg) == P_INVALID)
+                       goto done2;
+               mpool_put(t->bt_mp, h, 0);
+               if ((e->page = mpool_get(t->bt_mp, pg, 0)) == NULL)
                         return (RET_ERROR);
                         return (RET_ERROR);
-               h = t->bt_curpage;
-               d = (DATUM *) GETDATUM(h, index);
+               e->index = NEXTINDEX(e->page);  /* h was released above; use the new page */
         }
 
         }
 
-       /* move the data down on the page */
-       nbytes = d->d_ksize + d->d_dsize
-                + (sizeof(DATUM) - sizeof(char));
-       nbytes = LONGALIGN(nbytes);
-       src = ((char *) h) + h->h_upper;
-       dest = src + nbytes;
-       nmoved = (int) (((char *) d) - src);
-       (void) bcopy(src, dest, nmoved);
-
-       /* next move the line pointers up */
-       src = (char *) &(h->h_linp[index + 1]);
-       dest = (char *) &(h->h_linp[index]);
-       nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src);
-       (void) bcopy(src, dest, nmoved);
-
-       /* remember that we freed up some space */
-       h->h_upper += nbytes;
-       h->h_lower -= sizeof(index_t);
-
-       /* adjust offsets in line pointers affected by moving the data */
-       top = NEXTINDEX(h);
-       for (i = 0; i < top; i++) {
-               if (h->h_linp[i] < off)
-                       h->h_linp[i] += nbytes;
-       }
+       /*
+        * Reach here with the last page that was looked at pinned.  Release
+        * it.
+        */
+done2: mpool_put(t->bt_mp, h, 0);
+       return (deleted ? RET_SUCCESS : RET_SPECIAL);
+
+err:   mpool_put(t->bt_mp, h, 0);
+       return (RET_ERROR);
+}
+
+/*
+ * __BT_DLEAF -- Delete a single record from a leaf page.
+ *
+ * Parameters:
+ *     t:      tree
+ *     h:      leaf page containing the record
+ *     index:  index on current page to delete
+ *
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.
+ */
+int
+__bt_dleaf(t, h, index)
+       BTREE *t;
+       PAGE *h;
+       int index;
+{
+       register BLEAF *bl;
+       register index_t *ip, offset;
+       register size_t nbytes;
+       register int cnt;
+       char *from;
+       void *to;
+
+       /*
+        * Delete a record from a btree leaf page.  Internal records are never
+        * deleted from internal pages, even after the leaf records that caused
+        * them to be added have been deleted.  Pages made empty by deletion are
+        * not reclaimed.  They are, however, made available for reuse.
+        *
+        * Pack the remaining entries at the end of the page, shift the indices
+        * down, overwriting the deleted record and its index.  If the record
+        * uses overflow pages, make them available for reuse.
+        */
+       to = bl = GETBLEAF(h, index);
+       if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
+               return (RET_ERROR);
+       if (bl->flags & P_BIGDATA &&
+           __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
+               return (RET_ERROR);
+       nbytes = NBLEAF(bl);
 
 
-       /* it's gone */
-       h->h_flags |= F_DIRTY;
+       /*
+        * Compress the key/data pairs.  Compress and adjust the [BR]LEAF
+        * offsets.  Reset the headers.
+        */
+       from = (char *)h + h->upper;
+       bcopy(from, from + nbytes, (char *)to - from);
+       h->upper += nbytes;
 
 
+       offset = h->linp[index];
+       for (cnt = &h->linp[index] - (ip = &h->linp[0]); cnt--; ++ip)
+               if (ip[0] < offset)
+                       ip[0] += nbytes;
+       for (cnt = &h->linp[NEXTINDEX(h)] - ip; --cnt; ++ip)
+               ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
+       h->lower -= sizeof(index_t);
        return (RET_SUCCESS);
 }
        return (RET_SUCCESS);
 }
index 3c68482..83eb3c7 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_get.c   5.2 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_get.c   5.3 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #endif /* LIBC_SCCS and not lint */
 
-#include <sys/param.h>
-#include <db.h>
+#include <sys/types.h>
 #include <errno.h>
 #include <errno.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
+#include <db.h>
+#include <stdio.h>
+#include <stddef.h>
 #include "btree.h"
 
 /*
 #include "btree.h"
 
 /*
- *  BT_GETPAGE -- Make pgno the current page of the btree.
- *
- *     This routine is just a wrapper that decides whether to call the
- *     memory or disk-based routine to do the work.
+ * __BT_GET -- Get a record from the btree.
  *
  *
- *     Parameters:
- *             t -- btree in which to get page
- *             pgno -- page number to get
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key to find
+ *     data:   data to return
+ *     flags:  currently unused; must be zero
  *
  *
- *     Returns:
- *             RET_SUCCESS or RET_ERROR.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS, or RET_SPECIAL if the key is not found.
  */
  */
-
 int
 int
-_bt_getpage(t, pgno)
-       BTREE_P t;
-       pgno_t pgno;
+__bt_get(dbp, key, data, flags)
+       const DB *dbp;
+       DBT *key, *data;
+       u_int flags;
 {
 {
-#ifdef DEBUG
-       if (pgno == P_NONE)
-               _punt();
-#endif /* DEBUG */
+       BTREE *t;
+       EPG *e;
+       int exact, status;
 
 
-       /* see if we can get away without doing any work */
-       if (t->bt_curpage != (BTHEADER *) NULL) {
-               if (t->bt_curpage->h_pgno == pgno)
-                       return (RET_SUCCESS);
+       if (flags) {
+               errno = EINVAL;
+               return (RET_ERROR);
+       }
+       t = dbp->internal;
+       if ((e = __bt_search(t, key, &exact)) == NULL)
+               return (RET_ERROR);
+       if (!exact) {
+               mpool_put(t->bt_mp, e->page, 0);
+               return (RET_SPECIAL);
        }
 
        }
 
-       if (t->bt_fname == (char *) NULL)
-               return (_bt_getmpage(t, pgno));
-       else
-               return (_bt_getdpage(t, pgno));
-}
-
-/*
- *  _BT_GETMPAGE -- Make pgno the current page of the btree.
- *
- *     This routine gets pages for in-memory btrees.
- *
- *     Parameters:
- *             t -- btree in which to get page
- *             pgno -- page number to get
- *
- *     Returns:
- *             RET_SUCCESS or RET_ERROR.
- */
-
-int
-_bt_getmpage(t, pgno)
-       register BTREE_P t;
-       pgno_t pgno;
-{
-       int htindex;
-       BTHEADER *h;
-       HTBUCKET *b;
-
-       if (t->bt_curpage == (BTHEADER *) NULL) {
-               if (pgno != P_ROOT) {
-                       errno = EBADF;
-                       return (RET_ERROR);
-               }
-
-               t->bt_npages++;
-               h = (BTHEADER *) malloc((unsigned) t->bt_psize);
-               if (h == (BTHEADER *) NULL)
-                       return (RET_ERROR);
-
-               h->h_pgno = P_ROOT;
-               h->h_flags = F_LEAF;
-               h->h_lower = (index_t)
-                               (((char *) &(h->h_linp[0])) - ((char *) h));
-               h->h_upper = t->bt_psize;
-               h->h_prevpg = h->h_nextpg = P_NONE;
-
-               t->bt_curpage = h;
-
-               /* get the root page into the hash table */
-               if (_bt_write(t, h, RELEASE) == RET_ERROR)
+       /*
+        * A special case is if we found the record but it's flagged for
+        * deletion.  In this case, we want to find another record with the
+        * same key, if it exists.  Rather than look around the tree we call
+        * __bt_first and have it redo the search as __bt_first will not
+        * return keys marked for deletion.  Slow, but should never happen.
+        */
+       if (ISSET(t, BTF_DELCRSR) && e->page->pgno == t->bt_bcursor.pgno &&
+           e->index == t->bt_bcursor.index) {
+               mpool_put(t->bt_mp, e->page, 0);
+               if ((e = __bt_first(t, key, &exact)) == NULL)
                        return (RET_ERROR);
                        return (RET_ERROR);
+               if (!exact)
+                       return (RET_SPECIAL);
        }
 
        }
 
-       htindex = HASHKEY(pgno);
-
-       for (b = t->bt_s.bt_ht[htindex];
-            b != (HTBUCKET *) NULL;
-            b = b->ht_next) {
-               if (b->ht_pgno == pgno) {
-                       t->bt_curpage = b->ht_page;
-                       return (RET_SUCCESS);
-               }
-       }
-       return (RET_ERROR);
+       status = __bt_ret(t, e, data, key);
+       mpool_put(t->bt_mp, e->page, 0);
+       return (status);
 }
 
 /*
 }
 
 /*
- *  _BT_GETDPAGE -- Make pgno the current page of the btree.
+ * __BT_FIRST -- Find the first record in the tree matching the key.
  *
  *
- *     This routine gets pages for disk btrees.
+ * Parameters:
+ *     t:      the tree
+ *     key:    the key
  *
  *
- *     Because disk btree pages must be readable across machine architectures,
- *     the btree code writes integers out in network format.  This routine
- *     converts them back to host format before returning the page.
- *
- *     Parameters:
- *             t -- btree in which to get page
- *             pgno -- page number to get
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
+ * Returns:
+ *     The first matching record.
  */
  */
-
-int
-_bt_getdpage(t, pgno)
-       register BTREE_P t;
-       pgno_t pgno;
+EPG *
+__bt_first(t, key, exactp)
+       BTREE *t;
+       DBT *key;
+       int *exactp;
 {
 {
-       BTHEADER *h;
-       char *cache;
-       long pos;
-       int n, nbytes;
-
-       /* if we have an lru cache, let the cache code do the work */
-       if (ISCACHE(t)) {
-               cache = t->bt_s.bt_d.d_cache;
-
-               /* release the old page */
-               if (t->bt_curpage != (BTHEADER *) NULL) {
-                       pgno_t opgno = t->bt_curpage->h_pgno;
-                       t->bt_curpage->h_flags &= ~F_DIRTY;
-
-                       if (lruwrite(cache, (int) opgno) < 0)
-                               return (RET_ERROR);
-
-                       if (lrurelease(cache, (int) opgno) < 0)
-                               return (RET_ERROR);
-               }
-
-               if (pgno > t->bt_npages) {
-                       if ((h = (BTHEADER *) lrugetnew(cache, (int)pgno, &nbytes))
-                           == (BTHEADER *) NULL)
-                               return (RET_ERROR);
-                       t->bt_npages = pgno;
-               } else {
-                       if ((h = (BTHEADER *) lruget(cache, (int)pgno, &nbytes))
-                           == (BTHEADER *) NULL)
-                               return (RET_ERROR);
-               }
-
-               /* init this page, if necessary */
-               if (nbytes == 0) {
-                       h->h_pgno = pgno;
-                       h->h_flags = F_LEAF;
-                       h->h_lower = (index_t)
-                               (((char *) &(h->h_linp[0])) - ((char *) h));
-                       h->h_upper = t->bt_psize;
-                       h->h_prevpg = h->h_nextpg = P_NONE;
-               }
+       register PAGE *h;
+       register EPG *e;
+       EPG save;
+       pgno_t cpgno, pg;
+       index_t cindex;
+       int found;
 
 
-               t->bt_curpage = h;
-
-               return (RET_SUCCESS);
+       /*
+        * Find any matching record; __bt_search pins the page.  Only exact
+        * matches are interesting.
+        */
+       if ((e = __bt_search(t, key, exactp)) == NULL)
+               return (NULL);
+       if (!*exactp) {
+               mpool_put(t->bt_mp, e->page, 0);
+               return (e);
        }
 
        }
 
-       /* sync the current page, if necessary */
-       if (t->bt_curpage != (BTHEADER *) NULL) {
-               if (t->bt_curpage->h_flags & F_DIRTY)
-                       if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR)
-                               return (RET_ERROR);
-       } else {
-               if (t->bt_npages == 0)
-                       t->bt_npages = 1;
+       if (ISSET(t, BTF_DELCRSR)) {
+               cpgno = t->bt_bcursor.pgno;
+               cindex = t->bt_bcursor.index;
+       } else
+               cpgno = P_INVALID;
 
 
-               /* if no current page, get space for one */
-               if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize))
-                   == (BTHEADER *) NULL) {
-                       return (RET_ERROR);
+       /*
+        * Walk backwards, skipping empty pages, as long as the entry matches
+        * and there are keys left in the tree.  Save a copy of each match in
+        * case we go too far.  A special case is that we don't return a match
+        * on records that the cursor references that have already been flagged
+        * for deletion.
+        */
+       save = *e;
+       h = e->page;
+       found = 0;
+       do {
+               if (cpgno != h->pgno || cindex != e->index) {
+                       if (save.page->pgno != e->page->pgno) {
+                               mpool_put(t->bt_mp, save.page, 0);
+                               save = *e;
+                       } else
+                               save.index = e->index;
+                       found = 1;
                }
                }
-       }
-
-       n = t->bt_psize;
-       pos = (long) (pgno * n);
-
-       /* seek to correct location in file */
-       if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) {
-               return (RET_ERROR);
-       }
-
-       /* read the page */
-       if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) {
-
                /*
                /*
-                *  If we didn't get a full page, we must have gotten no
-                *  data at all -- in which case we're trying to read a
-                *  root page that doesn't exist yet.  This is the only
-                *  case in which this is okay.  If this happens, construct
-                *  an empty root page by hand.
+                * Make a special effort not to unpin the page the last (or
+                * original) match was on, but also make sure it's unpinned
+                * if an error occurs.
                 */
                 */
-               if (nbytes != 0 || pgno != P_ROOT) {
-                       errno = EBADF;
-                       return (RET_ERROR);
-               }
-
-               h = (BTHEADER *) t->bt_curpage;
-               h->h_pgno = pgno;
-               h->h_flags = F_LEAF;
-               h->h_lower = (index_t)
-                               (((char *) &(h->h_linp[0])) - ((char *) h));
-               h->h_upper = t->bt_psize;
-               h->h_prevpg = h->h_nextpg = P_NONE;
-       } else
-               (void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder);
-
-       return (RET_SUCCESS);
-}
-
-/*
- *  _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
- *                        the host-independent format stored on disk.
- *
- *     Parameters:
- *             h -- page to convert
- *             _lorder -- byte order for pages (stored as a char * in the
- *                        cache, and passed around as a magic cookie).
- *
- *     Returns:
- *             RET_SUCCESS (lru code requires a return value).
- *
- *     Side Effects:
- *             Layout of tree metadata on the page is changed in place.
- *
- *     Warnings:
- *             Everywhere else in the code, the types pgno_t and index_t
- *             are opaque.  These two routines know what they really
- *             are.
- */
-
-int
-_bt_pgout(h, _lorder)
-       BTHEADER *h;
-       char *_lorder;
-{
-       int i;
-       int top;
-       int lorder;
-       DATUM *d;
-       IDATUM *id;
-       size_t chain;
-
-       lorder = (int) _lorder;
-       if (lorder == BYTE_ORDER)
-               return (RET_SUCCESS);
-
-       if (h->h_flags & F_LEAF) {
-               if (h->h_flags & F_CONT) {
-                       if (h->h_prevpg == P_NONE) {
-                               size_t longsz;
-
-                               (void) bcopy((char *) &(h->h_linp[0]),
-                                             (char *) &longsz,
-                                             sizeof(longsz));
-                               BLSWAP(longsz);
-                               (void) bcopy((char *) &longsz,
-                                             (char *) &(h->h_linp[0]),
-                                             sizeof(longsz));
-                       }
-               } else {
-                       top = NEXTINDEX(h);
-                       for (i = 0; i < top; i++) {
-                               d = (DATUM *) GETDATUM(h, i);
-                               if (d->d_flags & D_BIGKEY) {
-                                       (void) bcopy((char *) &(d->d_bytes[0]),
-                                                     (char *) &chain,
-                                                     sizeof(chain));
-                                       BLSWAP(chain);
-                                       (void) bcopy((char *) &chain,
-                                                     (char *) &(d->d_bytes[0]),
-                                                     sizeof(chain));
-                               }
-                               if (d->d_flags & D_BIGDATA) {
-                                       (void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
-                                                     (char *) &chain,
-                                                     sizeof(chain));
-                                       BLSWAP(chain);
-                                       (void) bcopy((char *) &chain,
-                                                     (char *) &(d->d_bytes[d->d_ksize]),
-                                                     sizeof(chain));
+               if (e->index == 0)
+                       do {
+                               if (h->prevpg == P_INVALID)
+                                       goto done1;
+                               if (h->pgno != save.page->pgno)
+                                       mpool_put(t->bt_mp, h, 0);
+                               if ((h = mpool_get(t->bt_mp,
+                                   h->prevpg, 0)) == NULL) {
+                                       if (h->pgno == save.page->pgno)
+                                               mpool_put(t->bt_mp,
+                                                   save.page, 0);
+                                       return (NULL);
                                }
                                }
-                               BLSWAP(d->d_dsize);
-                               BLSWAP(d->d_ksize);
-                               BLSWAP(d->d_flags);
-                               BLSWAP(h->h_linp[i]);
-                       }
-               }
-       } else {
-               top = NEXTINDEX(h);
-               for (i = 0; i < top; i++) {
-                       id = (IDATUM *) GETDATUM(h, i);
-                       BLSWAP(id->i_size);
-                       BLSWAP(id->i_pgno);
-                       BLSWAP(id->i_flags);
-                       if (id->i_flags & D_BIGKEY) {
-                               (void) bcopy((char *) &(id->i_bytes[0]),
-                                             (char *) &chain,
-                                             sizeof(chain));
-                               BLSWAP(chain);
-                               (void) bcopy((char *) &chain,
-                                             (char *) &(id->i_bytes[0]),
-                                             sizeof(chain));
-                       }
-                       BLSWAP(h->h_linp[i]);
-               }
-       }
-       BLSWAP(h->h_flags);
-       BLSWAP(h->h_pgno);
-       BLSWAP(h->h_prevpg);
-       BLSWAP(h->h_nextpg);
-       BLSWAP(h->h_lower);
-       BLSWAP(h->h_upper);
-
-       return (RET_SUCCESS);
-}
-
-int
-_bt_pgin(h, _lorder)
-       BTHEADER *h;
-       char *_lorder;
-{
-       int i;
-       int top;
-       int lorder;
-       DATUM *d;
-       IDATUM *id;
-       size_t chain;
+                       } while ((e->index = NEXTINDEX(h)) == 0);
+               --e->index;
+       } while (__bt_cmp(t, key, e) == 0);
 
        /*
 
        /*
-        *  If btree pages are to be stored in the host byte order, don't
-        *  bother swapping.
+        * Reach here with the last page that was looked at pinned, which may
+        * or may not be the same as the last (or original) match page.  If
+        * it's not useful, release it.
         */
         */
-       lorder = (int) _lorder;
-       if (lorder == BYTE_ORDER)
-               return (RET_SUCCESS);
+done1: if (h->pgno != save.page->pgno)
+               mpool_put(t->bt_mp, h, 0);
 
 
-       BLSWAP(h->h_upper);
-       BLSWAP(h->h_lower);
-       BLSWAP(h->h_nextpg);
-       BLSWAP(h->h_prevpg);
-       BLSWAP(h->h_pgno);
-       BLSWAP(h->h_flags);
-
-       if (h->h_flags & F_LEAF) {
-               if (h->h_flags & F_CONT) {
-                       if (h->h_prevpg == P_NONE) {
-                               size_t longsz;
-
-                               (void) bcopy((char *) &(h->h_linp[0]),
-                                             (char *) &longsz,
-                                             sizeof(longsz));
-                               BLSWAP(longsz);
-                               (void) bcopy((char *) &longsz,
-                                             (char *) &(h->h_linp[0]),
-                                             sizeof(longsz));
-                       }
-               } else {
-                       top = NEXTINDEX(h);
-                       for (i = 0; i < top; i++) {
-                               BLSWAP(h->h_linp[i]);
-                               d = (DATUM *) GETDATUM(h, i);
-                               BLSWAP(d->d_dsize);
-                               BLSWAP(d->d_ksize);
-                               BLSWAP(d->d_flags);
-                               if (d->d_flags & D_BIGKEY) {
-                                       (void) bcopy((char *) &(d->d_bytes[0]),
-                                                     (char *) &chain,
-                                                     sizeof(chain));
-                                       BLSWAP(chain);
-                                       (void) bcopy((char *) &chain,
-                                                     (char *) &(d->d_bytes[0]),
-                                                     sizeof(chain));
-                               }
-                               if (d->d_flags & D_BIGDATA) {
-                                       (void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
-                                                     (char *) &chain,
-                                                     sizeof(chain));
-                                       BLSWAP(chain);
-                                       (void) bcopy((char *) &chain,
-                                                     (char *) &(d->d_bytes[d->d_ksize]),
-                                                     sizeof(chain));
+       /*
+        * If we still haven't found a record, the only possibility left is
+        * the next one.  Move forward one slot, skipping empty pages, and
+        * check it.
+        */
+       if (!found) {
+               h = save.page;
+               if (++save.index == NEXTINDEX(h)) {
+                       do {
+                               pg = h->nextpg;
+                               mpool_put(t->bt_mp, h, 0);
+                               if (pg == P_INVALID) {
+                                       *exactp = 0;
+                                       return (e);
                                }
                                }
-                       }
-               }
-       } else {
-               top = NEXTINDEX(h);
-               for (i = 0; i < top; i++) {
-                       BLSWAP(h->h_linp[i]);
-                       id = (IDATUM *) GETDATUM(h, i);
-                       BLSWAP(id->i_size);
-                       BLSWAP(id->i_pgno);
-                       BLSWAP(id->i_flags);
-                       if (id->i_flags & D_BIGKEY) {
-                               (void) bcopy((char *) &(id->i_bytes[0]),
-                                             (char *) &chain,
-                                             sizeof(chain));
-                               BLSWAP(chain);
-                               (void) bcopy((char *) &chain,
-                                             (char *) &(id->i_bytes[0]),
-                                             sizeof(chain));
-                       }
-               }
-       }
-       return (RET_SUCCESS);
-}
-
-/*
- *  _BT_ALLOCPG -- allocate a new page in the btree.
- *
- *     This is called when we split a page, to get space to do the split.
- *     For disk btrees, these pages are released when the split is done.
- *     For memory btrees, they are not.
- *
- *     Parameters:
- *             t -- tree in which to do split
- *
- *     Returns:
- *             Pointer to the newly-allocated page
- */
-
-BTHEADER *
-_bt_allocpg(t)
-       BTREE_P t;
-{
-       BTHEADER *h = t->bt_curpage;
-       BTHEADER *nh;
-       int nbytes;
-
-       /* if we have a cache, let the cache code do the work */
-       if (ISDISK(t) && ISCACHE(t)) {
-               nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache,
-                                           (int) (t->bt_npages + 1),
-                                           &nbytes);
-       } else {
-               nh = (BTHEADER *) malloc((unsigned) t->bt_psize);
-       }
-
-       if (nh != (BTHEADER *) NULL) {
-               nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE;
-               nh->h_flags = h->h_flags;
-               nh->h_lower = (index_t)
-                               (((char *) &(nh->h_linp[0])) - ((char *) nh));
-               nh->h_upper = t->bt_psize;
-       }
-
-       return (nh);
-}
-
-/*
- *  _BT_WRITE -- Write a specific page to a btree file.
- *
- *     Parameters:
- *             t -- btree to get the page
- *             h -- page to write
- *             relflag -- (int) this page may/may not be released
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             Writes a metadata page if none has been written yet.
- */
-
-int
-_bt_write(t, h, relflag)
-       BTREE_P t;
-       BTHEADER *h;
-       int relflag;
-{
-       long pos;
-       int htindex;
-       HTBUCKET *b;
-       char *cache;
-       pgno_t pgno;
-
-       h->h_flags &= ~F_DIRTY;
-       if (ISDISK(t)) {
-
-               /* if we haven't done so yet, write the metadata */
-               if (!(t->bt_flags & BTF_METAOK)) {
-                       if (_bt_wrtmeta(t) == RET_ERROR)
-                               return (RET_ERROR);
-               }
-
-               pgno = h->h_pgno;
-
-
-               /* if we have a cache, let the cache code do the work */
-               if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) {
-                       if (lruwrite(cache, (int) pgno) < 0)
-                               return (RET_ERROR);
-                       if (relflag && lrurelease(cache, (int) pgno) < 0)
-                               return (RET_ERROR);
-                               
-               } else {
-                       (void) _bt_pgout(h, (char *) t->bt_lorder);
-                       /* now write the current page */
-                       pos = (long) (pgno * t->bt_psize);
-                       if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos)
-                               return (RET_ERROR);
-                       if (write(t->bt_s.bt_d.d_fd, (char *) h, (int) t->bt_psize)
-                           < t->bt_psize)
-                               return (RET_ERROR);
-                       if (!relflag)
-                               (void) _bt_pgin(h, (char *) t->bt_lorder);
+                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                                       return (NULL);
+                       } while ((save.index = NEXTINDEX(h)) == 0);
+                       save.page = h;
                }
                }
-       } else {
-               /* in-memory btree */
-               htindex = HASHKEY(h->h_pgno);
-
-               /* see if we need to overwrite existing entry */
-               for (b = t->bt_s.bt_ht[htindex];
-                    b != (HTBUCKET *) NULL;
-                    b = b->ht_next) {
-                       if (b->ht_pgno == h->h_pgno) {
-                               b->ht_page = h;
-                               return (RET_SUCCESS);
-                       }
+               if (__bt_cmp(t, key, &save) != 0) {
+                       *exactp = 0;
+                       return (e);
                }
                }
-
-               /* new entry, write it */
-               b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET));
-               if (b == (HTBUCKET *) NULL)
-                       return (RET_ERROR);
-
-               b->ht_pgno = h->h_pgno;
-               b->ht_page = h;
-               b->ht_next = t->bt_s.bt_ht[htindex];
-               t->bt_s.bt_ht[htindex] = b;
-       }
-       return (RET_SUCCESS);
-}
-
-/*
- *  _BT_RELEASE -- Release a locked-down cache page
- *
- *     During page splits, we want to force pages out to the cache
- *     before we actually release them, in some cases.  This routine
- *     releases such a page when it is no longer needed.
- *
- *     Parameters:
- *             t -- btree in which to release page
- *             h -- page to release
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             None.
- */
-
-int
-_bt_release(t, h)
-       BTREE_P t;
-       BTHEADER *h;
-{
-       if (ISDISK(t) && ISCACHE(t)) {
-               if (lrurelease(t->bt_s.bt_d.d_cache, (int) (h->h_pgno)) < 0)
-                       return (RET_ERROR);
        }
        }
-       return (RET_SUCCESS);
-}
-
-/*
- *  _BT_WRTMETA -- Write metadata to the btree.
- *
- *     Parameters:
- *             t -- tree to which to write
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- */
-
-int
-_bt_wrtmeta(t)
-       BTREE_P t;
-{
-       BTMETA m;
-
-       if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l)
-               return (RET_ERROR);
-
-       /* lorder has to be in host-independent format */
-       m.m_lorder = (u_long) htonl((long) t->bt_lorder);
-
-       m.m_magic = BTREEMAGIC;
-       m.m_version = BTREEVERSION;
-       m.m_psize = t->bt_psize;
-       m.m_free = t->bt_free;
-       m.m_flags = t->bt_flags & BTF_NODUPS;
-
-       if (t->bt_lorder != BYTE_ORDER) {
-               BLSWAP(m.m_magic);
-               BLSWAP(m.m_version);
-               BLSWAP(m.m_psize);
-               BLSWAP(m.m_free);
-               BLSWAP(m.m_flags);
-       }
-
-       if (write(t->bt_s.bt_d.d_fd, (char *) &m, sizeof(m))
-           != sizeof(m)) {
-               return (RET_ERROR);
-       }
-
-       t->bt_flags |= BTF_METAOK;
-
-       return (RET_SUCCESS);
+       *e = save;
+       *exactp = 1;
+       return (e);
 }
 }
index 642153e..cecb623 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_open.c  5.9 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_open.c  5.10 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 /*
 #endif /* LIBC_SCCS and not lint */
 
 /*
- *  btree.c -- implementation of btree access method for 4.4BSD.
+ * Implementation of btree access method for 4.4BSD.
  *
  *
- *     The design here is based on that of the btree access method used
- *     in the Postgres database system at UC Berkeley.  The implementation
- *     is wholly independent of the Postgres code.
- *
- *     This implementation supports btrees on disk (supply a filename) or
- *     in memory (don't).  Public interfaces defined here are:
- *
- *             btree_open()    -- wrapper; returns a filled DB struct for
- *                                a btree.
- *
- *             bt_open()       -- open a new or existing btree.
- *             bt_get()        -- fetch data from a tree by key.
- *             bt_seq()        -- do a sequential scan on a tree.
- *             bt_put()        -- add data to a tree by key.
- *             bt_delete()     -- remove data from a tree by key.
- *             bt_close()      -- close a btree.
- *             bt_sync()       -- force btree pages to disk (disk trees only).
+ * The design here was originally based on that of the btree access method
+ * used in the Postgres database system at UC Berkeley.  This implementation
+ * is wholly independent of the Postgres code.
  */
 
 #include <sys/param.h>
 #include <sys/stat.h>
  */
 
 #include <sys/param.h>
 #include <sys/stat.h>
-#include <signal.h>
-#include <errno.h>
 #include <fcntl.h>
 #include <fcntl.h>
+#include <errno.h>
+#include <limits.h>
+#define        __DBINTERFACE_PRIVATE
 #include <db.h>
 #include <db.h>
-#include <stdlib.h>
-#include <string.h>
+#include <stdio.h>
 #include <unistd.h>
 #include <unistd.h>
+#include <stdlib.h>
 #include "btree.h"
 
 #include "btree.h"
 
-BTREEINFO _DefaultBTInfo = {
-       0,      /* flags */
-       0,      /* cachesize */
-       0,      /* psize */
-       strcmp, /* compare */
-       0
-};
+static int nroot __P((BTREE *));
+static int tmp __P((void));
 
 /*
 
 /*
- *  BTREE_OPEN -- Wrapper routine to open a btree.
+ * __BT_OPEN -- Open a btree.
  *
  *
- *     Creates and fills a DB struct, and calls the routine that actually
- *     opens the btree.
+ * Allocates and fills in the DB and BTREE structures, opens the backing
+ * file (a temporary file for in-memory trees) and sets up the buffer pool.
  *
  *
- *     Parameters:
- *             f:  filename to open
- *             flags:  flag bits passed to open
- *             mode:  permission bits, used if O_CREAT specified
- *             b:  BTREEINFO pointer
+ * Parameters:
+ *     fname:  filename (NULL for in-memory trees)
+ *     flags:  open flag bits
+ *     mode:   open permission bits
+ *     b:      BTREEINFO pointer
  *
  *
- *     Returns:
- *             Filled-in DBT on success; NULL on failure, with errno
- *             set as appropriate.
+ * Returns:
+ *     NULL on failure, pointer to DB on success.
  *
  *
- *     Side Effects:
- *             Allocates memory for the DB struct.
  */
  */
-
 DB *
 DB *
-btree_open(f, flags, mode, b)
-       const char *f;
-       int flags;
-       int mode;
-       const BTREEINFO *b;
+__bt_open(fname, flags, mode, openinfo)
+       const char *fname;
+       int flags, mode;
+       const BTREEINFO *openinfo;
 {
 {
-       DB *db;
-       BTREE t;
-
-       if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
-               return ((DB *) NULL);
-
-       if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
-               (void) free ((char *) db);
-               return ((DB *) NULL);
-       }
-
-       db->internal = (char *) t;
-       db->close = bt_close;
-       db->del = bt_delete;
-       db->get = bt_get;
-       db->put = bt_put;
-       db->seq = bt_seq;
-       db->sync = bt_sync;
-       db->type = DB_BTREE;
-
-       return (db);
-}
-
-/*
- *  BT_OPEN -- Open a btree.
- *
- *     This routine creates the correct kind (disk or in-memory) of
- *     btree and initializes it as required.
- *
- *     Parameters:
- *             f -- name of btree (NULL for in-memory btrees)
- *             flags -- flags passed to open()
- *             mode -- mode passed to open ()
- *             b -- BTREEINFO structure, describing btree
- *
- *     Returns:
- *             (Opaque) pointer to the btree.  On failure, returns NULL
- *             with errno set as appropriate.
- *
- *     Side Effects:
- *             Allocates memory, opens files.
- */
-
-BTREE
-bt_open(f, flags, mode, b)
-       char *f;
-       int flags;
-       int mode;
-       BTREEINFO *b;
-{
-       BTREE_P t;
-       HTABLE ht;
-       int nbytes;
-       int fd;
-       CURSOR *c;
+       BTREE *t = NULL;        /* einval can reach err: before t is allocated. */
+       DB *dbp;
        BTMETA m;
        BTMETA m;
-       struct stat buf;
-
-       /* use the default info if none was provided */
-       if (b == (BTREEINFO *) NULL)
-               b = &_DefaultBTInfo;
-
-       if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
-               return ((BTREE) NULL);
-
-       if (b->compare)
-               t->bt_compare = b->compare;
-       else
-               t->bt_compare = strcmp;
-
-       t->bt_fname = f;
-       t->bt_curpage = (BTHEADER *) NULL;
-       t->bt_free = P_NONE;
-       c = &(t->bt_cursor);
-       c->c_pgno = P_NONE;
-       c->c_index = 0;
-       c->c_flags = (u_char) NULL;
-       c->c_key = (char *) NULL;
-       t->bt_stack = (BTSTACK *) NULL;
-       t->bt_flags = 0;
+       BTREEINFO b;
+       pgno_t ncache;
+       struct stat sb;
+       int nr;
 
        /*
 
        /*
-        *  If no file name was supplied, this is an in-memory btree.
-        *  Otherwise, it's a disk-based btree.
+        * Intention is to make sure all of the user's selections are okay
+        * here and then use them without checking.  Can't be complete, since
+        * we don't know the right page size, lorder or flags until the backing
+        * file is opened.  Also, the file's page size can cause the cachesize
+        * to change.
         */
         */
-       if (f == (char *) NULL) {
-               /* in memory */
-               if ((t->bt_psize = b->psize) < MINPSIZE) {
-                       if (t->bt_psize != 0) {
-                               (void) free ((char *) t);
-                               errno = EINVAL;
-                               return ((BTREE) NULL);
-                       }
-                       t->bt_psize = getpagesize();
-               }
-
-               nbytes = HTSIZE * sizeof(HTBUCKET *);
-               if ((ht = (HTABLE) malloc((unsigned) nbytes))
-                   == (HTABLE) NULL) {
-                       (void) free((char *) t);
-                       return ((BTREE) NULL);
-               }
-               (void) bzero((char *) ht, nbytes);
-               t->bt_s.bt_ht = ht;
-               t->bt_npages = 0;
-               t->bt_lorder = BYTE_ORDER;
-               if (!(b->flags & R_DUP))
-                       t->bt_flags |= BTF_NODUPS;
-       } else {
-               /* on disk */
-               if ((fd = open(f, O_RDONLY, 0)) < 0) {
-                       /* doesn't exist yet, be sure page is big enough */
-                       if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
-                           && b->psize != 0) {
-                               (void) free((char *) t);
-                               errno = EINVAL;
-                               return ((BTREE) NULL);
-                       }
-                       if (b->lorder == 0)
-                               b->lorder = BYTE_ORDER;
-
-                       if (b->lorder != BIG_ENDIAN
-                           && b->lorder != LITTLE_ENDIAN) {
-                               (void) free((char *) t);
-                               errno = EINVAL;
-                               return ((BTREE) NULL);
-                       }
-                       t->bt_lorder = b->lorder;
-                       if (!(b->flags & R_DUP))
-                               t->bt_flags |= BTF_NODUPS;
-               } else {
-                       /* exists, get page size from file */
-                       if (read(fd, &m, sizeof(m)) < sizeof(m)) {
-                               (void) close(fd);
-                               (void) free((char *) t);
-                               errno = EINVAL;
-                               return ((BTREE) NULL);
-                       }
-
-                       /* lorder always stored in host-independent format */
-                       NTOHL(m.m_lorder);
-                       if (m.m_lorder != BIG_ENDIAN
-                           && m.m_lorder != LITTLE_ENDIAN) {
-                               (void) free((char *) t);
-                               errno = EINVAL;
-                               return ((BTREE) NULL);
-                       }
-                       t->bt_lorder = m.m_lorder;
-
-                       if (t->bt_lorder != BYTE_ORDER) {
-                               BLSWAP(m.m_magic);
-                               BLSWAP(m.m_version);
-                               BLSWAP(m.m_psize);
-                               BLSWAP(m.m_free);
-                               BLSWAP(m.m_flags);
-                       }
-
-                       if (m.m_magic != BTREEMAGIC
-                           || m.m_version != BTREEVERSION
-                           || m.m_psize < MINPSIZE) {
-                               (void) close(fd);
-                               (void) free((char *) t);
-#ifndef EFTYPE
-#define EFTYPE -100
+       if (openinfo) {
+               b = *openinfo;
+
+               /* Flags: R_DUP. */
+               if (b.flags && b.flags != R_DUP)
+                       goto einval;
+
+               /*
+                * Page size must be index_t aligned and >= MINPSIZE.  Default
+                * page size is set farther on, based on the underlying file
+                * transfer size.
+                */
+               if (b.psize &&
+                   (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET ||
+                   b.psize & sizeof(index_t) - 1))
+                               goto einval;
+#ifdef notdef
+               if (b.maxkeypage && b.maxkeypage < 1) 
+                       goto einval;
+
+               if (b.minkeypage) {
+                       if (b.minkeypage < 2)
+                               goto einval;
+               } else
+                       b.minkeypage = 2;
+#else
+               b.maxkeypage = DEFMAXKEYPAGE;
+               b.minkeypage = DEFMINKEYPAGE;
 #endif
 #endif
-                               errno = EFTYPE;
-                               return ((BTREE) NULL);
-                       }
-                       t->bt_psize = m.m_psize;
-                       t->bt_free = m.m_free;
-                       t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
-                       (void) close(fd);
-               }
-
-               /* now open the file the way the user wanted it */
-               if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
-                       (void) free ((char *) t);
-                       return ((BTREE) NULL);
-               }
-
-               /* access method files are always close-on-exec */
-               if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
-                       (void) close(t->bt_s.bt_d.d_fd);
-                       (void) free ((char *) t);
-                       return ((BTREE) NULL);
-               }
-
-               /* get number of pages, page size if necessary */
-               (void) fstat(t->bt_s.bt_d.d_fd, &buf);
-               if (t->bt_psize == 0)
-                       t->bt_psize = buf.st_blksize;
-               t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
-
-               /* page zero is metadata, doesn't count */
-               if (t->bt_npages > 0)
-                       --(t->bt_npages);
-
-               if (b->cachesize == 0)
-                       b->cachesize = DEFCACHE;
-
-               /* get an lru buffer cache, if the user asked for one */
-               if ((b->cachesize / t->bt_psize) > 0) {
-                       BTDISK *d = &(t->bt_s.bt_d);
-
-                       d->d_cache = lruinit(d->d_fd,
-                                            (int) (b->cachesize / t->bt_psize),
-                                            (int) t->bt_psize,
-                                            (char *) t->bt_lorder,
-                                            _bt_pgin, _bt_pgout);
-
-                       if (d->d_cache == (char *) NULL) {
-                               (void) free((char *) t);
-                               return ((BTREE) NULL);
-                       }
+               
+               /* If no comparison, use default comparison and prefix. */
+               if (b.compare == NULL) {
+                       b.compare = __bt_defcmp;
+                       if (b.prefix == NULL)
+                               b.prefix = __bt_defpfx;
                }
                }
-       }
-
-       /* remember if tree was opened for write */
-       if (((flags & O_WRONLY) == O_WRONLY)
-           || ((flags & O_RDWR) == O_RDWR))
-               t->bt_flags |= BTF_ISWRITE;
-
-       return ((BTREE) t);
-}
-
-/*
- *  BT_GET -- Get an entry from a btree.
- *
- *     Does a key lookup in the tree to find the specified key, and returns
- *     the key/data pair found.
- *
- *     Parameters:
- *             tree -- btree in which to do lookup
- *             key -- key to find
- *             data -- pointer to DBT in which to return data
- *             flag -- ignored
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
- *             found.  If key is not found, nothing is stored in the
- *             return DBT 'data'.
- *
- *     Side Effects:
- *             None.
- *
- *     Warnings:
- *             Return data is statically allocated, and will be overwritten
- *             at the next call.
- */
-
-int
-bt_get(dbp, key, data, flag)
-       DB *dbp; 
-       DBT *key, *data;
-       u_long flag;
-{
-       BTREE_P t = (BTREE_P) (dbp->internal);
-       BTHEADER *h;
-       DATUM *d;
-       BTITEM *item;
 
 
-#ifdef lint
-       flag = flag;
-#endif /* lint */
-
-       /* lookup */
-       item = _bt_search(t, key);
-
-       /* clear parent pointer stack */
-       while (_bt_pop(t) != P_NONE)
-               continue;
-
-       if (item == (BTITEM *) NULL)
-               return (RET_ERROR);
-
-       h = (BTHEADER *) t->bt_curpage;
-       data->size = 0;
-       data->data = (u_char *) NULL;
-
-       /* match? */
-       if (VALIDITEM(t, item)
-           && _bt_cmp(t, key->data, item->bti_index) == 0) {
-               d = (DATUM *) GETDATUM(h, item->bti_index);
-               return (_bt_buildret(t, d, data, key));
+               if (b.lorder == 0)
+                       b.lorder = BYTE_ORDER;
+               else if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
+                       goto einval;
+       } else {
+               b.flags = 0;
+               b.cachesize = 0;        /* Raised to the minimum cache size below. */
+               b.psize = 0;            /* Taken from the file or st_blksize below. */
+               b.maxkeypage = DEFMAXKEYPAGE;
+               b.minkeypage = DEFMINKEYPAGE;
+               b.compare = __bt_defcmp;
+               b.prefix = __bt_defpfx;
+               b.lorder = BYTE_ORDER;
        }
 
        }
 
-       /* nope */
-       return (RET_SPECIAL);
-}
-
-/*
- *  BT_PUT -- Add an entry to a btree.
- *
- *     The specified (key, data) pair is added to the tree.  If the tree
- *     was created for unique keys only, then duplicates will not be
- *     entered.  If the requested key exists in the tree, it will be over-
- *     written unless the flags parameter is R_NOOVERWRITE, in which case
- *     the update will not be done.  If duplicate keys are permitted in the
- *     tree, duplicates will be inserted and will not overwrite existing
- *     keys.  Nodes are split as required.
- *
- *     Parameters:
- *             tree -- btree in which to put the new entry
- *             key -- key component to add
- *             data -- data corresponding to key
- *             flag -- R_NOOVERWRITE or zero.
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
- *             NOOVERWRITE flag was set and the specified key exists
- *             in the database.
- *
- *     Side Effects:
- *             None.
- */
-
-int
-bt_put(dbp, key, data, flag)
-       DB *dbp;
-       DBT *key, *data;
-       u_long flag;
-{
-       BTREE_P t;
-       BTITEM *item;
-
-       t = (BTREE_P)dbp->internal;
-
-       /* look for this key in the tree */
-       item = _bt_search(t, key);
+       /* Allocate and initialize DB and BTREE structures. */
+       if ((t = malloc(sizeof(BTREE))) == NULL)
+               goto err;
+       t->bt_fd = -1;                  /* Don't close unopened fd on error. */
+       if ((t->bt_dbp = dbp = malloc(sizeof(DB))) == NULL)
+               goto err;
+       t->bt_bcursor.pgno = P_INVALID;
+       t->bt_bcursor.index = 0;
+       t->bt_stack = NULL;
+       t->bt_sp = t->bt_maxstack = 0;
+       t->bt_kbuf = t->bt_dbuf = NULL;
+       t->bt_kbufsz = t->bt_dbufsz = 0;
+       t->bt_maxkeypage = b.maxkeypage;
+       t->bt_minkeypage = b.minkeypage;
+       t->bt_order = NOT;
+       t->bt_cmp = b.compare;
+       t->bt_pfx = b.prefix;
+
+       dbp->type = DB_BTREE;
+       dbp->internal = t;
+       dbp->close = __bt_close;
+       dbp->del = __bt_delete;
+       dbp->get = __bt_get;
+       dbp->put = __bt_put;
+       dbp->seq = __bt_seq;
+       dbp->sync = __bt_sync;
 
        /*
 
        /*
-        *  If this tree was originally created without R_DUP, then duplicate
-        *  keys are not allowed.  We need to check this at insertion time.
+        * If no file name was supplied, this is an in-memory btree and we
+        * open a backing temporary file.  Otherwise, it's a disk-based tree.
         */
         */
+       if (fname) {
+#define        USEFLAGS        (O_CREAT|O_EXCL|O_RDONLY|O_RDWR|O_TRUNC|O_WRONLY)
+               if ((t->bt_fd = open(fname, flags & USEFLAGS, mode)) < 0)
+                       goto err;
+               if ((flags & O_ACCMODE) == O_RDONLY)
+                       SET(t, BTF_RDONLY);
 
 
-       if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
-               if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
-                       if (_bt_delone(t, item->bti_index) == RET_ERROR) {
-                               while (_bt_pop(t) != P_NONE)
-                                       continue;
-                               return (RET_ERROR);
-                       }
-               }
+       } else {
+               if ((t->bt_fd = tmp()) == -1)
+                       goto err;
+               SET(t, BTF_INMEM);
        }
 
        }
 
-       return (_bt_insert(t, item, key, data, flag));
-}
-
-/*
- *  BT_DELETE -- delete a key from the tree.
- *
- *     Deletes all keys (and their associated data items) matching the
- *     supplied key from the tree.  If the flags entry is R_CURSOR, then
- *     the current item in the active scan is deleted.
- *
- *     Parameters:
- *             tree -- btree from which to delete key
- *             key -- key to delete
- *             flags -- R_CURSOR or zero
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
- *             key was not in the tree.
- *
- *     Side Effects:
- *             None.
- */
-
-int
-bt_delete(dbp, key, flags)
-       DB *dbp;
-       DBT *key;
-       u_long flags;
-{
-       BTREE_P t;
-       BTHEADER *h;
-       BTITEM *item;
-       int ndel = 0;
-
-       t = (BTREE_P)dbp->internal;
-
-       if (flags == R_CURSOR)
-               return (_bt_crsrdel(t));
-
-       /* find the first matching key in the tree */
-       item = _bt_first(t, key);
-       h = t->bt_curpage;
-
-       /* don't need the descent stack for deletes */
-       while (_bt_pop(t) != P_NONE)
-               continue;
-
-       /* delete all matching keys */
-       for (;;) {
-               while (VALIDITEM(t, item)
-                      && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
-                       if (_bt_delone(t, item->bti_index) == RET_ERROR)
-                               return (RET_ERROR);
-                       ndel++;
+       /*
+        * Access method files are always close-on-exec.  That is a file
+        * descriptor flag (F_SETFD, 1 == FD_CLOEXEC), not a file status
+        * flag (F_SETFL).
+        */
+       if (fcntl(t->bt_fd, F_SETFD, 1) == -1)
+               goto err;
+
+       if (fstat(t->bt_fd, &sb))
+               goto err;
+       if (sb.st_size) {
+               nr = read(t->bt_fd, &m, sizeof(BTMETA));
+               if (nr < 0)
+                       goto err;
+               if (nr != sizeof(BTMETA))
+                       goto eftype;
+
+               /*
+                * Read in the meta-data.  This can change the notion of what
+                * the lorder, page size and flags are, and, when the page size
+                * changes the cachesize value can change as well.
+                *
+                * Lorder is always stored in host-independent format.
+                */
+               NTOHL(m.m_lorder);
+               if (m.m_lorder != BIG_ENDIAN && m.m_lorder != LITTLE_ENDIAN)
+                       goto eftype;
+               if (m.m_lorder != BYTE_ORDER) {
+                       BLSWAP(m.m_magic);
+                       BLSWAP(m.m_version);
+                       BLSWAP(m.m_psize);
+                       BLSWAP(m.m_free);
+                       BLSWAP(m.m_nrecs);
+                       BLSWAP(m.m_flags);
                }
                }
+               if (m.m_magic != BTREEMAGIC || m.m_version != BTREEVERSION)
+                       goto eftype;
+               if (m.m_psize < MINPSIZE || m.m_psize > MAX_PAGE_OFFSET ||
+                   m.m_psize & sizeof(index_t) - 1)
+                       goto eftype;
+               /* Reject meta-data with any flag bit set outside of SAVEMETA. */
+               if (m.m_flags & ~SAVEMETA)
+                       goto eftype;
+
+               b.psize = m.m_psize;
+               t->bt_flags = m.m_flags;
+               t->bt_free = m.m_free;
+               t->bt_lorder = m.m_lorder;
+               t->bt_nrecs = m.m_nrecs;
+       } else {
+               /*
+                * Set the page size to the best value for I/O to this file.
+                * Don't overflow the page offset type.
+                */
+               if (b.psize == 0) {
+                       b.psize = sb.st_blksize;
+                       if (b.psize < MINPSIZE)
+                               b.psize = MINPSIZE;
+                       if (b.psize > MAX_PAGE_OFFSET)
+                               b.psize = MAX_PAGE_OFFSET;
+               }
+               t->bt_flags = b.flags & R_DUP ? 0 : BTF_NODUPS;
+               t->bt_free = P_INVALID;
+               t->bt_lorder = b.lorder;
+               t->bt_nrecs = 0;
+               SET(t, BTF_METADIRTY);
+       }
 
 
-               if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
-                       break;
-
-               /* next page, if necessary */
-               do {
-                       if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
-                               return (RET_ERROR);
-                       h = t->bt_curpage;
-               } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
+       t->bt_psize = b.psize;
 
 
-               item->bti_pgno = h->h_pgno;
-               item->bti_index = 0;
+       /* Set the cache size; must be a multiple of the page size. */
+       if (b.cachesize && b.cachesize & b.psize - 1)
+               b.cachesize += (~b.cachesize & b.psize - 1) + 1;
+       if (b.cachesize < b.psize * MINCACHE)
+               b.cachesize = b.psize * MINCACHE;
 
 
-               if (!VALIDITEM(t, item)
-                   || _bt_cmp(t, key->data, item->bti_index) != 0)
-                       break;
-       }
+       /* Calculate number of pages to cache. */
+       ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
 
 
-       /* flush changes to disk */
-       if (ISDISK(t)) {
-               if (h->h_flags & F_DIRTY) {
-                       if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
-                               return (RET_ERROR);
-               }
-       }
+       /*
+        * The btree data structure requires that at least two keys can fit
+        * on a page, but other than that there's no fixed requirement.  The
+        * user can specify the minimum number per page, and we translate
+        * that into the maximum number of bytes a key can use before being
+        * placed on an overflow page.
+        */
+       t->bt_minkeypage = (t->bt_psize - BTDATAOFF) / b.minkeypage;
 
 
-       if (ndel == 0)
-               return (RET_SPECIAL);
+       /* Initialize the buffer pool. */
+       if ((t->bt_mp =
+           mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
+               goto err;
+       mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
 
 
-       return (RET_SUCCESS);
-}
+       /* Create a root page if new tree. */
+       if (nroot(t) == RET_ERROR)
+               goto err;
 
 
-/*
- *  BT_SYNC -- sync the btree to disk.
- *
- *     Parameters:
- *             tree -- btree to sync.
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- */
+       return (dbp);
 
 
-bt_sync(dbp)
-       DB *dbp;
-{
-       BTREE_P t;
-       BTHEADER *h;
-       pgno_t pgno;
+einval:        errno = EINVAL;
+       goto err;
 
 
-       t = (BTREE_P)dbp->internal;
-
-       /* if this is an in-memory btree, syncing is a no-op */
-       if (!ISDISK(t))
-               return (RET_SUCCESS);
+eftype:        errno = EFTYPE;
+       goto err;
 
 
-       h = (BTHEADER *) t->bt_curpage;
-       h->h_flags &= ~F_DIRTY;
-
-       if (ISCACHE(t)) {
-               pgno = t->bt_curpage->h_pgno;
-               if (_bt_write(t, h, RELEASE) == RET_ERROR)
-                       return(RET_ERROR);
-               if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
-                       return (RET_ERROR);
-               if (_bt_getpage(t, pgno) == RET_ERROR)
-                       return (RET_ERROR);
-       } else {
-               if (_bt_write(t, h, NORELEASE) == RET_ERROR)
-                       return (RET_ERROR);
+err:   if (t) {
+               if (t->bt_dbp)
+                       free(t->bt_dbp);
+               if (t->bt_fd != -1)
+                       (void)close(t->bt_fd);
+               free(t);
        }
        }
-
-       return (fsync(t->bt_s.bt_d.d_fd));
+       return (NULL);
 }
 
 /*
 }
 
 /*
- *  BT_SEQ -- Sequential scan interface.
- *
- *     This routine supports sequential scans on the btree.  If called with
- *     flags set to R_CURSOR, or if no seq scan has been initialized in the
- *     current tree, we initialize the scan.  Otherwise, we advance the scan
- *     and return the next item.
- *
- *     Scans can be either keyed or non-keyed.  Keyed scans basically have
- *     a starting point somewhere in the middle of the tree.  Non-keyed
- *     scans start at an endpoint.  Also, scans can be backward (descending
- *     order), forward (ascending order), or no movement (keep returning
- *     the same item).
+ * NROOT -- Create the root of a new tree.
  *
  *
- *     Flags is checked every time we enter the routine, so the user can
- *     change directions on an active scan if desired.  The key argument
- *     is examined only when we initialize the scan, in order to position
- *     it properly.
+ * Parameters:
+ *     t:      tree
  *
  *
- *     Items are returned via the key and data arguments passed in.
- *
- *     Parameters:
- *             tree -- btree in which to do scan
- *             key -- key, used to position scan on initialization, and
- *                    used to return key components to the user.
- *             data -- used to return data components to the user.
- *             flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
- *                      R_PREV.
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
- *             exists in the tree in the specified direction.
- *
- *     Side Effects:
- *             Changes the btree's notion of the current position in the
- *             scan.
- *
- *     Warnings:
- *             The key and data items returned are static and will be
- *             overwritten by the next bt_get or bt_seq.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
  */
  */
-
-int
-bt_seq(dbp, key, data, flags)
-       DB *dbp;
-       DBT *key, *data;
-       u_long flags;
+static int
+nroot(t)
+       BTREE *t;
 {
 {
-       BTREE_P t;
-       BTHEADER *h;
-       DATUM *d;
-       int status;
-
-       t = (BTREE_P)dbp->internal;
+       PAGE *meta, *root;
+       pgno_t npg;
 
 
-       /* do we need to initialize the scan? */
-       if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
-           || !(t->bt_flags & BTF_SEQINIT)) {
-
-               /* initialize it */
-               status = _bt_seqinit(t, key, flags);
-       } else {
-               /* just advance the current scan pointer */
-               status = _bt_seqadvance(t, flags);
+       if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
+               mpool_put(t->bt_mp, meta, 0);
+               return (RET_SUCCESS);
        }
        }
+       if (errno != EINVAL)
+               return (RET_ERROR);
 
 
-       key->size = data->size = 0;
-       key->data = data->data = (u_char *) NULL;
-
-       h = t->bt_curpage;
-
-       /* is there a valid item at the current scan location? */
-       if (status == RET_SPECIAL) {
-               if (flags == R_NEXT) {
-                       if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
-                               if (NEXTINDEX(h) > 0)
-                                       t->bt_cursor.c_index = NEXTINDEX(h) - 1;
-                               else
-                                       t->bt_cursor.c_index = 0;
-                       }
-               } else {
-                       t->bt_cursor.c_index = 0;
-               }
-               return (RET_SPECIAL);
-       } else if (status == RET_ERROR)
+       if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
                return (RET_ERROR);
 
                return (RET_ERROR);
 
-       /* okay, return the data */
-       d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
+       if ((root = mpool_new(t->bt_mp, &npg)) == NULL) {
+               /* Hand the meta page back to the pool before failing. */
+               (void)mpool_put(t->bt_mp, meta, 0);
+               return (RET_ERROR);
+       }
 
 
-       return (_bt_buildret(t, d, data, key));
+       if (npg != P_ROOT) {
+               /* Root landed on the wrong page; give both pages back. */
+               (void)mpool_put(t->bt_mp, meta, 0);
+               (void)mpool_put(t->bt_mp, root, 0);
+               return (RET_ERROR);
+       }
+       root->pgno = npg;
+       root->prevpg = root->nextpg = P_INVALID;
+       root->lower = BTDATAOFF;
+       root->upper = t->bt_psize;
+       root->flags = P_BLEAF;
+       mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
+       mpool_put(t->bt_mp, root, MPOOL_DIRTY);
+       return (RET_SUCCESS);
 }
 
 }
 
-/*
- *  BT_CLOSE -- Close a btree
- *
- *     Parameters:
- *             tree -- tree to close
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             Frees space occupied by the tree.
- */
-
-int
-bt_close(dbp)
-       DB *dbp;
+/*
+ * TMP -- Create an unnamed temporary file to back an in-memory tree.
+ *
+ * Returns:
+ *     File descriptor on success, -1 on failure.
+ */
+static int
+tmp()
 {
 {
-       struct HTBUCKET *b, *sb;
-       BTREE_P t;
-       BTHEADER *h;
-       HTABLE ht;
-       int fd, i;
-       char *cache;
-
-       t = (BTREE_P)dbp->internal;
-
-       if (t->bt_cursor.c_key != (char *) NULL)
-               (void) free(t->bt_cursor.c_key);
-
-       if (!ISDISK(t)) {
-               /* in-memory tree, release hash table memory */
-               ht = t->bt_s.bt_ht;
-               for (i = 0; i < HTSIZE; i++) {
-                       if ((b = ht[i]) == (struct HTBUCKET *) NULL)
-                               break;
-                       do {
-                               sb = b;
-                               (void) free((char *) b->ht_page);
-                               b = b->ht_next;
-                               (void) free((char *) sb);
-                       } while (b != (struct HTBUCKET *) NULL);
-               }
-               (void) free ((char *) ht);
-               (void) free ((char *) t);
-               return (RET_SUCCESS);
-       }
-
-       if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
-               if (_bt_wrtmeta(t) == RET_ERROR)
-                       return (RET_ERROR);
-       }
-
-       if (t->bt_curpage != (BTHEADER *) NULL) {
-               h = t->bt_curpage;
-               if (h->h_flags & F_DIRTY) {
-                       if (_bt_write(t, h, RELEASE) == RET_ERROR)
-                               return (RET_ERROR);
-               } else {
-                       if (_bt_release(t, h) == RET_ERROR)
-                               return (RET_ERROR);
-               }
-
-               /* flush and free the cache, if there is one */
-               if (ISCACHE(t)) {
-                       cache = t->bt_s.bt_d.d_cache;
-                       if (lrusync(cache) == RET_ERROR)
-                               return (RET_ERROR);
-                       lrufree(cache);
-               }
-               (void) free ((char *) h);
-       }
-
-       fd = t->bt_s.bt_d.d_fd;
-       (void) free ((char *) t);
-       return (close(fd));
+       sigset_t set, oset;
+       int fd;
+       char *envtmp;
+       char path[MAXPATHLEN];
+
+       /* Build the mkstemp template under $TMPDIR, falling back to /tmp. */
+       envtmp = getenv("TMPDIR");
+       (void)snprintf(path,
+           sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp");
+
+       /*
+        * Block all signals while the file still has a name, presumably so
+        * that a signal cannot leave it behind between mkstemp and unlink.
+        * The previous signal mask is restored afterwards.
+        */
+       sigfillset(&set);
+       (void)sigprocmask(SIG_BLOCK, &set, &oset);
+       if ((fd = mkstemp(path)) != -1)
+               (void)unlink(path);
+       (void)sigprocmask(SIG_SETMASK, &oset, NULL);
+       return(fd);
 }
 }
index 3976cd2..b813304 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_overflow.c      5.2 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_overflow.c      5.3 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #endif /* LIBC_SCCS and not lint */
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <db.h>
 #include <db.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
 /*
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
 /*
- *  _BT_GETBIG -- Get big data from indirect pages.
+ * Big key/data code.
  *
  *
- *     This routine chases indirect blocks for the big object at the 
- *     specified page to a buffer, and return the address of the buffer.
+ * Big key and data entries are stored on linked lists of pages.  The initial
+ * reference is a byte string stored with the key or data; it holds the page
+ * number and size.  The actual record is stored in a chain of pages linked by
+ * the nextpg field of the PAGE header.
  *
  *
- *     Parameters:
- *             t -- btree with the indirect blocks
- *             pgno -- page number that starts the chain
- *             p -- (char **) to get the address of the buffer containing
- *                  the key or datum.
- *             sz -- pointer to an int to get the size of the instantiated
- *                   object.
+ * The first page of the chain has a special property.  If the record is used
+ * by an internal page, it cannot be deleted and the P_PRESERVE bit will be set
+ * in the header.
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             None.
+ * XXX
+ * A single DBT is written to each chain, so a lot of space on the last page
+ * is wasted.  This is a fairly major bug for some data sets.
  */
 
  */
 
-int
-_bt_getbig(t, pgno, p, sz)
-       BTREE_P t;
-       pgno_t pgno;
-       char **p;
-       int *sz;
-{
-       pgno_t save;
-       size_t nbytes;
-       size_t nhere;
-       BTHEADER *h;
-       char *top, *from, *where;
-
-       save = t->bt_curpage->h_pgno;
-       if (_bt_getpage(t, pgno) == RET_ERROR)
-               return (RET_ERROR);
-
-       h = t->bt_curpage;
-
-       bcopy((char *) &(h->h_linp[0]),
-             (char *) &nbytes,
-             (size_t) sizeof(nbytes));
-
-       if ((*p = (char *) malloc(nbytes)) == (char *) NULL)
-               return (RET_ERROR);
-
-       *sz = nbytes;
-       from = ((char *) (&h->h_linp[0])) + sizeof(nbytes);
-       top = ((char *) h) + t->bt_psize;
-
-       /* need more space for data? */
-
-       where = *p;
-
-       while (nbytes > 0) {
-               nhere = (int) (top - from);
-               if (nhere > nbytes) {
-                       (void) bcopy(from, where, (size_t) nbytes);
-                       nbytes = 0;
-               } else {
-                       (void) bcopy(from, where, nhere);
-                       where += nhere;
-                       nbytes -= nhere;
-                       if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
-                               return (RET_ERROR);
-                       h = t->bt_curpage;
-                       top = ((char *) h) + t->bt_psize;
-                       from = (char *) &(h->h_linp[0]);
-               }
-       }
-
-       if (_bt_getpage(t, save) == RET_ERROR)
-               return (RET_ERROR);
-
-       return (RET_SUCCESS);
-}
-
 /*
 /*
- *  _BT_DELINDIR -- Delete a chain of indirect blocks from the btree.
- *
- *     When a large item is deleted from the tree, this routine puts the
- *     space that it occupied onto the free list for later reuse.  The
- *     bt_free entry in the btree structure points at the head of this list.
- *     This value is also stored on disk in the btree's metadata.
- *
- *     Parameters:
- *             t -- btree from which to delete pages
- *             chain -- page number that starts the chain.
+ * __OVFL_GET -- Get an overflow key/data item.
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
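+ * Parameters:
+ *     t:      tree
+ *     p:      pointer to { pgno_t, size_t }
+ *     ssz:    address to store the item's size
+ *     buf:    address of the storage buffer pointer (grown as necessary)
+ *     bufsz:  address of the storage buffer size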
  *
  *
- *     Side Effects:
- *             Invalidates the current on-disk version of the btree's
- *             metadata (if any), and updates the free list appropriately.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
  */
  */
-
 int
 int
-_bt_delindir(t, chain)
-       BTREE_P t;
-       pgno_t chain;
+__ovfl_get(t, p, ssz, buf, bufsz)
+       BTREE *t;
+       void *p;
+       size_t *ssz;
+       char **buf;
+       size_t *bufsz;
 {
 {
-       BTHEADER *h;
-       pgno_t save;
-       pgno_t oldfree;
-
-       h = t->bt_curpage;
-       save = h->h_pgno;
-       if (_bt_getpage(t, chain) == RET_ERROR)
-               return (RET_ERROR);
+       PAGE *h;
+       pgno_t pg;
+       size_t nb, plen, sz;
+
+       pg = *(pgno_t *)p;
+       *ssz = sz = *(size_t *)((char *)p + sizeof(pgno_t));
+
+#ifdef DEBUG
+       if (pg == P_INVALID || sz == 0)
+               abort();
+#endif
+       /* Make the buffer bigger as necessary. */
+       if (*bufsz < sz) {
+               if ((*buf = realloc(*buf, sz)) == NULL)
+                       return (RET_ERROR);
+               *bufsz = sz;
+       }
 
        /*
 
        /*
-        *  If some internal node is pointing at this chain, don't
-        *  delete it.
+        * Step through the linked list of pages, copying the data on each one
+        * into the buffer.  Never copy more than the data's length.
         */
         */
-
-       if (t->bt_curpage->h_flags & F_PRESERVE) {
-               if (_bt_getpage(t, save) == RET_ERROR)
+       plen = t->bt_psize - BTDATAOFF;
+       for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) {
+               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
                        return (RET_ERROR);
                        return (RET_ERROR);
-               return (RET_SUCCESS);
-       }
-
-       /* if there's nothing on the free list, this is easy... */
-       if (t->bt_free == P_NONE) {
-               t->bt_free = chain;
-       } else {
-               oldfree = t->bt_free;
 
 
-               /* find the end of the data chain for the deleted datum */
-               t->bt_free = chain;
-               do {
-                       if (_bt_getpage(t, chain) == RET_ERROR)
-                               return (RET_ERROR);
-                       h = t->bt_curpage;
-                       if (h->h_nextpg != P_NONE)
-                               chain = h->h_nextpg;
-               } while (h->h_nextpg != P_NONE);
+               nb = MIN(sz, plen);
+               bcopy((char *)h + BTDATAOFF, p, nb);
+               mpool_put(t->bt_mp, h, 0);
 
 
-               /* link freed pages into free list */
-               h->h_nextpg = oldfree;
-               if (_bt_write(t, h, RELEASE) == RET_ERROR)
-                       return (RET_ERROR);
-               if (_bt_getpage(t, oldfree) == RET_ERROR)
-                       return (RET_ERROR);
-               h = t->bt_curpage;
-               h->h_prevpg = chain;
-               if (_bt_write(t, h, RELEASE) == RET_ERROR)
-                       return (RET_ERROR);
+               if ((sz -= nb) == 0)
+                       break;
        }
        }
-
-       /* restore the tree's current page pointer */
-       if (_bt_getpage(t, save) == RET_ERROR)
-               return (RET_ERROR);
-
-       /* we have trashed the tree metadata; rewrite it later */
-       t->bt_flags &= ~BTF_METAOK;
-
        return (RET_SUCCESS);
 }
 
 /*
        return (RET_SUCCESS);
 }
 
 /*
- *  _BT_INDIRECT -- Write a series of indirect pages for big objects.
- *
- *     A chain of indirect pages looks like
- *
- *        +-------------------+   +---------------------+
- *        |hdr|size|          |   |hdr|                 |
- *        +---+----+          |   +---+                 |
- *        |   ... data ...    |   |   ... data ...      |    ...
- *        |                   |   |                     |
- *        +-------------------+   +---------------------+
- *
- *     where hdr is a standard btree page header (with the indirect bit
- *     set), size on the first page is the real size of the datum, and
- *     data are bytes of the datum, split across as many pages as necessary.
- *     Indirect pages are chained together with the h_prevpg and h_nextpg
- *     entries of the page header struct.
- *
- *     A single DBT is written per chain, so space on the last page is
- *     wasted.
+ * __OVFL_PUT -- Store an overflow key/data item.
  *
  *
- *     We return the page number of the start of the chain.
+ * Parameters:
+ *     t:      tree
+ *     dbt:    DBT to store
+ *     pg:     address to store the number of the first page in the chain
  *
  *
- *     When a big object is deleted from a tree, the space that it occupied
- *     is placed on a free list for later reuse.  This routine checks that
- *     free list before allocating new pages to the big datum being inserted.
- *
- *     Parameters:
- *             t -- btree in which to store indirect blocks
- *             data -- DBT with the big datum in it
- *             pgno -- place to put the starting page number of the chain
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             Current page is changed on return.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
  */
  */
-
 int
 int
-_bt_indirect(t, data, pgno)
-       BTREE_P t;
-       DBT *data;
-       pgno_t *pgno;
+__ovfl_put(t, dbt, pg)
+       BTREE *t;
+       const DBT *dbt;
+       pgno_t *pg;
 {
 {
-       pgno_t prev;
-       char *top;
-       char *where;
-       char *from;
-       size_t dsize;
-       pgno_t nextchn;
-       int ischain;
-       BTHEADER *cur;
-
-       /* get set for first page in chain */
-       prev = P_NONE;
-       dsize = data->size;
-       from = (char *) data->data;
+       PAGE *h, *last;
+       void *p;
+       pgno_t npg;
+       size_t nb, plen, sz;
 
 
-       /* if there are blocks on the free list, use them first */
-       if ((*pgno = t->bt_free) == P_NONE) {
-               if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
-                       return (RET_ERROR);
-
-               ischain = 0;
-               *pgno = cur->h_pgno = ++(t->bt_npages);
-       } else {
-               if (_bt_getpage(t, *pgno) == RET_ERROR)
+       /*
+        * Allocate pages and copy the key/data record into them.  Store the
+        * number of the first page in the chain.
+        */
+       plen = t->bt_psize - BTDATAOFF;
+       for (last = NULL, p = dbt->data, sz = dbt->size;;
+           p = (char *)p + plen, last = h) {
+               if ((h = mpool_new(t->bt_mp, &npg)) == NULL)
                        return (RET_ERROR);
                        return (RET_ERROR);
-               ischain = 1;
-               cur = t->bt_curpage;
-       }
 
 
-       cur->h_flags = F_CONT|F_LEAF;
-       (void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t));
-       where = ((char *) (&cur->h_linp[0])) + sizeof(size_t);
-
-       /* fill and write pages in the chain */
-       for (;;) {
-               int nhere;
-
-               top = ((char *) cur) + t->bt_psize;
-               cur->h_prevpg = prev;
-               nextchn = cur->h_nextpg;
-               nhere = (int) (top - where);
-
-               if (nhere >= dsize) {
-                       (void) bcopy(from, where, (int) dsize);
-                       cur->h_nextpg = P_NONE;
-                       dsize = 0;
-               } else {
-                       (void) bcopy(from, where, (int) nhere);
-                       dsize -= nhere;
-                       from += nhere;
-                       if (nextchn == P_NONE)
-                               cur->h_nextpg = t->bt_npages + 1;
-                       prev = cur->h_pgno;
-               }
+               h->pgno = npg;
+               h->nextpg = h->prevpg = P_INVALID;
+               h->lower = h->upper = 0;
 
 
-               /* current page is ready to go; write it out */
-               if (_bt_write(t, cur, RELEASE) == RET_ERROR)
-                       return (RET_ERROR);
+               nb = MIN(sz, plen);
+               bcopy(p, (char *)h + BTDATAOFF, nb);
 
 
-               /* free the current page, if appropriate */
-               if (ISDISK(t) && !ISCACHE(t) && !ischain) {
-                       (void) free ((char *) cur);
-               }
+               if (last) {
+                       last->nextpg = h->pgno;
+                       last->flags |= P_OVERFLOW;
+                       mpool_put(t->bt_mp, last, MPOOL_DIRTY);
+               } else
+                       *pg = h->pgno;
 
 
-               /* done? */
-               if (dsize == 0)
+               if ((sz -= nb) == 0) {
+                       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
                        break;
                        break;
-
-               /* allocate another page */
-               if (nextchn == P_NONE) {
-                       if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
-                               return (RET_ERROR);
-                       ischain = 0;
-                       cur->h_pgno = ++(t->bt_npages);
-               } else {
-                       if (_bt_getpage(t, nextchn) == RET_ERROR)
-                               return (RET_ERROR);
-                       ischain = 1;
-                       cur = t->bt_curpage;
                }
                }
-               cur->h_flags = F_LEAF | F_CONT;
-
-               where = (char *) (&cur->h_linp[0]);
-       }
-
-       /* if we used pages from the free list, record changes to it */
-       if (*pgno == t->bt_free) {
-               t->bt_free = nextchn;
-               t->bt_flags &= ~BTF_METAOK;
        }
        }
-
        return (RET_SUCCESS);
 }
 
 /*
        return (RET_SUCCESS);
 }
 
 /*
- *  _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node.
- *
- *     Chains of indirect blocks pointed to by leaf nodes get reclaimed
- *     when the item that points to them gets deleted.  Chains pointed
- *     to by internal nodes never get deleted.  This routine marks a
- *     chain as pointed to by an internal node.
+ * __OVFL_DELETE -- Delete an overflow chain.
  *
  *
- *     Parameters:
- *             t -- tree in which to mark
- *             chain -- number of first page in chain
+ * Parameters:
+ *     t:      tree
+ *     p:      pointer to { pgno_t, size_t }
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             None.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
  */
  */
-
 int
 int
-_bt_markchain(t, chain)
-       BTREE_P t;
-       pgno_t chain;
+__ovfl_delete(t, p)
+       BTREE *t;
+       void *p;
 {
 {
-       pgno_t save;
-
-       save = t->bt_curpage->h_pgno;
-
-       if (_bt_getpage(t, chain) == RET_ERROR)
+       PAGE *h;
+       pgno_t pg;
+       size_t plen, sz;
+
+       pg = *(pgno_t *)p;
+       sz = *(size_t *)((char *)p + sizeof(pgno_t));
+
+#ifdef DEBUG
+       if (pg == P_INVALID || sz == 0)
+               abort();
+#endif
+       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
                return (RET_ERROR);
 
                return (RET_ERROR);
 
-       t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE);
-
-       if (_bt_getpage(t, save) == RET_ERROR)
-               return (RET_ERROR);
+       /* Don't delete chains used by internal pages. */
+       if (h->flags & P_PRESERVE) {
+               mpool_put(t->bt_mp, h, 0);
+               return (RET_SUCCESS);
+       }
 
 
+       /* Step through the chain, calling the free routine for each page. */
+       plen = t->bt_psize - BTDATAOFF;
+       for (;; sz -= plen) {
+               if (sz >= plen)
+                       break;
+               pg = h->nextpg;
+               /* XXX mpool_free(t->bt_mp, h->pgno); */
+               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                       return (RET_ERROR);
+       }
        return (RET_SUCCESS);
 }
        return (RET_SUCCESS);
 }
index c15b811..89ce70f 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_put.c   5.3 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_put.c   5.4 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
+#include <errno.h>
 #include <db.h>
 #include <db.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
+static EPG *bt_fast __P((BTREE *, const DBT *, const DBT *, int *));
+
 /*
 /*
- *  _BT_INSERT -- Insert a new user datum in the btree.
- *
- *     This routine is called by bt_put, the public interface, once the
- *     location for the new item is known.  We do the work here, and
- *     handle splits if necessary.
+ * __BT_PUT -- Add a btree item to the tree.
  *
  *
- *     Parameters:
- *             t -- btree in which to do the insertion.
- *             item -- BTITEM describing location of new datum
- *             key -- key to insert
- *             data -- data associated with key
- *             flag -- magic cookie passed recursively to bt_put if we
- *                     have to do a split
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key
+ *     data:   data
+ *     flags:  0 or R_NOOVERWRITE
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the
+ *     tree and R_NOOVERWRITE specified.
  */
  */
-
 int
 int
-_bt_insert(t, item, key, data, flag)
-       BTREE_P t;
-       BTITEM *item;
-       DBT *key;
-       DBT *data;
-       int flag;
+__bt_put(dbp, key, data, flags)
+       const DB *dbp;
+       const DBT *key, *data;
+       u_int flags;
 {
 {
-       index_t index;
-       BTHEADER *h;
-       DATUM *d;
-       int nbytes;
-       int status;
-       pgno_t pgno;
-       int keysize, datasize;
-       int bigkey, bigdata;
-
-       if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
+       BTREE *t;
+       DBT tkey, tdata;
+       EPG *e;
+       PAGE *h;
+       index_t index, nxtindex;
+       pgno_t pg;
+       size_t nbytes;
+       int dflags, exact;
+       char *dest, db[NOVFLSIZE], kb[NOVFLSIZE];
+
+       if (flags && flags != R_NOOVERWRITE) {
+               errno = EINVAL;
                return (RET_ERROR);
                return (RET_ERROR);
-       h = t->bt_curpage;
-
-       if (TOOBIG(t, data->size)) {
-               bigdata = TRUE;
-               datasize = sizeof(pgno_t);
-       } else {
-               bigdata = FALSE;
-               datasize = data->size;
        }
        }
-
-       if (TOOBIG(t, key->size)) {
-               bigkey = TRUE;
-               keysize = sizeof(pgno_t);
-       } else {
-               bigkey = FALSE;
-               keysize = key->size;
+       t = dbp->internal;
+       if (ISSET(t, BTF_RDONLY)) {
+               errno = EPERM;
+               return (RET_ERROR);
        }
        }
-
-       nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
-       nbytes = LONGALIGN(nbytes) + sizeof(index_t);
-
-       /* if there's not enough room here, split the page */
-       if ((h->h_upper - h->h_lower) < nbytes) {
-               if (_bt_split(t) == RET_ERROR)
+       
+       /*
+        * If the key/data won't fit on a page, store it on indirect pages.
+        *
+        * XXX
+        * If the insert fails later on, these pages aren't recovered.
+        */
+       dflags = 0;
+       if (key->size >= t->bt_minkeypage) {
+               if (__ovfl_put(t, key, &pg) == RET_ERROR)
                        return (RET_ERROR);
                        return (RET_ERROR);
-
-               /* okay, try again (empty the stack first, though) */
-               while (_bt_pop((BTREE) t) != P_NONE)
-                       continue;
-
-               return (bt_put((BTREE) t, key, data, flag));
+               tkey.data = kb;
+               tkey.size = NOVFLSIZE;
+               *(pgno_t *)kb = pg;
+               *(size_t *)(kb + sizeof(pgno_t)) = key->size;
+               dflags |= P_BIGKEY;
+               key = &tkey;
        }
        }
-
-       /* put together a leaf page datum from the key/data pair */
-       index = item->bti_index;
-       nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
-
-       if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL)
-               return (RET_ERROR);
-
-       d->d_ksize = keysize;
-       d->d_dsize = datasize;
-       d->d_flags = 0;
-
-       if (bigkey) {
-               if (_bt_indirect(t, key, &pgno) == RET_ERROR)
-                       return (RET_ERROR);
-               (void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno));
-               d->d_flags |= D_BIGKEY;
-               if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
+       if (data->size >= t->bt_minkeypage) {
+               if (__ovfl_put(t, data, &pg) == RET_ERROR)
                        return (RET_ERROR);
                        return (RET_ERROR);
-       } else {
-               if (d->d_ksize > 0) {
-                       (void) bcopy((char *) key->data,
-                                     (char *) &(d->d_bytes[0]),
-                                     (int) d->d_ksize);
-               }
+               tdata.data = db;
+               tdata.size = NOVFLSIZE;
+               *(pgno_t *)db = pg;
+               *(size_t *)(db + sizeof(pgno_t)) = data->size;
+               dflags |= P_BIGDATA;
+               data = &tdata;
        }
 
        }
 
-       if (bigdata) {
-               if (_bt_indirect(t, data, &pgno) == RET_ERROR)
+       /* bt_fast and __bt_search pin the returned page. */
+       if (t->bt_order == NOT || (e = bt_fast(t, key, data, &exact)) == NULL)
+               if ((e = __bt_search(t, key, &exact)) == NULL)
                        return (RET_ERROR);
                        return (RET_ERROR);
-               (void) bcopy((char *) &pgno,
-                            &(d->d_bytes[keysize]),
-                            sizeof(pgno));
-               d->d_flags |= D_BIGDATA;
-               if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
-                       return (RET_ERROR);
-       } else {
-               if (d->d_dsize > 0) {
-                       (void) bcopy((char *) data->data,
-                                     (char *) &(d->d_bytes[keysize]),
-                                     (int) d->d_dsize);
-               }
-       }
-
-       /* do the insertion */
-       status = _bt_insertat(t, (char *) d, index);
-
-       (void) free((char *) d);
-
-       return (status);
-}
-
-/*
- *  _BT_INSERTI -- Insert IDATUM on current page in the btree.
- *
- *     This routine handles insertions to internal pages after splits
- *     lower in the tree.  On entry, t->bt_curpage is the page to get
- *     the new IDATUM.  We are also given pgno, the page number of the
- *     IDATUM that is immediately left of the new IDATUM's position.
- *     This guarantees that the IDATUM for the right half of the page
- *     after a split goes next to the IDATUM for its left half.
- *
- *     Parameters:
- *             t -- tree in which to do insertion.
- *             id -- new IDATUM to insert
- *             pgno -- page number of IDATUM left of id's position
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- */
 
 
-int
-_bt_inserti(t, id, pgno)
-       BTREE_P t;
-       IDATUM *id;
-       pgno_t pgno;
-{
-       BTHEADER *h = t->bt_curpage;
-       index_t next, i;
-       IDATUM *idx;
-       char *key;
-       pgno_t chain;
-       int free_key;
-       int ignore;
+       h = e->page;
+       index = e->index;
 
 
-       if (id->i_flags & D_BIGKEY) {
-               free_key = TRUE;
-               bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain));
-               if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR)
+       /*
+        * Add the specified key/data pair to the tree.  If an identical key
+        * is already in the tree and R_NOOVERWRITE is set, RET_SPECIAL is
+        * returned.  If R_NOOVERWRITE is not set, the key is either added (if
+        * duplicates are permitted) or the existing record is deleted and
+        * replaced by the new one.
+        *
+        * Pages are split as required.
+        */
+       switch (flags) {
+       case R_NOOVERWRITE:
+               if (!exact)
+                       break;
+               /*
+                * One special case is if the cursor references the record and
+                * it's been flagged for deletion.  If so, we delete it and
+                * pretend it was never there.  Since the cursor will move to
+                * the next record the inserted record won't be seen.
+                */
+               if (ISSET(t, BTF_DELCRSR) && t->bt_bcursor.pgno == h->pgno &&
+                   t->bt_bcursor.index == index) {
+                       UNSET(t, BTF_DELCRSR);
+                       goto delete;
+               }
+               BT_CLR(t);
+               mpool_put(t->bt_mp, h, 0);
+               return (RET_SPECIAL);
+       default:
+               if (!exact || NOTSET(t, BTF_NODUPS))
+                       break;
+delete:                if (__bt_dleaf(t, h, index) == RET_ERROR) {
+                       BT_CLR(t);
+                       mpool_put(t->bt_mp, h, 0);
                        return (RET_ERROR);
                        return (RET_ERROR);
-       } else {
-               free_key = FALSE;
-               key = &(id->i_bytes[0]);
+               }
+               break;
        }
        }
-       i = _bt_binsrch(t, key);
-
-       next = NEXTINDEX(h);
-       while (i < next && _bt_cmp(t, key, i) >= 0)
-               i++;
 
 
-       if (free_key)
-               (void) free(key);
-
-       /* okay, now we're close; find adjacent IDATUM */
-       for (;;) {
-               idx = (IDATUM *) GETDATUM(h,i);
-               if (idx->i_pgno == pgno) {
-                       i++;
-                       break;
+       /*
+        * If not enough room, or the page already holds more keys than the
+        * ceiling the user has put on keys per page, split the page.  The
+        * split code will insert the key and data and unpin the current page.
+        * Otherwise, if inserting in front of existing entries in the offset
+        * array, shift the pointers up.
+        */
+       nbytes = NBLEAFDBT(key->size, data->size);
+       if (h->upper - h->lower < nbytes + sizeof(index_t) ||
+           t->bt_maxkeypage && t->bt_maxkeypage < NEXTINDEX(h))
+               return (__bt_split(t, h, key, data, dflags, nbytes, index));
+
+       if (index < (nxtindex = NEXTINDEX(h)))
+               bcopy(h->linp + index, h->linp + index + 1,
+                   (nxtindex - index) * sizeof(index_t));
+       h->lower += sizeof(index_t);
+
+       h->linp[index] = h->upper -= nbytes;
+       dest = (char *)h + h->upper;
+       WR_BLEAF(dest, key, data, dflags);
+
+       if (t->bt_order == NOT)
+               if (h->nextpg == P_INVALID) {
+                       if (index == NEXTINDEX(h) - 1) {
+                               t->bt_order = FORWARD;
+                               t->bt_last.index = index;
+                               t->bt_last.pgno = h->pgno;
+                       }
+               } else if (h->prevpg == P_INVALID) {
+                       if (index == 0) {
+                               t->bt_order = BACK;
+                               t->bt_last.index = 0;
+                               t->bt_last.pgno = h->pgno;
+                       }
                }
                }
-               --i;
-       }
 
 
-       /* correctly positioned, do the insertion */
-       return (_bt_insertat(t, (char *) id, i));
+       BT_CLR(t);
+       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+       SET(t, BTF_MODIFIED);
+       return (RET_SUCCESS);
 }
 
 }
 
+#ifdef STATISTICS
+u_long bt_cache_hit, bt_cache_miss;
+#endif
+
 /*
 /*
- *  _BT_INSERTAT -- Insert a datum at a given location on the current page.
- *
- *     This routine does insertions on both leaf and internal pages.
- *
- *     Parameters:
- *             t -- tree in which to do insertion.
- *             p -- DATUM or IDATUM to insert.
- *             index -- index in line pointer array to put this item.
+ * BT_FAST -- Do a quick check for sorted data.
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
+ * Parameters:
+ *     t:      tree
+ *     key:    key to insert
+ *     data:   data to insert
+ *     exactp: address to store the exact match flag
  *
  *
- *     Side Effects:
- *             Will rearrange line pointers to make space for the new
- *             entry.  This means that any scans currently active are
- *             invalid after this.
- *
- *     Warnings:
- *             There must be sufficient room for the new item on the page.
+ * Returns:
+ *     EPG for new record or NULL if not found.
  */
  */
-
-int
-_bt_insertat(t, p, index)
-       BTREE_P t;
-       char *p;
-       index_t index;
+static EPG *
+bt_fast(t, key, data, exactp)
+       BTREE *t;
+       const DBT *key, *data;
+       int *exactp;
 {
 {
-       IDATUM *id = (IDATUM *) p;
-       DATUM *d = (DATUM *) p;
-       BTHEADER *h;
-       CURSOR *c;
-       index_t nxtindex;
-       char *src, *dest;
-       int nbytes;
-
-       /* insertion may confuse an active scan.  fix it. */
-       c = &(t->bt_cursor);
-       if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
-               if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR)
-                       return (RET_ERROR);
-
-       h = t->bt_curpage;
-       nxtindex = (index_t) NEXTINDEX(h);
+       EPG e;
+       PAGE *h;
+       size_t nbytes;
+       int cmp;
+
+       if ((h = mpool_get(t->bt_mp, t->bt_last.pgno, 0)) == NULL) {
+               t->bt_order = NOT;
+               return (NULL);
+       }
+       e.page = h;
+       e.index = t->bt_last.index;
 
        /*
 
        /*
-        *  If we're inserting at the middle of the line pointer array,
-        *  copy pointers that will follow the new one up on the page.
+        * If the record won't fit in this page, or the page has too many
+        * keys, we have to do a search to get the split stack.
         */
         */
-
-       if (index < nxtindex) {
-               src = (char *) &(h->h_linp[index]);
-               dest = (char *) &(h->h_linp[index + 1]);
-               nbytes = (h->h_lower - (src - ((char *) h)))
-                        + sizeof(h->h_linp[0]);
-               (void) bcopy(src, dest, nbytes);
-       }
-
-       /* compute size and copy data to page */
-       if (h->h_flags & F_LEAF) {
-               nbytes = d->d_ksize + d->d_dsize
-                        + (sizeof(DATUM) - sizeof(char));
+       nbytes =
+           NBLEAFDBT(key->size >= t->bt_minkeypage ? NOVFLSIZE : key->size,
+           data->size >= t->bt_minkeypage ? NOVFLSIZE : data->size);
+       if (h->upper - h->lower < nbytes + sizeof(index_t) ||
+           t->bt_maxkeypage && t->bt_maxkeypage < NEXTINDEX(h))
+               goto miss;
+
+       if (t->bt_order == FORWARD) {
+               if (e.page->nextpg != P_INVALID)
+                       goto miss;
+               if (e.index != NEXTINDEX(h) - 1)
+                       goto miss;
+               if ((cmp = __bt_cmp(t, key, &e)) < 0)
+                       goto miss;
+               t->bt_last.index = ++e.index;
        } else {
        } else {
-               nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char));
+               if (e.page->prevpg != P_INVALID)
+                       goto miss;
+               if (e.index != 0)
+                       goto miss;
+               if ((cmp = __bt_cmp(t, key, &e)) > 0)
+                       goto miss;
+               t->bt_last.index = 0;
        }
        }
-       dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes);
-       (void) bcopy((char *) p, dest, nbytes);
-
-       /* update statistics */
-       dest -= (int) h;
-       h->h_linp[index] = (index_t) dest;
-       h->h_upper = (index_t) dest;
-       h->h_lower += sizeof(index_t);
-
-       /* we're done */
-       h->h_flags |= F_DIRTY;
-
-       return (RET_SUCCESS);
+       *exactp = cmp == 0;
+#ifdef STATISTICS
+       ++bt_cache_hit;
+#endif
+       return (&e);
+
+miss:  
+#ifdef STATISTICS
+       ++bt_cache_miss;
+#endif
+       t->bt_order = NOT;
+       mpool_put(t->bt_mp, h, 0);
+       return (NULL);
 }
 }
index 5effbd6..575cb18 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_search.c        5.2 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_search.c        5.3 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #include <db.h>
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #include <db.h>
+#include <stdio.h>
 #include "btree.h"
 
 /*
 #include "btree.h"
 
 /*
- *  _BT_FIRST -- Find the first item in the tree that matches the supplied
- *              key.
+ * __BT_SEARCH -- Search a btree for a key.
  *
  *
- *     This routine supports deletion.  When the user supplies a key to
- *     be deleted, we find the first one, and iteratively delete all the
- *     matching ones that follow it.
+ * Parameters:
+ *     t:      tree to search
+ *     key:    key to find
+ *     exactp: pointer to exact match flag
  *
  *
- *     Parameters:
- *             t -- btree in which to find first occurrence
- *             key -- key to find
+ * Returns:
+ *     EPG for matching record, if any, or the EPG for the location of the
+ *     key, if it were inserted into the tree.
  *
  *
- *     Returns:
- *             The BTITEM for the matching item.  If there's no match,
- *             this may point to the first item > than the supplied key,
- *             or off the end of the page.
- *
- *     Warnings:
- *             The BTITEM returned is in static space and will be overwritten
- *             by the next search of any kind in any btree.
- */
-
-BTITEM *
-_bt_first(t, key)
-       BTREE_P t;
-       DBT *key;
-{
-       BTHEADER *h;
-       BTITEM *item;
-       index_t next;
-       int r;
-
-       /* find any matching item */
-       item = _bt_search(t, key);
-       h = t->bt_curpage;
-       next = NEXTINDEX(h);
-
-       /* if we're off the end of the page, search failed and we're done */
-       if (item->bti_index >= next)
-               return (item);
-
-       /* as long as we have an exact match, walk backwards */
-       while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) {
-
-               /* at start of page? */
-               if (item->bti_index == 0) {
-
-                       /* if no prev page, we're done */
-                       if (h->h_prevpg == P_NONE)
-                               return (item);
-
-                       /* walk backward, skipping empty pages */
-                       do {
-                               if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
-                                       return ((BTITEM *) NULL);
-                               h = t->bt_curpage;
-                       } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
-
-                       if (NEXTINDEX(h) != 0)
-                               item->bti_index = NEXTINDEX(h) - 1;
-                       else
-                               item->bti_index = 0;
-
-                       item->bti_pgno = h->h_pgno;
-               } else {
-                       item->bti_index--;
-               }
-       }
-
-       /* if we went too far backwards, step forward one entry */
-       if (r > 0) {
-               if (++(item->bti_index) >= NEXTINDEX(h)
-                   && h->h_nextpg != P_NONE) {
-
-                       /* walk forward, skipping empty pages */
-                       do {
-                               if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
-                                       return ((BTITEM *) NULL);
-                               h = t->bt_curpage;
-                       } while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0);
-
-                       item->bti_index = 0;
-                       item->bti_pgno = h->h_pgno;
-               }
-       }
-
-       /* got it */
-       return (item);
-}
-
-/*
- *  _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree.
- *
- *     Parameters:
- *             t -- btree in which to search
- *             key -- key to find
- *
- *     Returns:
- *             BTITEM for matching item, if any, or the BTITEM for the
- *             location of the key, if it were in the tree.
- *
- *     Warnings:
- *             The BTITEM returned is in static memory, and will be
- *             overwritten by the next search of any kind in any tree.
+ * Warnings:
+ *     The EPG returned is in static memory, and will be overwritten by the
+ *     next search of any kind in any tree.
  */
  */
-
-BTITEM *
-_bt_search(t, key)
-       BTREE_P t;
-       DBT *key;
-{
-       /* we want to start all of our searches at the root */
-       if (_bt_getpage(t, (pgno_t) P_ROOT) == RET_ERROR)
-               return ((BTITEM *) NULL);
-
-       return (_bt_searchr(t, key));
-}
-
-BTITEM *
-_bt_searchr(t, key)
-       BTREE_P t;
-       DBT *key;
-{
-       BTHEADER *h = t->bt_curpage;
-       index_t index;
-       IDATUM *id;
-       DATUM *d;
-       static BTITEM item;
-
-       /* do a binary search on the current page */
-       index = _bt_binsrch(t, key->data);
-
-       /*
-        *  At this point, the binary search terminated because the endpoints
-        *  got too close together, or we have a match.  Figure out which
-        *  case applies and decide what to do based on the page type.
-        */
-       if (h->h_flags & F_LEAF) {
-               item.bti_pgno = h->h_pgno;
-               item.bti_index = index;
-               if (index < NEXTINDEX(h))
-                       d = (DATUM *) GETDATUM(h,index);
-               else
-                       d = (DATUM *) NULL;
-
-               item.bti_datum = d;
-               return(&item);
-       } else {
-               id = (IDATUM *) GETDATUM(h, index);
-               if (_bt_push(t, h->h_pgno) == RET_ERROR)
-                       return ((BTITEM *) NULL);
-               if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
-                       return ((BTITEM *) NULL);
-               return (_bt_searchr(t, key));
-       }
-}
-
-/*
- *  _BT_BINSRCH -- Do a binary search for a given key on the current page.
- *
- *     Searches on internal pages are handled slightly differently from
- *     searches on leaf pages.  This is because internal page searches
- *     find the largest item <= key in the tree, and leaf searches find
- *     the smallest item >= key.  This guarantees that leaf page searches
- *     leave us pointing at the item's correct position, and internal
- *     searches descend the tree correctly.
- *
- *     Parameters:
- *             t -- tree to search
- *             key -- key we're looking for
- *
- *     Returns:
- *             Index of the line pointer array entry for the (closest)
- *             match to key on the current page, with "closest" as defined
- *             above.
- */
-
-index_t
-_bt_binsrch(t, key)
-       BTREE_P t;
-       char *key;
+EPG *
+__bt_search(t, key, exactp)
+       BTREE *t;
+       const DBT *key;
+       int *exactp;
 {
 {
-       index_t lbound, ubound, cur;
-       BTHEADER *h = t->bt_curpage;
-       int match = 0;
-       int r;
-
-       lbound = 0;
-       ubound = NEXTINDEX(h);
-       if (ubound > 0)
-               --ubound;
-
-       /* do a binary search on the current page */
-       while ((ubound - lbound) > 1) {
-               cur = lbound + ((ubound - lbound) / 2);
-               r = _bt_cmp(t, key, cur);
-
-               if (r > 0)
-                       lbound = cur + 1;
-               else if (r < 0)
-                       ubound = cur;
-               else {
-                       match++;
-                       break;
-               }
-       }
-
-       /*
-        *  At this point, the binary search terminated because the endpoints
-        *  got too close together, or we have a match.  Figure out which
-        *  case applies, decide what to do based on the page type (leaf or
-        *  internal), and do the right thing.
-        */
-       if (match) {
-               return (cur);
-       } else if (ubound != lbound) {
-               if (h->h_flags & F_LEAF) {
-                       r = _bt_cmp(t, key, lbound);
-                       if (r <= 0) {
-                               return (lbound);
+       register index_t index;
+       register int base, cmp, lim;
+       register PAGE *h;
+       pgno_t pg;
+       static EPG e;
+
+       for (pg = P_ROOT;;) {
+               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                       return (NULL);
+
+               /* Do a binary search on the current page. */
+               e.page = h;
+               for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) {
+                       e.index = index = base + (lim >> 1);
+                       if ((cmp = __bt_cmp(t, key, &e)) == 0) {
+                               if (h->flags & P_BLEAF) {
+                                       *exactp = 1;
+                                       return (&e);
+                               }
+                               goto next;
                        }
                        }
-               } else {
-                       r = _bt_cmp(t, key, ubound);
-
-                       /* for internal nodes, move as far left as possible */
-                       if (r < 0) {
-                               r = _bt_cmp(t, key, lbound);
-                               if (r < 0 && lbound > 0)
-                                       --lbound;
-                               return (lbound);
-                       } else {
-                               return (ubound);
+                       if (cmp > 0) {
+                               base = index + 1;
+                               --lim;
                        }
                }
                        }
                }
-       }
 
 
-       if (h->h_flags & F_LEAF) {
-               if (ubound < NEXTINDEX(h)) {
-                       r = _bt_cmp(t, key, ubound);
-                       if (r > 0)
-                               ubound++;
+               /*
+                * No match found.  Base is the smallest index greater than
+                * key but may be an illegal index.  Use base if it's a leaf
+                * page, decrement it by one if it's an internal page.  This
+                * is safe because internal pages can't be empty.
+                */
+               index = h->flags & P_BLEAF ? base : base - 1;
+
+               /* If it's a leaf page, we're done. */
+               if (h->flags & P_BLEAF) {
+                       e.index = index;
+                       *exactp = 0;
+                       return (&e);
                }
                }
-       } else {
-               /* for internal pages, move as far left as possible */
-               if (ubound == NEXTINDEX(h))
-                       ubound--;
 
 
-               while (_bt_cmp(t, key, ubound) < 0)
-                       ubound--;
+next:          if (bt_push(t, h->pgno, index) == RET_ERROR)
+                       return (NULL);
+               pg = GETBINTERNAL(h, index)->pgno;
+               mpool_put(t->bt_mp, h, 0);
        }
        }
-       return (ubound);
 }
 }
index 58b3816..a6bb759 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_seq.c   5.4 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_seq.c   5.5 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #include <errno.h>
 #include <db.h>
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #include <errno.h>
 #include <db.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <stdlib.h>
+#include <stddef.h>
 #include "btree.h"
 
 #include "btree.h"
 
+static int      bt_seqadv __P((BTREE *, EPG *, int));
+static int      bt_seqset __P((BTREE *, EPG *, DBT *, int));
+
 /*
 /*
- *  _BT_SEQINIT -- Initialize a sequential scan on the btree.
+ * Sequential scan support.
  *
  *
- *     Sets the tree's notion of the current scan location correctly
- *     given a key and a direction.
+ * The tree can be scanned sequentially, starting from either end of the tree
+ * or from any specific key.  A scan request before any scanning is done is
+ * initialized as starting from the least node.
  *
  *
- *     Parameters:
- *             t -- tree in which to initialize scan
- *             key -- key for initial scan position
- *             flags -- R_NEXT, R_PREV
+ * Each tree has an EPGNO which has the current position of the cursor.  The
+ * cursor has to survive deletions/insertions in the tree without losing its
+ * position.  This is done by noting deletions without doing them, and then
+ * doing them when the cursor moves (or the tree is closed).
+ */
+
+/*
+ * __BT_SEQ -- Btree sequential scan interface.
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data
- *             in the tree to scan.
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key for positioning and return value
+ *     data:   data return value
+ *     flags:  R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
  *
  *
- *     Side Effects:
- *             Changes current scan position for the tree.  Almost certainly
- *             changes current page, as well.  Sets BTF_SEQINIT bit in tree
- *             flags, so that we know we've initialized a scan.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
  */
  */
-
 int
 int
-_bt_seqinit(t, key, flags)
-       BTREE_P t;
-       DBT *key;
-       int flags;
+__bt_seq(dbp, key, data, flags)
+       const DB *dbp;
+       DBT *key, *data;
+       u_int flags;
 {
 {
-       BTITEM *item;
-       BTHEADER *h;
-       CURSOR *c;
-       IDATUM *id;
-       index_t last;
+       BTREE *t;
+       EPG e;
+       int status;
 
        /*
 
        /*
-        *  Figure out if we really have to search for the key that the
-        *  user supplied.  If key is null, then this is an unkeyed scan
-        *  and we can just start from an endpoint.
+        * If scan uninitialized as yet, or starting at a specific record, set
+        * the scan to a specific key.  Both bt_seqset and bt_seqadv pin the
+        * page the cursor references if they're successful.
         */
         */
+       t = dbp->internal;
+       switch(flags) {
+       case R_NEXT:
+               if (ISSET(t, BTF_SEQINIT)) {
+                       status = bt_seqadv(t, &e, flags);
+                       break;
+               }
+               /* FALLTHROUGH */
+       case R_CURSOR:
+       case R_FIRST:
+               status = bt_seqset(t, &e, key, flags);
+               SET(t, BTF_SEQINIT);
+               break;
+       case R_PREV:
+               if (ISSET(t, BTF_SEQINIT)) {
+                       status = bt_seqadv(t, &e, flags);
+                       break;
+               }
+               /* FALLTHROUGH */
+       case R_LAST:
+               status = bt_seqset(t, &e, key, flags);
+               SET(t, BTF_SEQINIT);
+               break;
+       default:
+               errno = EINVAL;
+               return (RET_ERROR);
+       }
 
 
-       c = &(t->bt_cursor);
-
-       if (flags == R_CURSOR) {
-               if (key->data != (u_char *) NULL) {
+       if (status == RET_SUCCESS) {
+               status = __bt_ret(t, &e, key, data);
 
 
-                       /* key supplied, find first instance of it */
-                       item = _bt_first(t, key);
-                       c->c_index = item->bti_index;
-                       c->c_pgno = t->bt_curpage->h_pgno;
-               } else {
-                       errno = EINVAL;
-                       return (RET_ERROR);
+               /* Update the actual cursor. */
+               if (status == RET_SUCCESS) {
+                       t->bt_bcursor.pgno = e.page->pgno;
+                       t->bt_bcursor.index = e.index;
                }
                }
+               mpool_put(t->bt_mp, e.page, 0);
+       }
+       return (status);
+}
 
 
-       } else {
+/*
+ * BT_SEQSET -- Set the sequential scan to a specific key.
+ *
+ * Parameters:
+ *     t:      tree
+ *     ep:     storage for returned key
+ *     key:    key for initial scan position
+ *     flags:  R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
+ *
+ * Side effects:
+ *     Pins the page the cursor references.
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
+ */
+static int
+bt_seqset(t, ep, key, flags)
+       BTREE *t;
+       EPG *ep;
+       DBT *key;
+       int flags;
+{
+       EPG *e;
+       PAGE *h;
+       pgno_t pg;
+       int exact;
 
 
-               /*
-                *  Unkeyed scan.  For backward scans, find the last item
-                *  in the tree; for forward scans, find the first item.
-                */
+       /*
+        * Delete any already deleted record that we've been saving because
+        * the cursor pointed to it.  Since we're going to a specific key, we
+        * should delete any logically deleted records so they aren't found.
+        */
+       if (ISSET(t, BTF_DELCRSR) && __bt_crsrdel(t, &t->bt_bcursor))
+               return (RET_ERROR);
 
 
-               if (_bt_getpage(t, (pgno_t) P_ROOT) == RET_ERROR)
+       /*
+        * If R_CURSOR set, find the first instance of the key in the tree and
+        * point the cursor at it.  Otherwise, find the first or the last record
+        * in the tree and point the cursor at it.  The cursor may not be moved
+        * until a new key has been found.
+        */
+       switch(flags) {
+       case R_CURSOR:                          /* Keyed scan. */
+               if (key->data == NULL || key->size == 0) {
+                       errno = EINVAL;
                        return (RET_ERROR);
                        return (RET_ERROR);
-               h = t->bt_curpage;
-               if (flags == R_LAST || flags == R_PREV) {
-
-                       /* backward scan */
-                       while (!(h->h_flags & F_LEAF)) {
-                               last = NEXTINDEX(h) - 1;
-                               id = (IDATUM *) GETDATUM(h,last);
-                               if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
-                                       return (RET_ERROR);
-                               h = t->bt_curpage;
-                       }
-
-                       /* skip empty pages */
-                       while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) {
-                               if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
-                                       return (RET_ERROR);
-                               h = t->bt_curpage;
-                       }
-
-                       c->c_pgno = h->h_pgno;
-                       if (NEXTINDEX(h) > 0)
-                               c->c_index = NEXTINDEX(h) - 1;
-                       else
-                               c->c_index = 0;
-               } else if (flags == R_FIRST || flags == R_NEXT) {
-                       /* forward scan */
-                       while (!(h->h_flags & F_LEAF)) {
-                               id = (IDATUM *) GETDATUM(h,0);
-                               if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
-                                       return (RET_ERROR);
-                               h = t->bt_curpage;
-                       }
+               }
+               e = __bt_first(t, key, &exact); /* Returns pinned page. */
+               if (e == NULL)
+                       return (RET_ERROR);
+               if (!exact) {
+                       mpool_put(t->bt_mp, e->page, 0);
+                       return (RET_SPECIAL);
+               }
+               *ep = *e;
+               break;
+       case R_FIRST:                           /* First record. */
+       case R_NEXT:
+               /* Walk down the left-hand side of the tree. */
+               for (pg = P_ROOT;;) {
+                       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                               return (RET_ERROR);
+                       if (h->flags & (P_BLEAF|P_RLEAF))
+                               break;
+                       pg = GETBINTERNAL(h, 0)->pgno;
+                       mpool_put(t->bt_mp, h, 0);
+               }
 
 
-                       /* skip empty pages */
-                       while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) {
-                               if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
-                                       return (RET_ERROR);
-                               h = t->bt_curpage;
-                       }
+               /* Skip any empty pages. */
+               while (NEXTINDEX(h) == 0 && h->nextpg != P_INVALID) {
+                       pg = h->nextpg;
+                       mpool_put(t->bt_mp, h, 0);
+                       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                               return (RET_ERROR);
+               }
 
 
-                       c->c_pgno = h->h_pgno;
-                       c->c_index = 0;
-               } else {
-                       /* no flags passed in */
-                       errno = EINVAL;
-                       return (RET_ERROR);
+               if (NEXTINDEX(h) == 0) {
+                       mpool_put(t->bt_mp, h, 0);
+                       return (RET_SPECIAL);
                }
                }
-       }
 
 
-       /* okay, scan is initialized */
-       t->bt_flags |= BTF_SEQINIT;
+               ep->page = h;
+               ep->index = 0;
+               break;
+       case R_LAST:                            /* Last record. */
+       case R_PREV:
+               /* Walk down the right-hand side of the tree. */
+               for (pg = P_ROOT;;) {
+                       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                               return (RET_ERROR);
+                       if (h->flags & (P_BLEAF|P_RLEAF))
+                               break;
+                       pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
+                       mpool_put(t->bt_mp, h, 0);
+               }
 
 
-       /* don't need the descent stack anymore */
-       while (_bt_pop(t) != P_NONE)
-               continue;
+               /* Skip any empty pages. */
+               while (NEXTINDEX(h) == 0 && h->prevpg != P_INVALID) {
+                       pg = h->prevpg;
+                       mpool_put(t->bt_mp, h, 0);
+                       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                               return (RET_ERROR);
+               }
 
 
-       if (c->c_index == NEXTINDEX(t->bt_curpage))
-               return (RET_SPECIAL);
+               if (NEXTINDEX(h) == 0) {
+                       mpool_put(t->bt_mp, h, 0);
+                       return (RET_SPECIAL);
+               }
 
 
+               ep->page = h;
+               ep->index = NEXTINDEX(h) - 1;
+               break;
+       }
        return (RET_SUCCESS);
 }
 
 /*
        return (RET_SUCCESS);
 }
 
 /*
- *  _BT_SEQADVANCE -- Advance the sequential scan on this tree.
- *
- *     Moves the current location pointer for the scan on this tree one
- *     spot in the requested direction.
+ * BT_SEQADV -- Advance the sequential scan.
  *
  *
- *     Parameters:
- *             t -- btree being scanned
- *             flags -- for R_NEXT, R_PREV
+ * Parameters:
+ *     t:      tree
+ *     e:      storage for the page/index of the new record
+ *     flags:  R_NEXT, R_PREV
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no
- *             more data in the specified direction.
+ * Side effects:
+ *     Pins the page the new key/data record is on.
  *
  *
- *     Side Effects:
- *             May change current page.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
  */
  */
-
-int
-_bt_seqadvance(t, flags)
-       BTREE_P t;
+static int
+bt_seqadv(t, e, flags)
+       BTREE *t;
+       EPG *e;
        int flags;
 {
        int flags;
 {
-       BTHEADER *h;
-       CURSOR *c;
+       EPGNO *c, delc;
+       PAGE *h;
        index_t index;
        index_t index;
+       pgno_t pg;
 
 
-       c = &(t->bt_cursor);
-       index = c->c_index;
+       /* Save the current cursor if going to delete it. */
+       c = &t->bt_bcursor;
+       if (ISSET(t, BTF_DELCRSR))
+               delc = *c;
 
 
-       if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
+       if ((h = mpool_get(t->bt_mp, c->pgno, 0)) == NULL)
                return (RET_ERROR);
                return (RET_ERROR);
-       h = t->bt_curpage;
-
-       /* by the time we get here, don't need the cursor key anymore */
-       if (c->c_key != (char *) NULL)
-               (void) free(c->c_key);
-
-       if (flags == R_NEXT) {
-
-               /*
-                *  This is a forward scan.  If the cursor is pointing
-                *  at a virtual record (that is, it was pointing at
-                *  a record that got deleted), then we should return
-                *  the record it's pointing at now.  Otherwise, we
-                *  should advance the scan.  In either case, we need
-                *  to be careful not to run off the end of the current
-                *  page.
-                */
-
-               if (c->c_flags & CRSR_BEFORE) {
-
-                       if (index >= NEXTINDEX(h)) {
-                               /* out of items on this page, get next page */
-                               if (h->h_nextpg == P_NONE) {
-                                       /* tell caller we're done... */
-                                       c->c_index = NEXTINDEX(h);
-                                       return (RET_SPECIAL);
-                               }
-
-                               /* skip empty pages */
-                               do {
-                                       if (_bt_getpage(t, h->h_nextpg)
-                                           == RET_ERROR) {
-                                               c->c_index = NEXTINDEX(h);
-                                               return (RET_ERROR);
-                                       }
-                                       h = t->bt_curpage;
-                                       c->c_pgno = h->h_pgno;
-                               } while (NEXTINDEX(h) == 0
-                                        && h->h_nextpg != P_NONE);
 
 
-                               if (NEXTINDEX(h) == 0) {
-                                       /* tell caller we're done */
-                                       c->c_index = NEXTINDEX(h);
-                                       return (RET_SPECIAL);
-                               }
-                               index = 0;
-                       }
-                       c->c_flags &= ~CRSR_BEFORE;
-
-               } else if (++index >= NEXTINDEX(h)) {
-
-                       /* out of items on this page, get next page */
-                       if (h->h_nextpg == P_NONE) {
-                               /* tell caller we're done... */
-                               c->c_index = NEXTINDEX(h);
-                               return (RET_SPECIAL);
-                       }
-
-                       /* skip empty pages */
+       /*
+        * Find the next/previous record in the tree and point the cursor at it.
+        * The cursor may not be moved until a new key has been found.
+        */
+       index = c->index;
+       switch(flags) {
+       case R_NEXT:                    /* Next record. */
+               if (++index == NEXTINDEX(h)) {
                        do {
                        do {
-                               if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) {
-                                       c->c_index = NEXTINDEX(h);
+                               pg = h->nextpg;
+                               mpool_put(t->bt_mp, h, 0);
+                               if (pg == P_INVALID)
+                                       return (RET_SPECIAL);
+                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
                                        return (RET_ERROR);
                                        return (RET_ERROR);
-                               }
-                               h = t->bt_curpage;
-                               c->c_pgno = h->h_pgno;
-                       } while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
-
-                       if (NEXTINDEX(h) == 0) {
-                               /* tell caller we're done */
-                               c->c_index = NEXTINDEX(h);
-                               return (RET_SPECIAL);
-                       }
+                       } while (NEXTINDEX(h) == 0);
                        index = 0;
                }
                        index = 0;
                }
-       } else if (flags == R_PREV) {
-
-               /* for backward scans, life is substantially easier */
-               c->c_flags &= ~CRSR_BEFORE;
-               if (c->c_key != (char *) NULL) {
-                       (void) free(c->c_key);
-                       c->c_key = (char *) NULL;
-               }
-
-               if (index == 0) {
-
-                       /* we may be done */
-                       c->c_index = 0;
-
-                       /* out of items on this page, get next page */
-                       if (h->h_prevpg == P_NONE)
-                               return (RET_SPECIAL);
-
-                       /* skip empty pages */
+               break;
+       case R_PREV:                    /* Previous record. */
+               if (index-- == 0) {
                        do {
                        do {
-                               if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
+                               pg = h->prevpg;
+                               mpool_put(t->bt_mp, h, 0);
+                               if (pg == P_INVALID)
+                                       return (RET_SPECIAL);
+                               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
                                        return (RET_ERROR);
                                        return (RET_ERROR);
-                               h = t->bt_curpage;
-                               c->c_pgno = h->h_pgno;
-                       } while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
-
-                       if (NEXTINDEX(h) == 0)
-                               return (RET_SPECIAL);
-
+                       } while (NEXTINDEX(h) == 0);
                        index = NEXTINDEX(h) - 1;
                        index = NEXTINDEX(h) - 1;
-               } else
-                       --index;
-       } else {
-               /* must specify a direction */
-               errno = EINVAL;
-               return (RET_ERROR);
+               }
+               break;
        }
 
        }
 
-       c->c_index = index;
+       e->page = h;
+       e->index = index;
+
+       /*
+        * Delete any already deleted record that we've been saving because the
+        * cursor pointed to it.  This could cause the new index to be shifted
+        * down by one if the record we're deleting is on the same page and has
+        * a larger index.
+        */
+       if (ISSET(t, BTF_DELCRSR)) {
+               UNSET(t, BTF_DELCRSR);                  /* Don't try twice. */
+               if (c->pgno == delc.pgno && c->index > delc.index)
+                       --c->index;
+               if (__bt_crsrdel(t, &delc))
+                       return (RET_ERROR);
+       }
        return (RET_SUCCESS);
 }
        return (RET_SUCCESS);
 }
+
+/*
+ * __BT_CRSRDEL -- Delete the record referenced by the cursor.
+ *
+ * Parameters:
+ *     t:      tree
+ *     c:      page number and index of the record to delete
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
+ */
+int
+__bt_crsrdel(t, c)
+       BTREE *t;
+       EPGNO *c;
+{
+       PAGE *h;
+       int status;
+
+       UNSET(t, BTF_DELCRSR);                  /* Don't try twice. */
+       if ((h = mpool_get(t->bt_mp, c->pgno, 0)) == NULL)
+               return (RET_ERROR);
+       status = __bt_dleaf(t, h, c->index);
+       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+       return (status);
+}
index 4f2fe33..b864a9b 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_split.c 5.2 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_split.c 5.3 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
 #endif /* LIBC_SCCS and not lint */
 
 #include <sys/types.h>
+#define        __DBINTERFACE_PRIVATE
 #include <db.h>
 #include <db.h>
+#include <limits.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
+static int      bt_preserve __P((BTREE *, pgno_t));
+static PAGE    *bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *));
+static PAGE    *bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
+static PAGE    *bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
+static int      bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
+static int      bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
+static recno_t  rec_total __P((PAGE *));
+
+#ifdef STATISTICS
+u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
+#endif
+
 /*
 /*
- *  _BT_SPLIT -- Split a page into two pages.
- *
- *     Splits are caused by insertions, and propogate up the tree in
- *     the usual way.  The root page is always page 1 in the file on
- *     disk, so root splits are handled specially.  On entry to this
- *     routine, t->bt_curpage is the page to be split.
+ * __BT_SPLIT -- Split the tree.
  *
  *
- *     Parameters:
- *             t -- btree in which to do split.
+ * Parameters:
+ *     t:      tree
+ *     h:      page to split
+ *     key:    key to insert
+ *     data:   data to insert
+ *     flags:  BIGKEY/BIGDATA flags
+ *     nbytes: length of insertion
+ *     skip:   index to leave open
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             Changes the notion of the current page.
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
  */
  */
-
 int
 int
-_bt_split(t)
-       BTREE_P t;
+__bt_split(t, h, key, data, flags, nbytes, skip)
+       BTREE *t;
+       PAGE *h;
+       const DBT *key, *data;
+       u_long flags;
+       size_t nbytes;
+       int skip;
 {
 {
-       BTHEADER *h;
-       BTHEADER *left, *right;
-       pgno_t nextpgno, parent;
-       int nbytes, len;
-       IDATUM *id;
-       DATUM *d;
-       char *src;
-       IDATUM *new;
-       pgno_t oldchain;
-       u_char flags;
-
-       h = (BTHEADER *) t->bt_curpage;
-
-       /* split root page specially, since it must remain page 1 */
-       if (h->h_pgno == P_ROOT) {
-               return (_bt_splitroot(t));
-       }
+       BINTERNAL *bi;
+       BLEAF *bl;
+       DBT a, b;
+       EPGNO *parent;
+       PAGE *l, *r, *lchild, *rchild;
+       index_t nxtindex;
+       size_t nksize;
+       int nosplit;
+       char *dest;
 
        /*
 
        /*
-        *  This is a little complicated.  We go to some trouble to
-        *  figure out which of the three possible cases -- in-memory tree,
-        *  disk tree (no cache), and disk tree (cache) -- we have, in order
-        *  to avoid unnecessary copying.  If we have a disk cache, then we
-        *  have to do some extra copying, though, since the cache code
-        *  manages buffers externally to this code.
+        * Split the page into two pages, l and r.  The split routines return
+        * a pointer to the page into which the key should be inserted and skip
+        * set to the offset which should be used.  Additionally, l and r are
+        * pinned.
         */
         */
-
-       if (ISDISK(t) && ISCACHE(t)) {
-               if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
-                   == (BTHEADER *) NULL)
-                       return (RET_ERROR);
-               left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
-               left->h_flags = t->bt_curpage->h_flags;
-               left->h_lower = (index_t)
-                         (((char *) &(left->h_linp[0])) - ((char *) left));
-               left->h_upper = t->bt_psize;
-
-       } else {
-               if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
-                       return (RET_ERROR);
-       }
-       left->h_pgno = h->h_pgno;
-
-       if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
-               return (RET_ERROR);
-       right->h_pgno = ++(t->bt_npages);
-
-       /* now do the split */
-       if (_bt_dopsplit(t, left, right) == RET_ERROR)
+       h = h->pgno == P_ROOT ?
+           bt_root(t, h, &l, &r, &skip) : bt_page(t, h, &l, &r, &skip);
+       if (h == NULL)
                return (RET_ERROR);
 
                return (RET_ERROR);
 
-       right->h_prevpg = left->h_pgno;
-       nextpgno = right->h_nextpg = h->h_nextpg;
-       left->h_nextpg = right->h_pgno;
-       left->h_prevpg = h->h_prevpg;
-
-       /* okay, now use the left half of the page as the new page */
-       if (ISDISK(t) && ISCACHE(t)) {
-               (void) bcopy((char *) left, (char *) t->bt_curpage,
-                            (int) t->bt_psize);
-               (void) free ((char *) left);
-               left = t->bt_curpage;
+       /*
+        * Grab the space and insert the [rb]leaf structure.  Always a [rb]leaf
+        * structure since key inserts always cause a leaf page to split first.
+        */
+       h->linp[skip] = h->upper -= nbytes;
+       dest = (char *)h + h->upper;
+       if (ISSET(t, BTF_RECNO)) {
+               WR_RLEAF(dest, data, flags)
+               ++t->bt_nrecs;
+               SET(t, BTF_METADIRTY | BTF_MODIFIED);
        } else {
        } else {
-               (void) free((char *) t->bt_curpage);
-               t->bt_curpage = left;
+               WR_BLEAF(dest, key, data, flags)
+               SET(t, BTF_MODIFIED);
        }
 
        /*
        }
 
        /*
-        *  Write the new pages out.  We need them to stay where they are
-        *  until we're done updating the parent pages.
+        * Now we walk the parent page stack -- a LIFO stack of the pages that
+        * were traversed when we searched for the page that split.  Each stack
+        * entry is a page number and a page index offset.  The offset is for
+        * the page traversed on the search.  We've just split a page, so we
+        * have to insert a new key into the parent page.
+        *
+        * If the insert into the parent page causes it to split, may have to
+        * continue splitting all the way up the tree.  We stop if the root
+        * splits or the page inserted into didn't have to split to hold the
+        * new key.  Some algorithms replace the key for the old page as well
+        * as the new page.  We don't, as there's no reason to believe that the
+        * first key on the old page is any better than the key we have, and,
+        * in the case of a key being placed at index 0 causing the split, the
+        * key is unavailable.
+        *
+        * There are a maximum of 5 pages pinned at any time.  We keep the left
+        * and right pages pinned while working on the parent.   The 5 are the
+        * two children, left parent and right parent (when the parent splits)
+        * and the root page or the overflow key page when calling bt_preserve.
+        * This code must make sure that all pins are released other than the
+        * root page or overflow page which is unlocked elsewhere.
         */
         */
+       for (nosplit = 0; (parent = BT_POP(t)) != NULL;) {
+               lchild = l;
+               rchild = r;
 
 
-       if (_bt_write(t, left, NORELEASE) == RET_ERROR)
-               return (RET_ERROR);
-       if (_bt_write(t, right, NORELEASE) == RET_ERROR)
-               return (RET_ERROR);
+               /* Get the parent page. */
+               if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+                       goto err2;
 
 
-       /* update 'prev' pointer of old neighbor of left */
-       if (nextpgno != P_NONE) {
-               if (_bt_getpage(t, nextpgno) == RET_ERROR)
-                       return (RET_ERROR);
-               h = t->bt_curpage;
-               h->h_prevpg = right->h_pgno;
-               h->h_flags |= F_DIRTY;
-       }
+               /* The new key goes ONE AFTER the index. */
+               skip = parent->index + 1;
 
 
-       if ((parent = _bt_pop(t)) != P_NONE) {
-               if (right->h_flags & F_LEAF) {
-                       d = (DATUM *) GETDATUM(right, 0);
-                       len = d->d_ksize;
-                       if (d->d_flags & D_BIGKEY) {
-                               bcopy(&(d->d_bytes[0]),
-                                     (char *) &oldchain,
-                                     sizeof(oldchain));
-                               if (_bt_markchain(t, oldchain) == RET_ERROR)
-                                       return (RET_ERROR);
-                               src = (char *) &oldchain;
-                               flags = D_BIGKEY;
-                       } else {
-                               src = (char *) &(d->d_bytes[0]);
-                               flags = 0;
+               /*
+                * Calculate the space needed on the parent page.
+                *
+                * Space hack when inserting into BINTERNAL pages.  Only need to
+                * retain the number of bytes that will distinguish between the
+                * new entry and the LAST entry on the page to its left.  If
+                * the keys compare equal, only need to retain one byte as a
+                * placeholder.  Special cases are that the entire key must be
+                * retained for the next-to-leftmost key on the leftmost page
+                * of each level, or the search will fail, and can't mess with
+                * overflow keys.
+                */
+               switch (rchild->flags & P_TYPE) {
+               case P_BINTERNAL:
+                       bi = GETBINTERNAL(rchild, 0);
+                       nbytes = NBINTERNAL(bi->ksize);
+                       if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
+                           !(bi->flags & P_BIGKEY)) {
+                               BINTERNAL *tbi;
+                               tbi =
+                                   GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1);
+                               a.size = tbi->ksize;
+                               a.data = tbi->bytes;
+                               b.size = bi->ksize;
+                               b.data = bi->bytes;
+                               goto prefix;
                        }
                        }
-               } else {
-                       id = (IDATUM *) GETDATUM(right, 0);
-                       len = id->i_size;
-                       flags = id->i_flags;
-                       src = (char *) &(id->i_bytes[0]);
+                       break;
+               case P_BLEAF:
+                       bl = GETBLEAF(rchild, 0);
+                       nbytes = NBLEAF(bl);
+                       if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
+                           !(bl->flags & P_BIGKEY)) {
+                               BLEAF *tbl;
+                               size_t n;
+
+                               tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
+                               a.size = tbl->ksize;
+                               a.data = tbl->bytes;
+                               b.size = bl->ksize;
+                               b.data = bl->bytes;
+prefix:                                nksize = t->bt_pfx(&a, &b);
+                               n = NBINTERNAL(nksize);
+                               if (n < nbytes) {
+#ifdef STATISTICS
+                                       bt_pfxsaved += nbytes - n;
+#endif
+                                       nbytes = n;
+                               } else
+                                       nksize = 0;
+                       } else
+                               nksize = 0;
+                       break;
+               case P_RINTERNAL:
+               case P_RLEAF:
+                       nbytes = NRINTERNAL;
+                       break;
                }
                }
-               nbytes = len + (sizeof(IDATUM) - sizeof(char));
-               new = (IDATUM *) malloc((unsigned) nbytes);
-               if (new == (IDATUM *) NULL)
-                       return (RET_ERROR);
-               new->i_size = len;
-               new->i_pgno = right->h_pgno;
-               new->i_flags = flags;
-               if (len > 0)
-                       (void) bcopy(src, (char *) &(new->i_bytes[0]), len);
-
-               nbytes = LONGALIGN(nbytes) + sizeof(index_t);
-               if (_bt_getpage(t, parent) == RET_ERROR)
-                       return (RET_ERROR);
 
 
-               h = t->bt_curpage;
+               /* Split the parent page if necessary or shift the indices. */
+               if (h->upper - h->lower < nbytes + sizeof(index_t)) {
+                       h = h->pgno == P_ROOT ?
+                           bt_root(t, h, &l, &r, &skip) :
+                           bt_page(t, h, &l, &r, &skip);
+                       if (h == NULL)
+                               goto err1;
+               } else {
+                       if (skip < (nxtindex = NEXTINDEX(h)))
+                               bcopy(h->linp + skip, h->linp + skip + 1,
+                                   (nxtindex - skip) * sizeof(index_t));
+                       h->lower += sizeof(index_t);
+                       nosplit = 1;
+               }
 
 
-               /*
-                *  Split the parent if we need to, then reposition the
-                *  tree's current page pointer for the new datum.
-                */
-               if ((h->h_upper - h->h_lower) < nbytes) {
-                       if (_bt_split(t) == RET_ERROR)
-                               return (RET_ERROR);
-                       if (_bt_reposition(t, new, parent, right->h_prevpg)
-                             == RET_ERROR)
-                               return (RET_ERROR);
+               /* Insert the key into the parent page. */
+               switch(rchild->flags & P_TYPE) {
+               case P_BINTERNAL:
+                       h->linp[skip] = h->upper -= nbytes;
+                       dest = (char *)h + h->linp[skip];
+                       bcopy(bi, dest, nbytes);
+                       if (nksize)
+                               ((BINTERNAL *)dest)->ksize = nksize;
+                       ((BINTERNAL *)dest)->pgno = rchild->pgno;
+                       break;
+               case P_BLEAF:
+                       h->linp[skip] = h->upper -= nbytes;
+                       dest = (char *)h + h->linp[skip];
+                       WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
+                           rchild->pgno, rchild->flags & P_OVERFLOW);
+                       bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize);
+                       if (bl->flags & P_BIGKEY &&
+                           bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
+                               goto err1;
+                       break;
+               case P_RINTERNAL:
+                       /* Update both left and right page counts. */
+                       h->linp[skip] = h->upper -= nbytes;
+                       dest = (char *)h + h->linp[skip];
+                       ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
+                       ((RINTERNAL *)dest)->pgno = rchild->pgno;
+                       dest = (char *)h + h->linp[skip - 1];
+                       ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
+                       ((RINTERNAL *)dest)->pgno = lchild->pgno;
+                       break;
+               case P_RLEAF:
+                       /* Update both left and right page counts. */
+                       h->linp[skip] = h->upper -= nbytes;
+                       dest = (char *)h + h->linp[skip];
+                       ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
+                       ((RINTERNAL *)dest)->pgno = rchild->pgno;
+                       dest = (char *)h + h->linp[skip - 1];
+                       ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
+                       ((RINTERNAL *)dest)->pgno = lchild->pgno;
+                       break;
                }
 
                }
 
-               /* okay, now insert the new idatum */
-               if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
-                       return (RET_ERROR);
+               /* Unpin the held pages. */
+               if (nosplit) {
+                       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+                       break;
+               }
+               mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
+               mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
        }
 
        }
 
+       /* Unpin the held pages. */
+       mpool_put(t->bt_mp, l, MPOOL_DIRTY);
+       mpool_put(t->bt_mp, r, MPOOL_DIRTY);
+
        /*
        /*
-        *  Okay, split is done; don't need right page stapled down anymore.
-        *  The page we call 'left' above is the new version of the old
-        *  (split) page, so we can't release it.
+        * If it's a recno tree, increment the count on all remaining parent
+        * pages.  Otherwise, clear the stack.
         */
         */
+       if (ISSET(t, BTF_RECNO))
+               while  ((parent = BT_POP(t)) != NULL) {
+                       if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+                               return (RET_ERROR);
+                       ++GETRINTERNAL(h, parent->index)->nrecs;
+                       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+               }
+       else
+               BT_CLR(t);
+       return (RET_SUCCESS);
 
 
-       if (_bt_release(t, right) == RET_ERROR)
-               return (RET_ERROR);
-       if (ISDISK(t) && !ISCACHE(t))
-               (void) free((char *) right);
+       /*
+        * If something fails in the above loop we were already walking back
+        * up the tree and the tree is now inconsistent.  Nothing much we can
+        * do about it but release any memory we're holding.
+        */
+err1:  mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
+       mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
 
 
-       return (RET_SUCCESS);
+err2:  mpool_put(t->bt_mp, l, 0);
+       mpool_put(t->bt_mp, r, 0);
+       __dbpanic(t->bt_dbp);
+       return (RET_ERROR);
 }
 
 /*
 }
 
 /*
- *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
- *
- *     After splitting a node in the tree in order to make room for
- *     an insertion, we need to figure out which page after the split
- *     should get the item we want to insert.  This routine positions
- *     the tree's current page pointer appropriately.
+ * BT_PAGE -- Split a non-root page of a btree.
  *
  *
- *     Parameters:
- *             t -- tree to position
- *             new -- the item we want to insert
- *             parent -- parent of the node that we just split
- *             prev -- page number of item directly to the left of
- *                     new's position in the tree.
+ * Parameters:
+ *     t:      tree
+ *     h:      page to split
+ *     lp:     pointer to left page pointer
+ *     rp:     pointer to right page pointer
+ *     skip:   pointer to index to leave open
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             None.
+ * Returns:
+ *     Pointer to page in which to insert or NULL on error.
  */
  */
-
-int
-_bt_reposition(t, new, parent, prev)
-       BTREE_P t;
-       IDATUM *new;
-       pgno_t parent;
-       pgno_t prev;
+static PAGE *
+bt_page(t, h, lp, rp, skip)
+       BTREE *t;
+       PAGE *h, **lp, **rp;
+       int *skip;
 {
 {
-       index_t i, next;
-       IDATUM *idx;
-
-       if (parent == P_ROOT) {
-
-               /*
-                *  If we just split the root page, then there are guaranteed
-                *  to be exactly two IDATUMs on it.  Look at both of them
-                *  to see if they point to the page that we want.
-                */
+       PAGE *l, *r, *tp;
+       pgno_t npg;
+
+#ifdef STATISTICS
+       ++bt_split;
+#endif
+       /* Put the new right page for the split into place. */
+       if ((r = mpool_new(t->bt_mp, &npg)) == NULL)
+               return (NULL);
+       r->pgno = npg;
+       r->lower = BTDATAOFF;
+       r->upper = t->bt_psize;
+       r->nextpg = h->nextpg;
+       r->prevpg = h->pgno;
+       r->flags = h->flags & P_TYPE;
 
 
-               if (_bt_getpage(t, parent) == RET_ERROR)
-                       return (RET_ERROR);
+       /*
+        * If we're splitting the last page on a level because we're appending
+        * a key to it (skip is NEXTINDEX()), it's likely that the data is
+        * sorted.  Adding an empty page on the side of the level is less work
+        * and can push the fill factor much higher than normal.  If we're
+        * wrong it's no big deal, we'll just do the split the right way next
+        * time.  It may look like it's equally easy to do a similar hack for
+        * reverse sorted data, that is, split the tree left, but it's not.
+        * Don't even try.
+        */
+       if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
+#ifdef STATISTICS
+               ++bt_sortsplit;
+#endif
+               h->nextpg = r->pgno;
+               r->lower = BTDATAOFF + sizeof(index_t);
+               *skip = 0;
+               *lp = h;
+               *rp = r;
+               return (r);
+       }
 
 
-               next = NEXTINDEX(t->bt_curpage);
-               for (i = 0; i < next; i++) {
-                       idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
-                       if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
-                               return (RET_ERROR);
-                       if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
-                               return (RET_SUCCESS);
-                       if (_bt_getpage(t, parent) == RET_ERROR)
-                               return (RET_ERROR);
+       /* Put the new left page for the split into place. */
+       if ((l = malloc(t->bt_psize)) == NULL) {
+               mpool_put(t->bt_mp, r, 0);
+               return (NULL);
+       }
+       l->pgno = h->pgno;
+       l->nextpg = r->pgno;
+       l->prevpg = h->prevpg;
+       l->lower = BTDATAOFF;
+       l->upper = t->bt_psize;
+       l->flags = h->flags & P_TYPE;
+
+       /* Fix up the previous pointer of the page after the split page. */
+       if (h->nextpg != P_INVALID) {
+               if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
+                       free(l);
+                       /* XXX mpool_free(t->bt_mp, r->pgno); */
+                       return (NULL);
                }
                }
-       } else {
+               tp->prevpg = r->pgno;
+               mpool_put(t->bt_mp, tp, 0);
+       }
 
 
-               /*
-                *  Get the parent page -- which is where the new item would
-                *  have gone -- and figure out whether the new item now goes
-                *  on the parent, or the page immediately to the right of
-                *  the parent.
-                */
+       /*
+        * Split right.  The key/data pairs aren't sorted in the btree page so
+        * it's simpler to copy the data from the split page onto two new pages
+        * instead of copying half the data to the right page and compacting
+        * the left page in place.  Since the left page can't change, we have
+        * to swap the original and the allocated left page after the split.
+        */
+       tp = bt_psplit(t, h, l, r, skip);
 
 
-               if (_bt_getpage(t, parent) == RET_ERROR)
-                       return (RET_ERROR);
-               if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
-                       return (RET_SUCCESS);
-               if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
-                       return (RET_ERROR);
-               if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
-                       return (RET_SUCCESS);
-       }
-       return (RET_ERROR);
+       /* Move the new left page onto the old left page. */
+       bcopy(l, h, t->bt_psize);
+       if (tp == l)
+               tp = h;
+       free(l);
+
+       *lp = h;
+       *rp = r;
+       return (tp);
 }
 
 /*
 }
 
 /*
- *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
+ * BT_ROOT -- Split the root page of a btree.
  *
  *
- *     This routine is used by _bt_reposition to decide whether the current
- *     page is the correct one on which to insert a new item.
+ * Parameters:
+ *     t:      tree
+ *     h:      root page
+ *     lp:     pointer to left page pointer
+ *     rp:     pointer to right page pointer
+ *     skip:   pointer to index to leave open
  *
  *
- *     Parameters:
- *             t -- tree to check
- *             new -- the item that will be inserted (used for binary search)
- *             prev -- page number of page whose IDATUM is immediately to
- *                     the left of new's position in the tree.
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
+ * Returns:
+ *     Pointer to page in which to insert or NULL on error.
  */
  */
-
-int
-_bt_isonpage(t, new, prev)
-       BTREE_P t;
-       IDATUM *new;
-       pgno_t prev;
+static PAGE *
+bt_root(t, h, lp, rp, skip)
+       BTREE *t;
+       PAGE *h, **lp, **rp;
+       int *skip;
 {
 {
-       BTHEADER *h = (BTHEADER *) t->bt_curpage;
-       index_t i, next;
-       IDATUM *idx;
-
-       i = _bt_binsrch(t, &(new->i_bytes[0]));
-       while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
-               --i;
-       next = NEXTINDEX(h);
-       idx = (IDATUM *) GETDATUM(h, i);
-       while (i < next && idx->i_pgno != prev) {
-               i++;
-               idx = (IDATUM *) GETDATUM(h, i);
-       }
-       if (idx->i_pgno == prev)
-               return (RET_SUCCESS);
-       else
-               return (RET_ERROR);
+       PAGE *l, *r, *tp;
+       pgno_t lnpg, rnpg;
+
+#ifdef STATISTICS
+       ++bt_split;
+       ++bt_rootsplit;
+#endif
+       /* Put the new left and right pages for the split into place. */
+       if ((l = mpool_new(t->bt_mp, &lnpg)) == NULL ||
+           (r = mpool_new(t->bt_mp, &rnpg)) == NULL)
+               return (NULL);
+       l->pgno = lnpg;
+       r->pgno = rnpg;
+       l->nextpg = r->pgno;
+       r->prevpg = l->pgno;
+       l->prevpg = r->nextpg = P_INVALID;
+       l->lower = r->lower = BTDATAOFF;
+       l->upper = r->upper = t->bt_psize;
+       l->flags = r->flags = h->flags & P_TYPE;
+
+       /* Split the root page. */
+       tp = bt_psplit(t, h, l, r, skip);
+
+       /* Make the root page look right. */
+       if ((ISSET(t, BTF_RECNO) ?
+           bt_rroot(t, h, l, r) : bt_broot(t, h, l, r)) == RET_ERROR)
+               return (NULL);
+
+       *lp = l;
+       *rp = r;
+       return (tp);
 }
 
 /*
 }
 
 /*
- *  _BT_SPLITROOT -- Split the root of a btree.
- *
- *     The root page for a btree is always page one.  This means that in
- *     order to split the root, we need to do extra work.
+ * BT_RROOT -- Fix up the recno root page after the split.
  *
  *
- *     Parameters:
- *             t -- tree to split
+ * Parameters:
+ *     t:      tree
+ *     h:      root page
+ *     l:      left page produced by the root split
+ *     r:      right page produced by the root split
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             Splits root upward in the usual way, adding two new pages
- *             to the tree (rather than just one, as in usual splits).
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
  */
  */
-
-int
-_bt_splitroot(t)
-       BTREE_P t;
+static int
+bt_rroot(t, h, l, r)
+       BTREE *t;
+       PAGE *h, *l, *r;
 {
 {
-       BTHEADER *h = t->bt_curpage;
-       BTHEADER *left, *right;
-       IDATUM *id;
-       BTHEADER *where_h;
-       char *src, *dest;
-       int len, nbytes;
-       u_long was_leaf;
-       pgno_t oldchain;
-       u_char flags;
-
-       /* get two new pages for the split */
-       if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
-               return (RET_ERROR);
-       left->h_pgno = ++(t->bt_npages);
-       if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
-               return (RET_ERROR);
-       right->h_pgno = ++(t->bt_npages);
+       char *dest;
 
 
-       /* do the split */
-       if (_bt_dopsplit(t, left, right) == RET_ERROR)
-               return (RET_ERROR);
+       /* Insert the left and right keys, set the header information. */
+       h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
+       dest = (char *)h + h->upper;
+       WR_RINTERNAL(dest,
+           l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
 
 
-       /* connect the new pages correctly */
-       right->h_prevpg = left->h_pgno;
-       left->h_nextpg = right->h_pgno;
+       h->linp[1] = h->upper -= NRINTERNAL;
+       dest = (char *)h + h->upper;
+       WR_RINTERNAL(dest,
+           r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
 
 
-       /*
-        *  Write the child pages out now.  We need them to remain
-        *  where they are until we finish updating parent pages,
-        *  however.
-        */
+       h->lower = BTDATAOFF + 2 * sizeof(index_t);
 
 
-       if (_bt_write(t, left, NORELEASE) == RET_ERROR)
-               return (RET_ERROR);
-       if (_bt_write(t, right, NORELEASE) == RET_ERROR)
-               return (RET_ERROR);
+       /* Unpin the root page, set to recno internal page. */
+       h->flags &= ~P_TYPE;
+       h->flags |= P_RINTERNAL;
+       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
 
 
-       /* now change the root page into an internal page */
-       was_leaf = (h->h_flags & F_LEAF);
-       h->h_flags &= ~F_LEAF;
-       h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
-       h->h_upper = (index_t) t->bt_psize;
-       (void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
-
-       /* put two new keys on root page */
-       where_h = left;
-       while (where_h) {
-               DATUM *data;
-               IDATUM *idata;
-
-               if (was_leaf) {
-                       data = (DATUM *) GETDATUM(where_h, 0);
-
-                       if (where_h == left) {
-                               len = 0;        /* first key in tree is null */
-                       } else {
-                               if (data->d_flags & D_BIGKEY) {
-                                       bcopy(&(data->d_bytes[0]),
-                                             (char *) &oldchain,
-                                             sizeof(oldchain));
-                                       if (_bt_markchain(t, oldchain) == RET_ERROR)
-                                               return (RET_ERROR);
-                                       src = (char *) &oldchain;
-                                       flags = D_BIGKEY;
-                               } else {
-                                       src = (char *) &(data->d_bytes[0]);
-                                       flags = 0;
-                               }
-                               len = data->d_ksize;
-                       }
-               } else {
-                       idata = (IDATUM *) GETDATUM(where_h, 0);
-                       len = idata->i_size;
-                       flags = idata->i_flags;
-                       src = &(idata->i_bytes[0]);
-               }
-               dest = ((char *) h) + h->h_upper;
-               nbytes = len + (sizeof (IDATUM) - sizeof(char));
-               dest -= LONGALIGN(nbytes);
-               id = (IDATUM *) dest;
-               id->i_size = len;
-               id->i_pgno = where_h->h_pgno;
-               id->i_flags = flags;
-               if (len > 0)
-                       (void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
-               dest -= ((int) h);
-               h->h_linp[NEXTINDEX(h)] = (index_t) dest;
-               h->h_upper = (index_t) dest;
-               h->h_lower += sizeof(index_t);
-
-               /* next page */
-               if (where_h == left)
-                       where_h = right;
-               else
-                       where_h = (BTHEADER *) NULL;
-       }
+       return (RET_SUCCESS);
+}
 
 
-       if (_bt_release(t, left) == RET_ERROR)
-               return (RET_ERROR);
-       if (_bt_release(t, right) == RET_ERROR)
-               return (RET_ERROR);
+/*
+ * BT_BROOT -- Fix up the btree root page after the split.
+ *
+ * Parameters:
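+ *     t:      tree
+ *     h:      root page
+ *     l:      left page of the split (referenced by the root's first entry)
+ *     r:      right page of the split (referenced by the root's second entry)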
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS
+ */
+static int
+bt_broot(t, h, l, r)
+       BTREE *t;
+       PAGE *h, *l, *r;
+{
+       BINTERNAL *bi;
+       BLEAF *bl;
+       size_t nbytes;
+       char *dest;
 
        /*
 
        /*
-        *  That's it, split is done.  If we're doing a non-cached disk
-        *  btree, we can free up the pages we allocated, as they're on
-        *  disk, now.
+        * If the root page was a leaf page, change it into an internal page.
+        * We copy the key we split on (but not the key's data, in the case of
+        * a leaf page) to the new root page.  If the key is on an overflow
+        * page, mark the overflow chain so it isn't deleted when the leaf copy
+        * of the key is deleted.
+        *
+        * The btree comparison code guarantees that the left-most key on any
+        * level of the tree is never used, so it doesn't need to be filled
+        * in.  (This is not just convenience -- if the insert index is 0, we
+        * don't *have* a key to fill in.)  The right key is available because
+        * the split code guarantees not to split on the skipped index.
         */
         */
-
-       if (ISDISK(t) && !ISCACHE(t)) {
-               (void) free ((char *) left);
-               (void) free ((char *) right);
+       nbytes = LALIGN(sizeof(size_t) + sizeof(pgno_t) + sizeof(u_char));
+       h->linp[0] = h->upper = t->bt_psize - nbytes;
+       dest = (char *)h + h->upper;
+       WR_BINTERNAL(dest, 0, l->pgno, 0);
+
+       switch(h->flags & P_TYPE) {
+       case P_BLEAF:
+               bl = GETBLEAF(r, 0);
+               nbytes = NBINTERNAL(bl->ksize);
+               h->linp[1] = h->upper -= nbytes;
+               dest = (char *)h + h->upper;
+               WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
+               bcopy(bl->bytes, dest, bl->ksize);
+
+               if (bl->flags & P_BIGKEY &&
+                   bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
+                       return (RET_ERROR);
+               break;
+       case P_BINTERNAL:
+               bi = GETBINTERNAL(r, 0);
+               nbytes = NBINTERNAL(bi->ksize);
+               h->linp[1] = h->upper -= nbytes;
+               dest = (char *)h + h->upper;
+               bcopy(bi, dest, nbytes);
+               ((BINTERNAL *)dest)->pgno = r->pgno;
+               break;
        }
        }
+       h->lower = BTDATAOFF + 2 * sizeof(index_t);
 
 
-       h->h_flags |= F_DIRTY;
+       /* Unpin the root page, set to btree internal page. */
+       h->flags &= ~P_TYPE;
+       h->flags |= P_BINTERNAL;
+       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
 
        return (RET_SUCCESS);
 }
 
 /*
 
        return (RET_SUCCESS);
 }
 
 /*
- *  _BT_DOPSPLIT -- Do the work of splitting a page
- *
- *     This routine takes two page pointers and splits the data on the
- *     current page of the btree between them.
- *
- *     We do a lot of work here to handle duplicate keys on a page; we
- *     have to place these keys carefully to guarantee that later searches
- *     will find them correctly.  See comments in the code below for details.
+ * BT_PSPLIT -- Do the real work of splitting the page.
  *
  *
- *     Parameters:
- *             t -- tree to split
- *             left -- pointer to page to get left half of the data
- *             right -- pointer to page to get right half of the data
+ * Parameters:
+ *     t:      tree
+ *     h:      page to be split
+ *     l:      page to put lower half of data
+ *     r:      page to put upper half of data
+ *     skip:   pointer to index to leave open
  *
  *
- *     Returns:
- *             None.
+ * Returns:
+ *     Pointer to page in which to insert.
  */
  */
-
-int
-_bt_dopsplit(t, left, right)
-       BTREE_P t;
-       BTHEADER *left;
-       BTHEADER *right;
+static PAGE *
+bt_psplit(t, h, l, r, skip)
+       BTREE *t;
+       PAGE *h, *l, *r;
+       int *skip;
 {
 {
-       BTHEADER *h = t->bt_curpage;
-       size_t psize;
-       char *where;
-       BTHEADER *where_h;
-       index_t where_i;
-       int nbytes, dsize, fixedsize, freespc;
-       index_t i;
-       index_t save_lower, save_upper, save_i;
-       index_t switch_i;
-       char *save_key;
-       DATUM *d;
-       CURSOR *c;
-       index_t top;
-       int free_save;
-       pgno_t chain;
-       int ignore;
+       BINTERNAL *bi;
+       BLEAF *bl;
+       RLEAF *rl;
+       EPGNO *c;
+       PAGE *rval;
+       index_t half, sval;
+       size_t nbytes;
+       void *src;
+       int bigkeycnt, isbigkey, nxt, off, top;
 
        /*
 
        /*
-        *  Our strategy is to put half the bytes on each page.  We figure
-        *  out how many bytes we have total, and then move items until
-        *  the last item moved put at least 50% of the data on the left
-        *  page.
+        * Split the data to the left and right pages. Leave the skip index
+        * open and guarantee that the split doesn't happen on that index (the
+        * right key must be available for the parent page).  Additionally,
+        * make some effort not to split on an overflow key.  This makes it
+        * faster to process internal pages and can save space since overflow
+        * keys used by internal pages are never deleted.
         */
         */
-       save_key = (char *) NULL;
-       psize = (int) t->bt_psize;
-       where = ((char *) left) + psize;
-       where_h = left;
-       where_i = 0;
-       nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
-       freespc = nbytes;
-
-       top = NEXTINDEX(h);
-       if (h->h_flags & F_LEAF)
-               fixedsize = (sizeof(DATUM) - sizeof(char));
-       else
-               fixedsize = (sizeof(IDATUM) - sizeof(char));
-
-       save_key = (char *) NULL;
-
-       /* move data */
-       for (i = 0; i < top; i++) {
-
-               /*
-                *  Internal and leaf pages have different layouts for
-                *  data items, but in both cases the first entry in the
-                *  data item is a size_t.
-                */
-               d = (DATUM *) GETDATUM(h,i);
-               if (h->h_flags & F_LEAF) {
-                       dsize = d->d_ksize + d->d_dsize + fixedsize;
-               } else {
-                       dsize = d->d_ksize + fixedsize;
+       bigkeycnt = 0;
+       sval = *skip;
+       half = (t->bt_psize - BTDATAOFF) / 2;
+       for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
+               if (sval == off)
+                       continue;
+               switch (h->flags & P_TYPE) {
+               case P_BINTERNAL:
+                       src = bi = GETBINTERNAL(h, nxt);
+                       nbytes = NBINTERNAL(bi->ksize);
+                       isbigkey = bi->flags & P_BIGKEY;
+                       break;
+               case P_BLEAF:
+                       src = bl = GETBLEAF(h, nxt);
+                       nbytes = NBLEAF(bl);
+                       isbigkey = bl->flags & P_BIGKEY;
+                       break;
+               case P_RINTERNAL:
+                       src = GETRINTERNAL(h, nxt);
+                       nbytes = NRINTERNAL;
+                       isbigkey = 0;
+                       break;
+               case P_RLEAF:
+                       src = rl = GETRLEAF(h, nxt);
+                       nbytes = NRLEAF(rl);
+                       isbigkey = 0;
+                       break;
                }
                }
+               ++nxt;
+               l->linp[off] = l->upper -= nbytes;
+               bcopy(src, (char *)l + l->upper, nbytes);
+
+               /* There's no empirical justification for the '3'. */
+               if (half < nbytes)
+                       if (!isbigkey || bigkeycnt == 3)
+                               break;
+                       else
+                               ++bigkeycnt;
+               else
+                       half -= nbytes;
+       }
+       l->lower += (off + 1) * sizeof(index_t);
 
 
-               /*
-                *  If a page contains duplicate keys, we have to be
-                *  careful about splits.  A sequence of duplicate keys
-                *  may not begin in the middle of one page and end in
-                *  the middle of another; it must begin on a page boundary,
-                *  in order for searches on the internal nodes to work
-                *  correctly.
-                */
-               if (where_h == left) {
-                       if (save_key == (char *) NULL) {
-                               if (h->h_flags & F_LEAF) {
-                                       if (d->d_flags & D_BIGKEY) {
-                                               free_save = TRUE;
-                                               bcopy(&(d->d_bytes[0]),
-                                                    (char *) &chain,
-                                                    sizeof(chain));
-                                               if (_bt_getbig(t, chain,
-                                                              &save_key,
-                                                              &ignore)
-                                                   == RET_ERROR)
-                                                       return (RET_ERROR);
-                                       } else {
-                                               free_save = FALSE;
-                                               save_key = (char *) &(d->d_bytes[0]);
-                                       }
-                               } else {
-                                       IDATUM *id = (IDATUM *) d;
-
-                                       if (id->i_flags & D_BIGKEY) {
-                                               free_save = TRUE;
-                                               bcopy(&(id->i_bytes[0]),
-                                                    (char *) &chain,
-                                                    sizeof(chain));
-                                               if (_bt_getbig(t, chain,
-                                                              &save_key,
-                                                              &ignore)
-                                                   == RET_ERROR)
-                                                       return (RET_ERROR);
-                                       } else {
-                                               free_save = FALSE;
-                                               save_key = (char *)
-                                                       &(id->i_bytes[0]);
-                                       }
-                               }
-                               save_i = 0;
-                               save_lower = where_h->h_lower;
-                               save_upper = where_h->h_upper;
-                       } else {
-                               if (_bt_cmp(t, save_key, i) != 0) {
-                                       save_lower = where_h->h_lower;
-                                       save_upper = where_h->h_upper;
-                                       save_i = i;
-                               }
-                               if (h->h_flags & F_LEAF) {
-                                       if (free_save)
-                                               (void) free(save_key);
-                                       if (d->d_flags & D_BIGKEY) {
-                                               free_save = TRUE;
-                                               bcopy(&(d->d_bytes[0]),
-                                                    (char *) &chain,
-                                                    sizeof(chain));
-                                               if (_bt_getbig(t, chain,
-                                                              &save_key,
-                                                              &ignore)
-                                                   == RET_ERROR)
-                                                       return (RET_ERROR);
-                                       } else {
-                                               free_save = FALSE;
-                                               save_key = (char *) &(d->d_bytes[0]);
-                                       }
-                               } else {
-                                       IDATUM *id = (IDATUM *) d;
-
-                                       if (id->i_flags & D_BIGKEY) {
-                                               free_save = TRUE;
-                                               bcopy(&(id->i_bytes[0]),
-                                                    (char *) &chain,
-                                                    sizeof(chain));
-                                               if (_bt_getbig(t, chain,
-                                                              &save_key,
-                                                              &ignore)
-                                                   == RET_ERROR)
-                                                       return (RET_ERROR);
-                                       } else {
-                                               free_save = FALSE;
-                                               save_key = (char *)
-                                                       &(id->i_bytes[0]);
-                                       }
-                               }
-                       }
+       /*
+        * If we're splitting the page that the cursor was on, have to adjust
+        * the cursor to point to the same record as before the split.
+        */
+       c = &t->bt_bcursor;
+       if (c->pgno == h->pgno)
+               if (c->index < off)
+                       c->pgno = l->pgno;
+               else {
+                       c->pgno = r->pgno;
+                       c->index -= off;
                }
 
                }
 
-               /* copy data and update page state */
-               where -= LONGALIGN(dsize);
-               (void) bcopy((char *) d, (char *) where, dsize);
-               where_h->h_upper = where_h->h_linp[where_i] =
-                       (index_t) (where - (int) where_h);
-               where_h->h_lower += sizeof(index_t);
-               where_i++;
-
-               /* if we've moved half, switch to the right-hand page */
-               nbytes -= LONGALIGN(dsize) + sizeof(index_t);
-               if ((freespc - nbytes) > nbytes) {
-                       nbytes = 2 * freespc;
-
-                       /* identical keys go on the same page */
-                       if (save_i > 0) {
-                               /* i gets incremented at loop bottom... */
-                               i = save_i - 1;
-                               where_h->h_lower = save_lower;
-                               where_h->h_upper = save_upper;
-                       }
-                       where = ((char *) right) + psize;
-                       where_h = right;
-                       switch_i = where_i;
-                       where_i = 0;
+       /*
+        * Decide which page to return, and adjust the skip index if the
+        * to-be-inserted-upon page has changed.
+        */
+       if (sval > off) {
+               rval = r;
+               *skip -= off + 1;
+       } else
+               rval = l;
+
+       for (off = 0; nxt < top; ++off) {
+               if (sval == nxt) {
+                       sval = 0;
+                       continue;
+               }
+               switch (h->flags & P_TYPE) {
+               case P_BINTERNAL:
+                       src = bi = GETBINTERNAL(h, nxt);
+                       nbytes = NBINTERNAL(bi->ksize);
+                       break;
+               case P_BLEAF:
+                       src = bl = GETBLEAF(h, nxt);
+                       nbytes = NBLEAF(bl);
+                       break;
+               case P_RINTERNAL:
+                       src = GETRINTERNAL(h, nxt);
+                       nbytes = NRINTERNAL;
+                       break;
+               case P_RLEAF:
+                       src = rl = GETRLEAF(h, nxt);
+                       nbytes = NRLEAF(rl);
+                       break;
                }
                }
+               ++nxt;
+               r->linp[off] = r->upper -= nbytes;
+               bcopy(src, (char *)r + r->upper, nbytes);
        }
        }
+       r->lower += off * sizeof(index_t);
 
 
-       /*
-        *  If there was an active scan on the database, and we just
-        *  split the page that the cursor was on, we may need to
-        *  adjust the cursor to point to the same entry as before the
-        *  split.
-        */
+       /* If the key is being appended to the page, adjust the index. */
+       if (sval == top)
+               r->lower += sizeof(index_t);
 
 
-       c = &(t->bt_cursor);
-       if ((t->bt_flags & BTF_SEQINIT)
-           && (c->c_pgno == h->h_pgno)
-           && (c->c_index >= switch_i)) {
-               c->c_pgno = where_h->h_pgno;
-               c->c_index -= where_i;
-       }
+       return (rval);
+}
+
+/*
+ * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
+ *
+ * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
+ * record that references them gets deleted.  Chains pointed to by internal
+ * pages never get deleted.  This routine marks a chain as pointed to by an
+ * internal page.
+ *
+ * Only the page numbered pg (the first page of the chain) is fetched and
+ * flagged here; the rest of the chain is not touched by this routine.
+ *
+ * Parameters:
+ *     t:      tree
+ *     pg:     page number of first page in the chain.
+ *
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.
+ */
+static int
+bt_preserve(t, pg)
+       BTREE *t;
+       pgno_t pg;
+{
+       PAGE *h;
+
+       /* Fetch the first page of the chain from the tree's page pool. */
+       if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+               return (RET_ERROR);
+       /* Flag it as preserved and hand it back marked dirty. */
+       h->flags |= P_PRESERVE;
+       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
         return (RET_SUCCESS);
 }
         return (RET_SUCCESS);
 }
+
+/*
+ * REC_TOTAL -- Sum the record counts of the entries on a recno internal page.
+ *
+ * Parameters:
+ *     h:      page whose entries are read as RINTERNAL items
+ *
+ * Returns:
+ *     The sum of the nrecs fields of every entry on the page, i.e. the
+ *     number of recno entries below the page.
+ *
+ * XXX
+ * These totals could instead be computed by the bt_psplit routine.  The
+ * problem is that the entry would have to be popped off of the stack etc., or
+ * the values passed all the way back to bt_split/bt_rroot, and neither is very
+ * clean.
+ */
+static recno_t
+rec_total(h)
+       PAGE *h;
+{
+       recno_t total;
+       index_t i, cnt;
+
+       total = 0;
+       cnt = NEXTINDEX(h);
+       i = 0;
+       while (i < cnt) {
+               total += GETRINTERNAL(h, i)->nrecs;
+               ++i;
+       }
+       return (total);
+}
diff --git a/usr/src/lib/libc/db/btree/bt_stack.c b/usr/src/lib/libc/db/btree/bt_stack.c
new file mode 100644 (file)
index 0000000..c8fa9c9
--- /dev/null
@@ -0,0 +1,64 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)bt_stack.c 5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <errno.h>
+#include <db.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "btree.h"
+
+/*
+ * When a page splits, a new record has to be inserted into its parent page.
+ * This page may have to split as well, all the way up to the root.  Since
+ * parent pointers in each page would be expensive, we maintain a stack of
+ * parent pages as we descend the tree.
+ *
+ * XXX
+ * This is a problem for multiple users -- if user a creates a stack, then user
+ * b splits the tree, then user a tries to split the tree, there's a new level
+ * in the tree that user a doesn't know about.
+ */
+
+/*
+ * BT_PUSH -- Push parent page info onto the stack (LIFO).
+ *
+ * The stack is an array of EPGNO entries; when it is full it is grown by
+ * 50 entries before the new entry is stored.
+ *
+ * Parameters:
+ *     t:      tree
+ *     pgno:   page
+ *     index:  page index
+ *
+ * Returns:
+ *     RET_ERROR (the stack could not be grown; the existing stack and its
+ *     recorded size are left untouched), RET_SUCCESS
+ */
+int
+bt_push(t, pgno, index)
+       BTREE *t;
+       pgno_t pgno;
+       int index;
+{
+       void *p;
+
+       if (t->bt_sp == t->bt_maxstack) {
+               /*
+                * Don't assign realloc's result straight to bt_stack: on
+                * failure that would replace the only pointer to the still
+                * allocated stack with NULL, leaking it and leaving later
+                * pushes to write through a NULL pointer.
+                */
+               if ((p = realloc(t->bt_stack,
+                   (t->bt_maxstack + 50) * sizeof(EPGNO))) == NULL)
+                       return (RET_ERROR);
+               t->bt_stack = p;
+               t->bt_maxstack += 50;
+       }
+
+       t->bt_stack[t->bt_sp].pgno = pgno;
+       t->bt_stack[t->bt_sp].index = index;
+       ++t->bt_sp;
+       return (RET_SUCCESS);
+}
index 55d176e..0c7cc50 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_utils.c 5.3 (Berkeley) %G%";
+static char sccsid[] = "@(#)bt_utils.c 5.4 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #endif /* LIBC_SCCS and not lint */
 
-#include <sys/types.h>
+#include <sys/param.h>
 #include <db.h>
 #include <db.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
 /*
 #include <stdlib.h>
 #include <string.h>
 #include "btree.h"
 
 /*
- *  _BT_BUILDRET -- Build return key/data pair as a result of search or scan.
+ * __BT_RET -- Build return key/data pair as a result of search or scan.
  *
  *
- *     This routine manages the statically allocated buffers in which we
- *     return data to the user.
+ * Parameters:
+ *     t:      tree
+ *     e:      EPG (page and index) of the leaf to be returned to the user.
+ *     key:    user's key structure
+ *     data:   user's data structure
  *
  *
- *     Parameters:
- *             t -- btree from which to return datum
- *             d -- DATUM to be returned to the user.
- *             data -- data argument supplied by user for return
- *             key -- key argument supplied by user for return
- *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
- *
- *     Side Effects:
- *             May free and reallocate static buffers, if the data
- *             we want to return is bigger than the space we have to
- *             do so.
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.
  */
  */
-
 int
 int
-_bt_buildret(t, d, data, key)
-       BTREE_P t;
-       DATUM *d;
-       DBT *data;
-       DBT *key;
+__bt_ret(t, e, key, data)
+       BTREE *t;
+       EPG *e;
+       DBT *key, *data;
 {
 {
-       static int _data_s = 0;
-       static int _key_s = 0;
-       static char *_data = (char *) NULL;
-       static char *_key = (char *) NULL;
-       pgno_t chain;
+       register BLEAF *bl;
 
 
-       if (d->d_flags & D_BIGKEY) {
-               if (_key != (char *) NULL)
-                       (void) free(_key);
-               (void) bcopy((char *) &(d->d_bytes[0]),
-                            (char *) &chain,
-                            sizeof(chain));
-               if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR)
+       bl = GETBLEAF(e->page, e->index);
+
+       if (bl->flags & P_BIGKEY) {
+               if (__ovfl_get(t, bl->bytes,
+                   &key->size, &t->bt_kbuf, &t->bt_kbufsz))
                        return (RET_ERROR);
                        return (RET_ERROR);
-               key->data = (u_char *)_key;
-               key->size = _key_s;
        } else {
        } else {
-               /* need more space for key? */
-               if (d->d_ksize > _key_s) {
-                       if (_key != (char *) NULL)
-                               (void) free (_key);
-                       if ((_key = (char *) malloc((unsigned) d->d_ksize))
-                           == (char *) NULL)
+               if (bl->ksize > t->bt_kbufsz) {
+                       if ((t->bt_kbuf =
+                           realloc(t->bt_kbuf, bl->ksize)) == NULL)
                                return (RET_ERROR);
                                return (RET_ERROR);
-                       _key_s = d->d_ksize;
+                       t->bt_kbufsz = bl->ksize;
                }
                }
-
-               key->data = (u_char *)_key;
-               if ((key->size = d->d_ksize) > 0)
-                       (void) bcopy(&(d->d_bytes[0]),
-                                    _key,
-                                    (int) d->d_ksize);
+               bcopy(bl->bytes, t->bt_kbuf, t->bt_kbufsz);
+               key->size = bl->ksize;
        }
        }
+       key->data = t->bt_kbuf;
 
 
-       if (d->d_flags & D_BIGDATA) {
-               if (_data != (char *) NULL)
-                       (void) free(_data);
-               (void) bcopy(&(d->d_bytes[d->d_ksize]),
-                            (char *) &chain,
-                            sizeof(chain));
-               if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR)
+       if (bl->flags & P_BIGDATA) {
+               if (__ovfl_get(t, bl->bytes + bl->ksize,
+                   &data->size, &t->bt_dbuf, &t->bt_dbufsz))
                        return (RET_ERROR);
                        return (RET_ERROR);
-               data->data = (u_char *)_data;
-               data->size = _data_s;
        } else {
        } else {
-               /* need more space for data? */
-               if (d->d_dsize > _data_s) {
-                       if (_data != (char *) NULL)
-                               (void) free (_data);
-                       if ((_data = (char *) malloc((unsigned) d->d_dsize))
-                           == (char *) NULL)
+               if (bl->dsize > t->bt_dbufsz) {
+                       if ((t->bt_dbuf =
+                           realloc(t->bt_dbuf, bl->dsize)) == NULL)
                                return (RET_ERROR);
                                return (RET_ERROR);
-                       _data_s = d->d_dsize;
+                       t->bt_dbufsz = bl->dsize;
                }
                }
-
-               data->data = (u_char *)_data;
-
-               if ((data->size = d->d_dsize) > 0)
-                       (void) bcopy(&(d->d_bytes[d->d_ksize]),
-                                     _data,
-                                     (size_t) (d->d_dsize));
+               bcopy(bl->bytes + bl->ksize, t->bt_dbuf, t->bt_dbufsz);
+               data->size = bl->dsize;
        }
        }
+       data->data = t->bt_dbuf;
 
        return (RET_SUCCESS);
 }
 
 /*
 
        return (RET_SUCCESS);
 }
 
 /*
- *  _BT_CMP -- Compare a key to a given item on the current page.
- *
- *     This routine is a wrapper for the user's comparison function.
+ * __BT_CMP -- Compare a key to a given record.
  *
  *
- *     Parameters:
- *             t -- tree in which to do comparison
- *             p -- pointer to one argument for the comparison function
- *             n -- index of item to supply second arg to comparison function
+ * Parameters:
+ *     t:      tree
+ *     k1:     DBT pointer of first arg to comparison
+ *     e:      pointer to EPG for comparison
  *
  *
- *     Returns:
- *             < 0 if p is < item at n
- *             = 0 if p is = item at n
- *             > 0 if p is > item at n
+ * Returns:
+ *     < 0 if k1 is < record
+ *     = 0 if k1 is = record
+ *     > 0 if k1 is > record
  */
  */
-
 int
 int
-_bt_cmp(t, p, n)
-       BTREE_P t;
-       char *p;
-       index_t n;
+__bt_cmp(t, k1, e)
+       BTREE *t;
+       const DBT *k1;
+       EPG *e;
 {
 {
-       BTHEADER *h;
-       IDATUM *id;
-       DATUM *d;
-       char *arg;
-       int ignore;
-       int free_arg;
-       pgno_t chain;
-       int r;
-
-       h = t->bt_curpage;
+       BINTERNAL *bi;
+       BLEAF *bl;
+       DBT k2;
+       PAGE *h;
+       void *bigkey;
 
        /*
 
        /*
-        *  The left-most key at any level of the tree on internal pages
-        *  is guaranteed (artificially, by the code here) to be less than
-        *  any user key.  This saves us from having to update the leftmost
-        *  key when the user inserts a new key in the tree smaller than
-        *  anything we've seen yet.
+        * The left-most key on internal pages, at any level of the tree, is
+        * guaranteed by the following code to be less than any user key.
+        * This saves us from having to update the leftmost key on an internal
+        * page when the user inserts a new key in the tree smaller than
+        * anything we've yet seen.
         */
         */
-
-       if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0)
+       h = e->page;
+       if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & P_BLEAF))
                return (1);
 
                return (1);
 
-       if (h->h_flags & F_LEAF) {
-               d = (DATUM *) GETDATUM(h,n);
-               if (d->d_flags & D_BIGKEY) {
-                       free_arg = TRUE;
-                       bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain));
-                       if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
-                               return (RET_ERROR);
-               } else {
-                       free_arg = FALSE;
-                       arg = &(d->d_bytes[0]);
+       bigkey = NULL;
+       if (h->flags & P_BLEAF) {
+               bl = GETBLEAF(h, e->index);
+               if (bl->flags & P_BIGKEY)
+                       bigkey = bl->bytes;
+               else {
+                       k2.data = bl->bytes;
+                       k2.size = bl->ksize;
                }
        } else {
                }
        } else {
-               id = (IDATUM *) GETDATUM(h,n);
-               if (id->i_flags & D_BIGKEY) {
-                       free_arg = TRUE;
-                       bcopy(&(id->i_bytes[0]),
-                             (char *) &chain,
-                             sizeof(chain));
-                       if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
-                               return (RET_ERROR);
-               } else {
-                       free_arg = FALSE;
-                       arg = &(id->i_bytes[0]);
+               bi = GETBINTERNAL(h, e->index);
+               if (bi->flags & P_BIGKEY)
+                       bigkey = bi->bytes;
+               else {
+                       k2.data = bi->bytes;
+                       k2.size = bi->ksize;
                }
        }
                }
        }
-       r = (*(t->bt_compare))(p, arg);
 
 
-       if (free_arg)
-               (void) free(arg);
-
-       return (r);
+       if (bigkey) {
+               if (__ovfl_get(t, bigkey,
+                   &k2.size, &t->bt_dbuf, &t->bt_dbufsz))
+                       return (RET_ERROR);
+               k2.data = t->bt_dbuf;
+       }
+       return((*t->bt_cmp)(k1, &k2));
 }
 
 /*
 }
 
 /*
- *  _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack.
- *
- *     When we descend the tree, we keep track of parent pages in order
- *     to handle splits on insertions.
+ * __BT_DEFCMP -- Default comparison routine.
  *
  *
- *     Parameters:
- *             t -- tree for which to push parent
- *             pgno -- page number to push (_bt_push only)
+ * Parameters:
+ *     a:      DBT #1
+ *     b:      DBT #2
  *
  *
- *     Returns:
- *             RET_SUCCESS, RET_ERROR.
+ * Returns:
+ *     < 0 if a is < b
+ *     = 0 if a is = b
+ *     > 0 if a is > b
  */
  */
-
 int
 int
-_bt_push(t, pgno)
-       BTREE_P t;
-       pgno_t pgno;
-{
-       BTSTACK *new;
-
-       if ((new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK)))
-           ==  (BTSTACK *) NULL)
-               return (RET_ERROR);
-       new->bts_pgno = pgno;
-       new->bts_next = t->bt_stack;
-       t->bt_stack = new;
-
-       return (RET_SUCCESS);
-}
-
-pgno_t
-_bt_pop(t)
-       BTREE_P t;
-{
-       BTSTACK *s;
-       pgno_t p = P_NONE;
-
-       if ((s = t->bt_stack) != (BTSTACK *) NULL) {
-               p = s->bts_pgno;
-               t->bt_stack = s->bts_next;
-               (void) free ((char *) s);
-       }
-       return (p);
-}
-
-#ifdef DEBUG
-void
-_btdump(tree)
-       BTREE tree;
+__bt_defcmp(a, b)
+       const DBT *a, *b;
 {
 {
-       BTREE_P t = (BTREE_P) tree;
-       DATUM *d;
-       IDATUM *id;
-       BTHEADER *h;
-       pgno_t npages;
-       pgno_t i;
-       index_t cur, top;
-
-       npages = t->bt_npages;
-       (void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx",
-               t->bt_fname, t->bt_s.bt_d.d_fd,
-               t->bt_psize, t->bt_curpage);
-       (void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare);
-       if (t->bt_flags & BTF_SEQINIT)
-               (void) printf("BTF_SEQINIT");
-       (void) printf(")\n");
-
-       for (i = P_ROOT; i <= npages; i++) {
-               if (_bt_getpage(t, i) == RET_ERROR)
-                       _punt();
-               h = t->bt_curpage;
-               top = NEXTINDEX(h);
-               (void) printf("    page %d:\n", i);
-               (void) printf("\tpgno %d prev %d next %d\n",
-                       h->h_pgno, h->h_prevpg, h->h_nextpg);
-               (void) printf("\tlower %d upper %d nextind %d flags (",
-                       h->h_lower, h->h_upper, top);
-               if (h->h_flags & F_LEAF)
-                       (void) printf("F_LEAF");
-               else
-                       (void) printf("<internal>");
-               if (h->h_flags & F_DIRTY)
-                       (void) printf("|F_DIRTY");
-               if (h->h_flags & F_PRESERVE)
-                       (void) printf("|F_PRESERVE");
-               if (h->h_flags & F_CONT) {
-                       (void) printf("|F_CONT)");
-                       if (h->h_prevpg == P_NONE) {
-                               size_t longsz;
-                               (void) bcopy((char *) &(h->h_linp[0]),
-                                             (char *) &longsz,
-                                             sizeof(longsz));
-                               printf("\n\t\t(chain start, data length %ld)",
-                                       longsz);
-                       }
-                       printf("\n");
-                       continue;
-               }
-               (void) printf(")\n");
-               for (cur = 0; cur < top; cur++) {
-                       (void) printf("\t  [%d] off %d ", cur, h->h_linp[cur]);
-                       if (h->h_flags & F_LEAF) {
-                               d = (DATUM *) GETDATUM(h,cur);
-                               (void) printf("ksize %d", d->d_ksize);
-                               if (d->d_flags & D_BIGKEY)
-                                       (void) printf(" (indirect)");
-                               (void) printf("; dsize %d", d->d_dsize);
-                               if (d->d_flags & D_BIGDATA)
-                                       (void) printf(" (indirect)");
-                       } else {
-                               id = (IDATUM *) GETDATUM(h,cur);
-                               (void) printf("size %d pgno %d",
-                                       id->i_size, id->i_pgno);
-                               if (id->i_flags & D_BIGKEY)
-                                       (void) printf(" (indirect)");
-                       }
-                       (void) printf("\n");
-               }
-               (void) printf("\n");
-       }
+       register u_char *p1, *p2;
+       register int diff, len;
+
+       len = MIN(a->size, b->size);
+       for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+               if (diff = *p1 - *p2)
+                       return(diff);
+       return(a->size - b->size);
 }
 }
-#endif /* DEBUG */
 
 
-#ifdef DEBUG
-_punt()
+/*
+ * __BT_DEFPFX -- Default prefix routine.
+ *
+ * Parameters:
+ *     a:      DBT #1
+ *     b:      DBT #2
+ *
+ * Returns:
+ *     Number of bytes needed to distinguish b from a.
+ */
+int
+__bt_defpfx(a, b)
+       const DBT *a, *b;
 {
 {
-       int pid;
-
-       pid = getpid();
-       (void) kill(pid, SIGILL);
+       register u_char *p1, *p2;
+       register int len;
+       int cnt;
+
+       cnt = 1;
+       len = MIN(a->size, b->size);
+       for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt)
+               if (*p1 != *p2)
+                       return(cnt);
+       if (a->size == b->size)
+               return (a->size);
+       return(a->size + 1);
 }
 }
-#endif /* DEBUG */
index 078e319..565f29e 100644 (file)
 /*-
 /*-
- * Copyright (c) 1990 The Regents of the University of California.
+ * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Olson.
  *
  * %sccs.include.redist.c%
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Olson.
  *
  * %sccs.include.redist.c%
+ *
+ *     @(#)btree.h     5.3 (Berkeley) %G%
  */
 
  */
 
-/*
- *  @(#)btree.h        5.2 (Berkeley) %G%
- */
-
-typedef char   *BTREE;         /* should really be (void *) */ 
-
-/* #define     DEBUG */
-
-#define RET_ERROR      -1
-#define RET_SUCCESS     0
-#define RET_SPECIAL     1
-
-#ifndef TRUE
-#define TRUE   1
-#define FALSE  0
-#endif /* ndef TRUE */
-
-#ifndef NULL
-#define NULL   0
-#endif /* ndef NULL */
-
-/* these are defined in lrucache.c */
-extern char    *lruinit();
-extern char    *lruget();
-extern char    *lrugetnew();
-extern int     lrusync();
-extern int     lruwrite();
-extern int     lrurelease();
-extern void    lrufree();
-
-/* these are defined here */
-extern BTREE   bt_open();
-extern int     bt_close();
-extern int     bt_delete();
-extern int     bt_get();
-extern int     bt_put();
-extern int     bt_seq();
-extern int     bt_sync();
-
-/*
- *  Private types.  What you choose for these depends on how big you
- *  want to let files get, and how big you want to let pages get.
- */
-
-typedef u_long index_t;        /* so # bytes on a page fits in a long */
-typedef u_long pgno_t;         /* so # of pages in a btree fits in a long */
-
-/*
- *  When we do searches, we push the parent page numbers onto a stack
- *  as we descend the tree.  This is so that for insertions, we can
- *  find our way back up to do internal page insertions and splits.
- */
-
-typedef struct BTSTACK {
-       pgno_t          bts_pgno;
-       struct BTSTACK  *bts_next;
-} BTSTACK;
-
-/*
- *  Every btree page has a header that looks like this.  Flags are given
- *  in the #define's for the F_ flags (see below).
- */
-
-typedef struct BTHEADER {
-       pgno_t h_pgno;          /* page number of this page */
-       pgno_t h_prevpg;        /* left sibling */
-       pgno_t h_nextpg;        /* right sibling */
-
-#define F_LEAF         0x01    /* leaf page, contains user data */
-#define F_CONT         0x02    /* continuation page (large items) */
-#define F_DIRTY                0x04    /* need to write to disk */
-#define F_PRESERVE     0x08    /* never delete this chain of pages */
-
-       u_long h_flags;         /* page state */
-       index_t h_lower;        /* lower bound of free space on page */
-       index_t h_upper;        /* upper bound of free space on page */
-       index_t h_linp[1];      /* VARIABLE LENGTH DATA AT END OF STRUCT */
-} BTHEADER;
-
-/*
- *  HTBUCKETs are hash table buckets for looking up pages of in-memory
- *  btrees by page number.  We use this indirection, rather than direct
- *  pointers, so that the code for manipulating in-memory trees is the
- *  same as that for manipulating on-disk trees.
- */
-
-typedef struct HTBUCKET {
-       pgno_t          ht_pgno;
-       BTHEADER        *ht_page;
-       struct HTBUCKET *ht_next;
-} HTBUCKET;
-
-typedef HTBUCKET       **HTABLE;
-
-/* minimum size we'll let a page be */
-#define MINPSIZE       512
-
-/* default cache size, in bytes */
-#define DEFCACHE       (20 * 1024)
+#include <mpool.h>
 
 
-/* hash table size for in-memory trees */
-#define        HTSIZE          128
-
-/* generate a hash key from a page number */
-#define HASHKEY(pgno)  ((pgno - 1) % HTSIZE)
+#define        DEFMAXKEYPAGE   (0)             /* Maximum keys per page */
+#define        DEFMINKEYPAGE   (2)             /* Minimum keys per page */
+#define        MINCACHE        (5)             /* Minimum cached pages */
+#define        MINPSIZE        (512)           /* Minimum page size */
 
 /*
 
 /*
- *  Disk btrees have a file descriptor, and may also have an lru buffer
- *  cache, if the user asked for one.
+ * Page 0 of a btree file contains a BTMETA structure.  The rest of the first
+ * page is empty, so that all disk operations are page-aligned.  This page is
+ * also used as an out-of-band page, i.e. page pointers that point to nowhere
+ * point to page 0.  The m_nrecs field is used only by the RECNO code.  This is
+ * because the btree doesn't really need it and it requires that put or delete
+ * calls modify the meta data.
  */
  */
+#define        P_INVALID        0              /* Invalid tree page number. */
+#define        P_META           0              /* Tree meta-info page number. */
+#define        P_ROOT           1              /* Tree root page number. */
 
 
-typedef struct BTDISK {
-       int     d_fd;
-       char    *d_cache;
-} BTDISK;
+typedef struct BTMETA {
+       u_long  m_magic;                /* magic number */
+       u_long  m_version;              /* version */
+       u_long  m_psize;                /* page size */
+       u_long  m_free;                 /* page number of first free page */
+       u_long  m_nrecs;                /* R: number of records */
+#define        SAVEMETA        (BTF_NODUPS | BTF_RECNO)
+       u_long  m_flags;                /* bt_flags & SAVEMETA */
+       u_long  m_lorder;               /* byte order */
+} BTMETA;
 
 /*
 
 /*
- *  Cursors keep track of the current location in a sequential scan of
- *  the database.  Since btrees impose a total ordering on keys, we can
- *  walk forward or backward through the database from any point.  Cursors
- *  survive updates to the tree, and can be used to delete a particular
- *  record.
+ * There are five page layouts in the btree: btree internal pages, btree leaf
+ * pages, recno internal pages, recno leaf pages and overflow pages.  Each type
+ * of page starts with a page header as typed by PAGE.
  */
  */
-
-typedef struct CURSOR {
-       pgno_t          c_pgno;         /* pgno of current item in scan */
-       index_t         c_index;        /* index of current item in scan */
-       char            *c_key;         /* current key, used for updates */
-
-#define CRSR_BEFORE    0x01
-
-       u_char          c_flags;        /* to handle updates properly */
-} CURSOR;
+typedef struct PAGE {
+       pgno_t  pgno;                   /* this page's page number */
+       pgno_t  prevpg;                 /* left sibling */
+       pgno_t  nextpg;                 /* right sibling */
+
+#define        P_BINTERNAL     0x01            /* btree internal page */
+#define        P_BLEAF         0x02            /* btree leaf page */
+#define        P_OVERFLOW      0x04            /* overflow page */
+#define        P_RINTERNAL     0x08            /* recno internal page */
+#define        P_RLEAF         0x10            /* recno leaf page */
+#define P_TYPE         0x1f            /* type mask */
+
+#define        P_PRESERVE      0x20            /* never delete this chain of pages */
+       u_long  flags;
+
+       index_t lower;                  /* lower bound of free space on page */
+       index_t upper;                  /* upper bound of free space on page */
+       index_t linp[1];                /* long-aligned VARIABLE LENGTH DATA */
+} PAGE;
+
+/* First and next index. */
+#define        BTDATAOFF       (sizeof(PAGE) - sizeof(index_t))
+#define        NEXTINDEX(p)    (((p)->lower - BTDATAOFF) / sizeof(index_t))
 
 /*
 
 /*
- *  The private btree data structure.  The user passes a pointer to one of
- *  these when we are to manipulate a tree, but the BTREE type is opaque
- *  to him.
+ * For pages other than overflow pages, there is an array of offsets into the
+ * rest of the page immediately following the page header.  Each offset is to
+ * an item which is unique to the type of page.  The lower offset is just
+ * past the last filled-in index.  The upper offset is the first item on the
+ * page.  Offsets are from the beginning of the page.
+ *
+ * If an item is too big to store on a single page, a flag is set and the item
+ * is a { page, size } pair such that the page is the first page of an overflow
+ * chain with size bytes of item.  Overflow pages are simply bytes without any
+ * external structure.
+ *
+ * The size and page number fields in the items are long aligned so they can be
+ * manipulated without copying.
  */
  */
-
-typedef struct BTREEDATA_P {
-       char            *bt_fname;              /* NULL for in-memory trees */
-       union {
-               BTDISK  bt_d;                   /* for on-disk btrees */
-               HTABLE  bt_ht;                  /* hash table for mem trees */
-       } bt_s;
-       size_t          bt_psize;               /* page size for btree pages */
-       int             (*bt_compare)();        /* key comparison function */
-       pgno_t          bt_npages;              /* number of pages in tree */
-       BTHEADER        *bt_curpage;            /* current page contents */
-       pgno_t          bt_free;                /* free pg list for big data */
-       CURSOR          bt_cursor;              /* cursor for scans */
-       BTSTACK         *bt_stack;              /* parent stack for inserts */
-       u_long          bt_lorder;              /* byte order (endian.h) */
-
-#define BTF_METAOK     0x01    /* meta-data written to start of file */
-#define BTF_SEQINIT    0x02    /* we have called bt_seq */
-#define BTF_ISWRITE    0x04    /* tree was opened for write */
-#define BTF_NODUPS     0x08    /* tree created for unique keys */
-
-       u_long          bt_flags;               /* btree state */
-} BTREEDATA_P;
-
-typedef BTREEDATA_P    *BTREE_P;
+#define        LALIGN(l)       (((l) + sizeof(u_long) - 1) & ~(sizeof(u_long) - 1))
+#define        NOVFLSIZE       (sizeof(pgno_t) + sizeof(size_t))
 
 /*
 
 /*
- *  The first thing in a btree file is a BTMETA structure.  The rest of
- *  the first page is empty, so that all disk operations are page-aligned.
+ * For the btree internal pages, the item is a key.  BINTERNALs are {key, pgno}
+ * pairs, such that the key compares less than or equal to all of the records
+ * on that page.  For a tree without duplicate keys, an internal page with two
+ * consecutive keys, a and b, will have all records greater than or equal to a
+ * and less than b stored on the page associated with a.  Duplicate keys are
+ * somewhat special and can cause duplicate internal and leaf page records and
+ * some minor modifications of the above rule.
  */
  */
-
-typedef struct BTMETA {
-       u_long  m_magic;
-       u_long  m_version;
-       size_t  m_psize;
-       pgno_t  m_free;
-       u_long  m_flags;
-       u_long  m_lorder;
-} BTMETA;
-
-#define P_NONE         0               /* invalid page number in tree */
-#define P_ROOT         1               /* page number of root pg in btree */
-
-#define NORELEASE      0               /* don't release a page during write */
-#define RELEASE                1               /* release a page during write */
-
-#define INSERT         0               /* doing an insert operation */
-#define DELETE         1               /* doing a delete operation */
-
-/* get the next free index on a btree page */
-#define NEXTINDEX(p)   ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
-
-/* is a BTITEM actually on the btree page? */
-#define VALIDITEM(t, i)        ((i)->bti_index < NEXTINDEX((t)->bt_curpage))
-
-/* guarantee longword alignment so structure refs work */
-#define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
-
-/* get a particular datum (or idatum) off a page */
-#define GETDATUM(h,i)   (((char *) h) + h->h_linp[i])
-
-/* is a {key,datum} too big to put on a single page? */
-#define TOOBIG(t, sz)  (sz >= t->bt_psize / 5)
-
-/* is this a disk tree or a memory tree? */
-#define ISDISK(t)      (t->bt_fname != (char *) NULL)
-
-/* does the disk tree use a cache? */
-#define ISCACHE(t)     (t->bt_s.bt_d.d_cache != (char *) NULL)
+typedef struct BINTERNAL {
+       size_t  ksize;                  /* key size */
+       pgno_t  pgno;                   /* page number stored on */
+#define        P_BIGDATA       0x01            /* overflow data */
+#define        P_BIGKEY        0x02            /* overflow key */
+       u_char  flags;
+       char    bytes[1];               /* data */
+} BINTERNAL;
+
+/* Get the page's BINTERNAL structure at index indx. */
+#define        GETBINTERNAL(pg, indx) \
+       ((BINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry. */
+#define NBINTERNAL(len) \
+       LALIGN(sizeof(size_t) + sizeof(pgno_t) + sizeof(u_char) + (len))
+
+/* Copy a BINTERNAL entry to the page. */
+#define        WR_BINTERNAL(p, size, pgno, flags) { \
+       *(size_t *)p = size; \
+       p += sizeof(size_t); \
+       *(pgno_t *)p = pgno; \
+       p += sizeof(pgno_t); \
+       *(u_char *)p = flags; \
+       p += sizeof(u_char); \
+}
 
 /*
 
 /*
- *  DATUMs are for user data -- one appears on leaf pages for every
- *  tree entry.  The d_bytes[] array contains the key first, then the data.
- *
- *  If either the key or the datum is too big to store on a single page,
- *  a bit is set in the flags entry, and the d_bytes[] array contains a
- *  pgno pointing to the page at which the data is actually stored.
- *
- *  Note on alignment:  every DATUM is guaranteed to be longword aligned
- *  on the disk page.  In order to force longword alignment of user key
- *  and data values, we must guarantee that the d_bytes[] array starts
- *  on a longword boundary.  This is the reason that d_flags is a u_long,
- *  rather than a u_char (it really only needs to be two bits big).  This
- *  is necessary because we call the user's comparison function with a
- *  pointer to the start of the d_bytes array.  We don't need to force
- *  longword alignment of the data following the key, since that is copied
- *  to a longword-aligned buffer before being returned to the user.
+ * For the recno internal pages, the item is a page number with the number of
+ * keys found on that page and below.
  */
  */
-
-typedef struct DATUM {
-       size_t d_ksize;         /* size of key */
-       size_t d_dsize;         /* size of data */
-
-#define D_BIGDATA      0x01    /* indirect datum ptr flag */
-#define D_BIGKEY       0x02    /* indirect key ptr flag */
-
-       u_long d_flags;         /* flags (indirect bit) */
-       char d_bytes[1];        /* VARIABLE LENGTH DATA AT END OF STRUCT */
-} DATUM;
-
-/* BTITEMs are used to return (page, index, datum) tuples from searches */
-typedef struct BTITEM {
-       pgno_t bti_pgno;
-       index_t bti_index;
-       DATUM *bti_datum;
-} BTITEM;
+typedef struct RINTERNAL {
+       recno_t nrecs;                  /* number of records */
+       pgno_t  pgno;                   /* page number stored below */
+} RINTERNAL;
+
+/* Get the page's RINTERNAL structure at index indx. */
+#define        GETRINTERNAL(pg, indx) \
+       ((RINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry. */
+#define NRINTERNAL \
+       LALIGN(sizeof(recno_t) + sizeof(pgno_t))
+
+/* Copy a RINTERNAL entry to the page. */
+#define        WR_RINTERNAL(p, nrecs, pgno) { \
+       *(size_t *)p = nrecs; \
+       p += sizeof(recno_t); \
+       *(pgno_t *)p = pgno; \
+}
+
+/* For the btree leaf pages, the item is a key and data pair. */
+typedef struct BLEAF {
+       size_t  ksize;                  /* size of key */
+       size_t  dsize;                  /* size of data */
+       u_char  flags;                  /* P_BIGDATA, P_BIGKEY */
+       char    bytes[1];               /* data */
+} BLEAF;
+
+/* Get the page's BLEAF structure at index indx. */
+#define        GETBLEAF(pg, indx) \
+       ((BLEAF *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry. */
+#define NBLEAF(p) \
+       LALIGN(sizeof(size_t) + sizeof(size_t) + sizeof(u_char) + \
+           (p)->ksize + (p)->dsize)
+
+/* Get the number of bytes in the user's key/data pair. */
+#define NBLEAFDBT(ksize, dsize) \
+       LALIGN(sizeof(size_t) + sizeof(size_t) + sizeof(u_char) + \
+           (ksize) + (dsize))
+
+/* Copy a BLEAF entry to the page. */
+#define        WR_BLEAF(p, key, data, flags) { \
+       *(size_t *)p = key->size; \
+       p += sizeof(size_t); \
+       *(size_t *)p = data->size; \
+       p += sizeof(size_t); \
+       *(u_char *)p = flags; \
+       p += sizeof(u_char); \
+       bcopy(key->data, p, key->size); \
+       p += key->size; \
+       bcopy(data->data, p, data->size); \
+}
+
+/* For the recno leaf pages, the item is a data entry. */
+typedef struct RLEAF {
+       size_t  dsize;                  /* size of data */
+       u_char  flags;                  /* P_BIGDATA */
+       char    bytes[1];
+} RLEAF;
+
+/* Get the page's RLEAF structure at index indx. */
+#define        GETRLEAF(pg, indx) \
+       ((RLEAF *)((char *)(pg) + (pg)->linp[indx]))
+
+/* Get the number of bytes in the entry. */
+#define NRLEAF(p) \
+       LALIGN(sizeof(size_t) + sizeof(u_char) + (p)->dsize)
+
+/* Get the number of bytes from the user's data. */
+#define        NRLEAFDBT(dsize) \
+       LALIGN(sizeof(size_t) + sizeof(u_char) + (dsize))
+
+/* Copy a RLEAF entry to the page. */
+#define        WR_RLEAF(p, data, flags) { \
+       *(size_t *)p = data->size; \
+       p += sizeof(size_t); \
+       *(u_char *)p = flags; \
+       p += sizeof(u_char); \
+       bcopy(data->data, p, data->size); \
+}
 
 /*
 
 /*
- *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
- *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
- *  internal page contains keys (a) and (b) next to each other, then all
- *  items >= to (a) and < (b) go on the same page as (a).  There are some
- *  gotchas with duplicate keys, however.  See the split code for details.
+ * A record in the tree is either a pointer to a page and an index in the page
+ * or a page number and an index.  These structures are used as a cursor, stack
+ * entry and search returns as well as to pass records to other routines.
  *
  *
- *  If a key is too big to fit on a single page, then the i_bytes[] array
- *  contains a pgno pointing to the start of a chain that actually stores
- *  the bytes.  Since items on internal pages are never deleted from the
- *  tree, these indirect chains are marked as special, so that they won't
- *  be deleted if the corresponding leaf item is deleted.
- *
- *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
- *  in order to guarantee that user keys are longword aligned on the disk
- *  page.
+ * One comment about searches.  Internal page searches must find the largest
+ * record less than key in the tree so that descents work.  Leaf page searches
+ * must find the smallest record greater than key so that the returned index
+ * is the record's correct position for insertion.
  */
  */
-
-typedef struct IDATUM {
-       size_t i_size;
-       pgno_t i_pgno;
-       u_long i_flags;         /* see DATUM.d_flags, above */
-       char i_bytes[1];        /* VARIABLE LENGTH DATA AT END OF STRUCT */
-} IDATUM;
-
-/* all private interfaces have a leading _ in their names */
-extern BTITEM  *_bt_search();
-extern BTITEM  *_bt_searchr();
-extern BTHEADER        *_bt_allocpg();
-extern index_t _bt_binsrch();
-extern int     _bt_isonpage();
-extern BTITEM  *_bt_first();
-extern int     _bt_release();
-extern int     _bt_wrtmeta();
-extern int     _bt_delindir();
-extern int     _bt_pgout();
-extern int     _bt_pgin();
-extern int     _bt_fixscan();
-extern int     _bt_indirect();
-extern int     _bt_crsrdel();
-extern int     _bt_push();
-extern pgno_t  _bt_pop();
-extern int     strcmp();
-
+typedef struct EPGNO {
+       pgno_t  pgno;                   /* the page number */
+       index_t index;                  /* the index on the page */
+} EPGNO;
+
+typedef struct EPG {
+       PAGE    *page;                  /* the (pinned) page */
+       index_t  index;                 /* the index on the page */
+} EPG;
+
+/* The in-memory btree/recno data structure. */
+typedef struct BTREE {
+       MPOOL   *bt_mp;                 /* memory pool cookie */
+
+       DB      *bt_dbp;                /* pointer to enclosing DB */
+
+       EPGNO   bt_bcursor;             /* btree cursor */
+       recno_t bt_rcursor;             /* R: recno cursor */
+
+#define        BT_POP(t)       (t->bt_sp ? t->bt_stack + --t->bt_sp : NULL)
+#define        BT_CLR(t)       (t->bt_sp = 0)
+       EPGNO   *bt_stack;              /* stack of parent pages */
+       u_int   bt_sp;                  /* current stack pointer */
+       u_int   bt_maxstack;            /* largest stack */
+
+       char    *bt_kbuf;               /* key buffer */
+       size_t  bt_kbufsz;              /* key buffer size */
+       char    *bt_dbuf;               /* data buffer */
+       size_t  bt_dbufsz;              /* data buffer size */
+
+       int     bt_fd;                  /* tree file descriptor */
+       FILE    *bt_rfp;                /* R: record FILE pointer */
+       int     bt_rfd;                 /* R: record file descriptor */
+
+       pgno_t  bt_free;                /* next free page */
+       size_t  bt_psize;               /* page size */
+       int     bt_maxkeypage;          /* maximum keys per page */
+       size_t  bt_minkeypage;          /* minimum keys per page */
+       int     bt_lorder;              /* byte order */
+
+                                       /* sorted order */
+       enum { NOT, BACK, FORWARD, } bt_order;
+       EPGNO   bt_last;                /* last insert */
+
+                                       /* B: key comparison function */
+       int     (*bt_cmp) __P((const DBT *, const DBT *));
+                                       /* B: prefix comparison function */
+       int     (*bt_pfx) __P((const DBT *, const DBT *));
+
+                                       /* R: recno input function */
+       int     (*bt_irec) __P((struct BTREE *, recno_t));
+       recno_t bt_nrecs;               /* R: number of records in the tree */
+       caddr_t bt_smap;                /* R: start of mapped space */
+       caddr_t bt_emap;                /* R: end of mapped space */
+       size_t  bt_reclen;              /* R: fixed record length */
+       u_char  bt_bval;                /* R: delimiting byte/pad character */
+
+#define        BTF_DELCRSR     0x001           /* B: delete cursor when closes/moves */
+#define        BTF_FIXEDLEN    0x002           /* fixed length records */
+#define        BTF_INMEM       0x004           /* in-memory tree */
+#define        BTF_METADIRTY   0x008           /* B: need to write meta-data */
+#define        BTF_MODIFIED    0x010           /* tree modified */
+#define        BTF_NODUPS      0x020           /* no duplicate keys permitted */
+#define        BTF_RDONLY      0x040           /* read-only tree */
+#define        BTF_RECNO       0x080           /* record oriented tree */
+#define        BTF_SEQINIT     0x100           /* sequential scan initialized */
+       u_long          bt_flags;       /* btree state */
+} BTREE;
+
+#define        ISSET(t, f)     ((t)->bt_flags & (f))
+#define        NOTSET(t, f)    (!((t)->bt_flags & (f)))
+#define        SET(t, f)       ((t)->bt_flags |= (f))
+#define        UNSET(t, f)     ((t)->bt_flags &= ~(f))
+
+#include "extern.h"
diff --git a/usr/src/lib/libc/db/btree/extern.h b/usr/src/lib/libc/db/btree/extern.h
new file mode 100644 (file)
index 0000000..780e9c9
--- /dev/null
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ *
+ *     @(#)extern.h    5.1 (Berkeley) %G%
+ */
+
+int     __bt_close __P((DB *));
+int     __bt_cmp __P((BTREE *, const DBT *, EPG *));
+int     __bt_crsrdel __P((BTREE *, EPGNO *));
+int     __bt_defcmp __P((const DBT *, const DBT *));
+int     __bt_defpfx __P((const DBT *, const DBT *));
+int     __bt_delete __P((const DB *, const DBT *, u_int));
+int     __bt_dleaf __P((BTREE *, PAGE *, int));
+EPG    *__bt_first __P((BTREE *, DBT *, int *));
+int     __bt_get __P((const DB *, DBT *, DBT *, u_int));
+DB     *__bt_open __P((const char *, int, int, const BTREEINFO *));
+void    __bt_pgin __P((void *, pgno_t, void *));
+void    __bt_pgout __P((void *, pgno_t, void *));
+int     __bt_push __P((BTREE *, pgno_t, int));
+int     __bt_put __P((const DB *dbp, const DBT *, const DBT *, u_int));
+int     __bt_ret __P((BTREE *, EPG *, DBT *, DBT *));
+EPG    *__bt_search __P((BTREE *, const DBT *, int *));
+int     __bt_seq __P((const DB *, DBT *, DBT *, u_int));
+int     __bt_split __P((BTREE *, PAGE *,
+           const DBT *, const DBT *, u_long, size_t, int));
+int     __bt_sync __P((const DB *));
+
+int     __ovfl_delete __P((BTREE *, void *));
+int     __ovfl_get __P((BTREE *, void *, size_t *, char **, size_t *));
+int     __ovfl_put __P((BTREE *, const DBT *, pgno_t *));
+
+int     __rec_close __P((DB *));
+int     __rec_delete __P((const DB *, const DBT *, u_int));
+int     __rec_fmap __P((BTREE *, recno_t));
+int     __rec_fout __P((BTREE *));
+int     __rec_fpipe __P((BTREE *, recno_t));
+int     __rec_get __P((const DB *, DBT *, DBT *, u_int));
+int     __rec_iput __P((BTREE *, recno_t, const DBT *, u_int));
+int     __rec_put __P((const DB *dbp, const DBT *, const DBT *, u_int));
+int     __rec_ret __P((BTREE *, EPG *, DBT *));
+EPG    *__rec_search __P((BTREE *, recno_t, int *));
+int     __rec_seq __P((const DB *, DBT *, DBT *, u_int));
+int     __rec_sync __P((const DB *));
+int     __rec_vmap __P((BTREE *, recno_t));
+int     __rec_vout __P((BTREE *));
+int     __rec_vpipe __P((BTREE *, recno_t));
+
+#ifdef DEBUG
+void    __bt_dpage __P((PAGE *));
+void    __bt_dump __P((DB *));
+#endif
+#ifdef STATISTICS
+void    __bt_stat __P((DB *));
+#endif
diff --git a/usr/src/lib/libc/db/db/db.c b/usr/src/lib/libc/db/db/db.c
new file mode 100644 (file)
index 0000000..203da1e
--- /dev/null
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)db.c       5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#define        __DBINTERFACE_PRIVATE
+#include <db.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <errno.h>
+#include "../btree/btree.h"
+
+/*
+ * DBOPEN -- Open a database with the requested access method.
+ *
+ * Parameters:
+ *     fname:          file name, handed to the access method unchanged
+ *     flags, mode:    open flags and creation mode, handed on unchanged
+ *     type:           DB_BTREE, DB_HASH or DB_RECNO
+ *     openinfo:       access method specific information, handed on unchanged
+ *
+ * Returns:
+ *     The value returned by the selected open routine, or NULL with errno
+ *     set to EINVAL if the type is not one of the three above.
+ */
+DB *
+dbopen(fname, flags, mode, type, openinfo)
+       const char *fname;
+       int flags, mode;
+       DBTYPE type;
+       const void *openinfo;
+{
+       if (type == DB_BTREE)
+               return (__bt_open(fname, flags, mode, openinfo));
+       if (type == DB_HASH)
+               return (__hash_open(fname, flags, mode, openinfo));
+       if (type == DB_RECNO)
+               return (__rec_open(fname, flags, mode, openinfo));
+
+       /* Not an access method we know about. */
+       errno = EINVAL;
+       return (NULL);
+}
+
+/*
+ * __HASH_OPEN -- Placeholder open routine for the hash access method.
+ *
+ * All arguments are ignored.  Always returns NULL; errno is not set.
+ */
+DB *
+__hash_open(fname, flags, mode, openinfo)
+       const char *fname;
+       int flags, mode;
+       const HASHINFO *openinfo;
+{ return (NULL); }
+
+static int __db_edel __P((const DB *, const DBT *, u_int));
+static int __db_eget __P((const DB *, DBT *, DBT *, u_int));
+static int __db_eput __P((const DB *dbp, const DBT *, const DBT *, u_int));
+static int __db_eseq __P((const DB *, DBT *, DBT *, u_int));
+static int __db_esync __P((const DB *));
+
+/*
+ * __DBPANIC -- Disable a database handle.
+ *
+ * Parameters:
+ *     dbp:    pointer to the DB structure.
+ *
+ * The del, get, put, seq and sync methods are replaced by routines that
+ * unconditionally return RET_ERROR.  The close method is not touched.
+ */
+void
+__dbpanic(dbp)
+       DB *dbp;
+{
+       /* From here on, close is the only call that is not forced to fail. */
+       dbp->sync = __db_esync;
+       dbp->seq = __db_eseq;
+       dbp->put = __db_eput;
+       dbp->get = __db_eget;
+       dbp->del = __db_edel;
+}
+
+/* __DB_EDEL -- del method installed by __dbpanic(); always RET_ERROR. */
+static int
+__db_edel(dbp, key, flags)
+       const DB *dbp;
+       const DBT *key;
+       u_int flags;
+{
+       return (RET_ERROR);
+}
+
+/* __DB_EGET -- get method installed by __dbpanic(); always RET_ERROR. */
+static int
+__db_eget(dbp, key, data, flag)
+       const DB *dbp;
+       DBT *key, *data;
+       u_int flag;
+{
+       return (RET_ERROR);
+}
+
+/* __DB_EPUT -- put method installed by __dbpanic(); always RET_ERROR. */
+static int
+__db_eput(dbp, key, data, uflags)
+       const DB *dbp;
+       const DBT *key, *data;
+       u_int uflags;
+{
+       return (RET_ERROR);
+}
+
+/* __DB_ESEQ -- seq method installed by __dbpanic(); always RET_ERROR. */
+static int
+__db_eseq(dbp, key, data, flags)
+       const DB *dbp;
+       DBT *key, *data;
+       u_int flags;
+{
+       return (RET_ERROR);
+}
+
+/* __DB_ESYNC -- sync method installed by __dbpanic(); always RET_ERROR. */
+static int
+__db_esync(dbp)
+       const DB *dbp;
+{
+       return (RET_ERROR);
+}
diff --git a/usr/src/lib/libc/db/recno/extern.h b/usr/src/lib/libc/db/recno/extern.h
new file mode 100644 (file)
index 0000000..c8aca53
--- /dev/null
@@ -0,0 +1,26 @@
+/*-
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ *
+ *     @(#)extern.h    5.1 (Berkeley) %G%
+ */
+
+int     __rec_close __P((DB *));
+int     __rec_delete __P((const DB *, const DBT *, u_int));
+int     __rec_fmap __P((BTREE *, recno_t));
+int     __rec_fout __P((BTREE *));
+int     __rec_fpipe __P((BTREE *, recno_t));
+int     __rec_get __P((const DB *, DBT *, DBT *, u_int));
+int     __rec_iput __P((BTREE *, recno_t, const DBT *, u_int));
+int     __rec_put __P((const DB *dbp, const DBT *, const DBT *, u_int));
+int     __rec_ret __P((BTREE *, EPG *, DBT *));
+EPG    *__rec_search __P((BTREE *, recno_t, int *));
+int     __rec_seq __P((const DB *, DBT *, DBT *, u_int));
+int     __rec_sync __P((const DB *));
+int     __rec_vmap __P((BTREE *, recno_t));
+int     __rec_vout __P((BTREE *));
+int     __rec_vpipe __P((BTREE *, recno_t));
+
+#include "../btree/extern.h"
diff --git a/usr/src/lib/libc/db/recno/rec_close.c b/usr/src/lib/libc/db/recno/rec_close.c
new file mode 100644 (file)
index 0000000..b80ca0e
--- /dev/null
@@ -0,0 +1,97 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_close.c        5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <db.h>
+#include <unistd.h>
+#include <stdio.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_CLOSE -- Sync and close a recno tree.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *
+ * Returns:
+ *     RET_ERROR if __rec_sync() fails (in which case __bt_close() is not
+ *     called), otherwise whatever __bt_close() returns.
+ */
+int
+__rec_close(dbp)
+       DB *dbp;
+{
+       int status;
+
+       /* Write the records back first; only tear down if that worked. */
+       status = __rec_sync(dbp);
+       if (status != RET_ERROR)
+               status = __bt_close(dbp);
+       return (status);
+}
+
+/*
+ * __REC_SYNC -- sync the recno tree to disk.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.
+ *
+ * Nothing is done (RET_SUCCESS) for in-memory or unmodified trees.  A
+ * modified read-only tree fails with errno set to EPERM.  Otherwise any
+ * records not yet read are pulled into the tree, the file is rewritten
+ * from offset zero with every record followed by a newline, and the file
+ * is truncated at the end of the last record written.
+ *
+ * XXX
+ * Currently don't handle a key marked for deletion when the tree is synced.
+ * Should copy the page and write it out instead of the real page.
+ */
+int
+__rec_sync(dbp)
+       const DB *dbp;
+{
+       struct iovec iov[2];
+       BTREE *t;
+       DBT data, key;
+       off_t off;
+       recno_t scursor;
+       int status;
+
+       t = dbp->internal;
+
+       if (ISSET(t, BTF_INMEM) || NOTSET(t, BTF_MODIFIED))
+               return (RET_SUCCESS);
+
+       if (ISSET(t, BTF_RDONLY)) {
+               errno = EPERM;
+               return (RET_ERROR);
+       }
+
+       /* Suck any remaining records into the tree. */
+       if (t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
+               return (RET_ERROR);
+
+       /* Rewind the file descriptor. */
+       if (lseek(t->bt_rfd, 0L, SEEK_SET) != 0L)
+               return (RET_ERROR);
+
+       /* Second vector element is the record terminator. */
+       iov[1].iov_base = "\n";
+       iov[1].iov_len = 1;
+
+       /* The scan below moves the cursor; remember where it was. */
+       scursor = t->bt_rcursor;
+
+       status = (dbp->seq)(dbp, &key, &data, R_FIRST);
+       while (status == RET_SUCCESS) {
+               iov[0].iov_base = data.data;
+               iov[0].iov_len = data.size;
+               if (writev(t->bt_rfd, iov, 2) != data.size + 1) {
+                       /* Fall out of the loop so the cursor is restored. */
+                       status = RET_ERROR;
+                       break;
+               }
+               status = (dbp->seq)(dbp, &key, &data, R_NEXT);
+       }
+       t->bt_rcursor = scursor;
+       if (status == RET_ERROR)
+               return (RET_ERROR);
+
+       /*
+        * The records may now occupy less space than the old contents did;
+        * cut the file off at the current offset so that no stale bytes
+        * are left behind the last record.
+        */
+       if ((off = lseek(t->bt_rfd, (off_t)0, SEEK_CUR)) == -1)
+               return (RET_ERROR);
+       if (ftruncate(t->bt_rfd, off))
+               return (RET_ERROR);
+
+       UNSET(t, BTF_MODIFIED);
+       return (RET_SUCCESS);
+}
diff --git a/usr/src/lib/libc/db/recno/rec_delete.c b/usr/src/lib/libc/db/recno/rec_delete.c
new file mode 100644 (file)
index 0000000..9aa76d9
--- /dev/null
@@ -0,0 +1,172 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_delete.c       5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <errno.h>
+#include <db.h>
+#include <stdio.h>
+#include <string.h>
+#include "../btree/btree.h"
+
+static int rec_rdelete __P((BTREE *, recno_t));
+
+/*
+ * __REC_DELETE -- Delete the item(s) referenced by a key.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key to delete; a one-based record number, only examined
+ *             when flags is 0
+ *     flags:  0 to delete the record named by key, R_CURSOR to delete
+ *             what the cursor references
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
+ *     errno is set to EINVAL for a record number of zero or for any
+ *     other flags value.
+ */
+int
+__rec_delete(dbp, key, flags)
+       const DB *dbp;
+       const DBT *key;
+       u_int flags;
+{
+       BTREE *t;
+       recno_t nrec;
+       int status;
+
+       t = dbp->internal;
+       switch(flags) {
+       case 0:
+               /*
+                * The key is only meaningful here, so it is only read and
+                * validated here; a cursor delete does not depend on it.
+                */
+               if ((nrec = *(recno_t *)key->data) == 0) {
+                       errno = EINVAL;
+                       return (RET_ERROR);
+               }
+               /* rec_rdelete() takes a zero-based record number. */
+               status = rec_rdelete(t, nrec - 1);
+               break;
+       case R_CURSOR:
+               /*
+                * NOTE(review): the cursor value is passed on unadjusted,
+                * unlike the user key above -- confirm whether bt_rcursor
+                * is stored zero- or one-based.
+                */
+               status = rec_rdelete(t, t->bt_rcursor);
+               break;
+       default:
+               errno = EINVAL;
+               return (RET_ERROR);
+       }
+
+       if (status == RET_SUCCESS) {
+               --t->bt_nrecs;
+               SET(t, BTF_MODIFIED);
+       }
+       return (status);
+}
+
+/*
+ * REC_RDELETE -- Delete the data matching the specified key.
+ *
+ * Parameters:
+ *     tree:   tree
+ *     nrec:   record to delete (zero-based)
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
+ */
+static int
+rec_rdelete(t, nrec)
+       BTREE *t;
+       recno_t nrec;
+{
+       EPG *e;
+       EPGNO *parent;
+       PAGE *h;
+       int exact, status;
+
+       /*
+        * Find any matching record; __rec_search pins the page.  A NULL
+        * return means the search itself failed and there is no page to
+        * release, so it has to be handled before e is dereferenced.
+        */
+       if ((e = __rec_search(t, nrec, &exact)) == NULL)
+               return (RET_ERROR);
+       if (!exact) {
+               mpool_put(t->bt_mp, e->page, 0);
+               return (RET_SPECIAL);
+       }
+
+       /* Delete the record. */
+       h = e->page;
+       status = __rec_dleaf(t, h, e->index);
+       if (status == RET_SUCCESS)
+               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+       else {
+               mpool_put(t->bt_mp, h, 0);
+               return (status);
+       }
+
+       /* Decrement the count on all parent pages. */
+       while  ((parent = BT_POP(t)) != NULL) {
+               if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+                       return (RET_ERROR);
+               --GETRINTERNAL(h, parent->index)->nrecs;
+               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+       }
+       return (RET_SUCCESS);
+}
+
+/*
+ * __REC_DLEAF -- Delete a single record from a recno leaf page.
+ *
+ * Parameters:
+ *     t:      tree
+ *     h:      leaf page holding the record; modified in place
+ *     index:  index on current page to delete
+ *
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.  RET_ERROR is only returned when freeing
+ *     the record's overflow data fails; the page is untouched in that case.
+ */
+int
+__rec_dleaf(t, h, index)
+       BTREE *t;
+       PAGE *h;
+       int index;
+{
+       register RLEAF *rl;
+       register index_t *ip, offset;
+       register size_t nbytes;
+       register int cnt;
+       char *from;
+       void *to;
+
+       /*
+        * Delete a record from a recno leaf page.  Internal records are never
+        * deleted from internal pages, regardless of the records that caused
+        * them to be added being deleted.  Pages made empty by deletion are
+        * not reclaimed.  They are, however, made available for reuse.
+        *
+        * Pack the remaining entries at the end of the page, shift the indices
+        * down, overwriting the deleted record and its index.  If the record
+        * uses overflow pages, make them available for reuse.
+        */
+       to = rl = GETRLEAF(h, index);
+       if (rl->flags & P_BIGDATA && __ovfl_delete(t, rl->bytes) == RET_ERROR)
+               return (RET_ERROR);
+       /* Number of bytes the deleted record occupies on the page. */
+       nbytes = NRLEAF(rl);
+
+       /*
+        * Compress the key/data pairs.  Compress and adjust the [BR]LEAF
+        * offsets.  Reset the headers.
+        *
+        * Everything stored between h->upper and the deleted record is slid
+        * up by nbytes, over the top of the deleted record.  The source and
+        * destination overlap.
+        */
+       from = (char *)h + h->upper;
+       bcopy(from, from + nbytes, (char *)to - from);
+       h->upper += nbytes;
+
+       /*
+        * First loop: index slots before the deleted one stay where they are,
+        * but any that point below the deleted record moved up by nbytes.
+        * Second loop: slots after the deleted one are each copied down one
+        * position, with the same adjustment applied.
+        */
+       offset = h->linp[index];
+       for (cnt = &h->linp[index] - (ip = &h->linp[0]); cnt--; ++ip)
+               if (ip[0] < offset)
+                       ip[0] += nbytes;
+       for (cnt = &h->linp[NEXTINDEX(h)] - ip; --cnt; ++ip)
+               ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
+       /* One fewer slot in the index array. */
+       h->lower -= sizeof(index_t);
+       return (RET_SUCCESS);
+}
diff --git a/usr/src/lib/libc/db/recno/rec_get.c b/usr/src/lib/libc/db/recno/rec_get.c
new file mode 100644 (file)
index 0000000..136d29f
--- /dev/null
@@ -0,0 +1,271 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_get.c  5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <errno.h>
+#include <db.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_GET -- Get a record from the btree.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key to find; a one-based record number
+ *     data:   data to return
+ *     flags:  must be zero
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
+ *     Non-zero flags or a record number of zero fail with EINVAL.
+ */
+int
+__rec_get(dbp, key, data, flags)
+       const DB *dbp;
+       DBT *key, *data;
+       u_int flags;
+{
+       BTREE *t;
+       EPG *ep;
+       recno_t recno;
+       int found, rval;
+
+       if (flags != 0)
+               goto einval;
+       recno = *(recno_t *)key->data;
+       if (recno == 0)
+               goto einval;
+
+       /*
+        * A record beyond what the tree holds may still be waiting in the
+        * original file; ask the input routine to read up to it.
+        */
+       t = dbp->internal;
+       if (recno > t->bt_nrecs) {
+               rval = t->bt_irec(t, recno);
+               if (rval != RET_SUCCESS)
+                       return (rval);
+       }
+
+       /* The search takes a zero-based number and pins the page. */
+       ep = __rec_search(t, recno - 1, &found);
+       if (ep == NULL)
+               return (RET_ERROR);
+
+       rval = found ? __rec_ret(t, ep, data) : RET_SPECIAL;
+       mpool_put(t->bt_mp, ep->page, 0);
+       return (rval);
+
+einval: errno = EINVAL;
+       return (RET_ERROR);
+}
+
+/*
+ * __REC_FPIPE -- Get fixed length records from a pipe.
+ *
+ * Parameters:
+ *     t:      tree
+ *     top:    read until the tree holds this many records
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS, or RET_SPECIAL if the input ran out before
+ *     the tree held top records.
+ *
+ * A final record cut short by end-of-file is padded with bt_bval.
+ *
+ * XXX
+ * The end-of-file flag is a function static, so it is shared by every
+ * tree that uses this routine.
+ */
+int
+__rec_fpipe(t, top)
+       BTREE *t;
+       recno_t top;
+{
+       static int eof;
+       DBT data;
+       recno_t nrec;
+       size_t len;
+       int ch;
+       char *p;
+       void *nbuf;
+
+       if (eof)
+               return (RET_SPECIAL);
+
+       /* Keep the old buffer reachable if the allocation fails. */
+       if (t->bt_dbufsz < t->bt_reclen) {
+               if ((nbuf = realloc(t->bt_dbuf, t->bt_reclen)) == NULL)
+                       return (RET_ERROR);
+               t->bt_dbuf = nbuf;
+               t->bt_dbufsz = t->bt_reclen;
+       }
+
+       /* Only now is the buffer address final. */
+       data.data = t->bt_dbuf;
+       data.size = t->bt_reclen;
+
+       for (nrec = t->bt_nrecs; nrec < top; ++nrec) {
+               /* Read exactly one record's worth of bytes, or up to EOF. */
+               ch = 0;
+               for (p = t->bt_dbuf, len = t->bt_reclen; len > 0; --len) {
+                       if ((ch = getc(t->bt_rfp)) == EOF)
+                               break;
+                       *p++ = ch;
+               }
+               if (ch == EOF) {
+                       /* EOF on a record boundary: nothing to store. */
+                       if (p == t->bt_dbuf) {
+                               eof = 1;
+                               return (RET_SPECIAL);
+                       }
+                       /* Short last record: pad it to the fixed length. */
+                       memset(p, t->bt_bval, len);
+               }
+               if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+                       return (RET_ERROR);
+               if (ch == EOF) {
+                       /* Count the record just stored, then stop. */
+                       eof = 1;
+                       ++nrec;
+                       break;
+               }
+       }
+       return (nrec < top ? RET_SPECIAL : RET_SUCCESS);
+}
+
+/*
+ * __REC_VPIPE -- Get variable length records from a pipe.
+ *
+ * Parameters:
+ *     t:      tree
+ *     top:    read until the tree holds this many records
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS, or RET_SPECIAL if the input ran out before
+ *     the tree held top records.
+ *
+ * Records are separated by the byte bt_bval.  Bytes left over at
+ * end-of-file without a separator are stored as a final record; an
+ * end-of-file directly after a separator stores nothing.
+ *
+ * XXX
+ * The end-of-file flag is a function static, so it is shared by every
+ * tree that uses this routine.
+ */
+int
+__rec_vpipe(t, top)
+       BTREE *t;
+       recno_t top;
+{
+       static int eof;
+       DBT data;
+       recno_t nrec;
+       size_t len, sz;
+       int bval, ch;
+       char *p;
+       void *nbuf;
+
+       if (eof)
+               return (RET_SPECIAL);
+
+       bval = t->bt_bval;
+       for (nrec = t->bt_nrecs; nrec < top; ++nrec) {
+               /* sz counts the bytes still free in the buffer. */
+               for (p = t->bt_dbuf, sz = t->bt_dbufsz;; *p++ = ch, --sz) {
+                       if ((ch = getc(t->bt_rfp)) == EOF || ch == bval)
+                               break;
+                       if (sz == 0) {
+                               /* Grow by 256; keep the old buffer on error. */
+                               len = p - t->bt_dbuf;
+                               nbuf = realloc(t->bt_dbuf, t->bt_dbufsz + 256);
+                               if (nbuf == NULL)
+                                       return (RET_ERROR);
+                               t->bt_dbuf = nbuf;
+                               t->bt_dbufsz += 256;
+                               sz = 256;
+                               p = t->bt_dbuf + len;
+                       }
+               }
+               data.data = t->bt_dbuf;
+               data.size = p - t->bt_dbuf;
+
+               /* EOF with nothing collected: there is no record here. */
+               if (ch == EOF && data.size == 0) {
+                       eof = 1;
+                       return (RET_SPECIAL);
+               }
+               if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+                       return (RET_ERROR);
+               if (ch == EOF) {
+                       /* Count the unterminated last record, then stop. */
+                       eof = 1;
+                       ++nrec;
+                       break;
+               }
+       }
+       return (nrec < top ? RET_SPECIAL : RET_SUCCESS);
+}
+
+/*
+ * __REC_FMAP -- Get fixed length records from a file.
+ *
+ * Parameters:
+ *     t:      tree
+ *     top:    read until the tree holds this many records
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS, or RET_SPECIAL if the region between
+ *     bt_smap and bt_emap is exhausted before the tree holds top records.
+ *
+ * A final record cut short by the end of the region is padded with bt_bval.
+ *
+ * XXX
+ * The end-of-file flag is a function static, so it is shared by every
+ * tree that uses this routine.
+ */
+int
+__rec_fmap(t, top)
+       BTREE *t;
+       recno_t top;
+{
+       static int eof;
+       DBT data;
+       recno_t nrec;
+       caddr_t sp, ep;
+       size_t len;
+       char *p;
+       void *nbuf;
+
+       if (eof)
+               return (RET_SPECIAL);
+
+       /* Keep the old buffer reachable if the allocation fails. */
+       if (t->bt_dbufsz < t->bt_reclen) {
+               if ((nbuf = realloc(t->bt_dbuf, t->bt_reclen)) == NULL)
+                       return (RET_ERROR);
+               t->bt_dbuf = nbuf;
+               t->bt_dbufsz = t->bt_reclen;
+       }
+
+       /* Only now is the buffer address final. */
+       data.data = t->bt_dbuf;
+       data.size = t->bt_reclen;
+
+       sp = t->bt_smap;
+       ep = t->bt_emap;
+       for (nrec = t->bt_nrecs; nrec < top; ++nrec) {
+               if (sp >= ep) {
+                       eof = 1;
+                       return (RET_SPECIAL);
+               }
+               /*
+                * Copy up to one record.  len is decremented only after a
+                * byte has been copied, so on exit it is exactly the number
+                * of bytes still missing and can never wrap around.
+                */
+               for (p = t->bt_dbuf, len = t->bt_reclen;
+                   sp < ep && len > 0; --len)
+                       *p++ = *sp++;
+               if (len > 0)
+                       memset(p, t->bt_bval, len);
+               if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+                       return (RET_ERROR);
+               /* Record is in the tree; never read these bytes again. */
+               t->bt_smap = sp;
+       }
+       return (RET_SUCCESS);
+}
+
+/*
+ * __REC_VMAP -- Get variable length records from a file.
+ *
+ * Parameters:
+ *     t:      tree
+ *     top:    read until the tree holds this many records
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS, or RET_SPECIAL if the region between
+ *     bt_smap and bt_emap is exhausted before the tree holds top records.
+ *
+ * Records are separated by the byte bt_bval.  bt_smap is advanced only
+ * when RET_SUCCESS is returned.
+ */
+int
+__rec_vmap(t, top)
+       BTREE *t;
+       recno_t top;
+{
+       static int eof;
+       DBT data;
+       recno_t nrec;
+       caddr_t cur, end;
+       int sep;
+
+       if (eof)
+               return (RET_SPECIAL);
+
+       cur = t->bt_smap;
+       end = t->bt_emap;
+       sep = t->bt_bval;
+
+       nrec = t->bt_nrecs;
+       while (nrec < top) {
+               if (cur >= end) {
+                       eof = 1;
+                       return (RET_SPECIAL);
+               }
+
+               /* The record runs from here to the separator or the end. */
+               data.data = cur;
+               while (cur < end && *cur != sep)
+                       ++cur;
+               data.size = cur - (caddr_t)data.data;
+
+               if (__rec_iput(t, nrec, &data, 0) != RET_SUCCESS)
+                       return (RET_ERROR);
+
+               /* Step over the separator. */
+               ++cur;
+               ++nrec;
+       }
+       t->bt_smap = cur;
+       return (RET_SUCCESS);
+}
diff --git a/usr/src/lib/libc/db/recno/rec_open.c b/usr/src/lib/libc/db/recno/rec_open.c
new file mode 100644 (file)
index 0000000..99363ad
--- /dev/null
@@ -0,0 +1,127 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_open.c 5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <limits.h>
+#include <db.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_OPEN -- Open a recno tree on top of a flat file or pipe.
+ *
+ * Parameters:
+ *     fname:          file holding the records; opened with flags and mode
+ *     flags, mode:    as for open(2)
+ *     openinfo:       recno specific information, may be NULL
+ *
+ * Returns:
+ *     A DB handle using the recno methods, or NULL with errno set.
+ *     R_FIXEDLEN with a record length of zero fails with EINVAL.
+ */
+DB *
+__rec_open(fname, flags, mode, openinfo)
+       const char *fname;
+       int flags, mode;
+       const RECNOINFO *openinfo;
+{
+       BTREE *t;
+       BTREEINFO btopeninfo;
+       DB *dbp;
+       PAGE *h;
+       struct stat sb;
+       int rfd, sverrno;
+
+       /* Open the user's file -- if this fails, we're done. */
+       if ((rfd = open(fname, flags, mode)) < 0)
+               return (NULL);
+
+       /* Create a btree in memory (backed by disk). */
+       if (openinfo) {
+               btopeninfo.flags = 0;
+               btopeninfo.cachesize = openinfo->cachesize;
+               btopeninfo.psize = 0;
+               btopeninfo.compare = NULL;
+               btopeninfo.lorder = openinfo->lorder;
+               dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo);
+       } else
+               dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL);
+       if (dbp == NULL) {
+               (void)close(rfd);
+               return (NULL);
+       }
+
+       /*
+        * Some fields in the tree structure are recno specific.  Fill them
+        * in and make the btree structure look like a recno structure.
+        *
+        * The flag word is reset first, so that the recno flags set below
+        * are not wiped out again afterwards.
+        * NOTE(review): this assignment discards whatever flags __bt_open()
+        * left in bt_flags, as the original code did -- confirm that is
+        * intended.
+        */
+       t = dbp->internal;
+       t->bt_flags = BTF_RECNO;
+       if (openinfo) {
+               t->bt_reclen = openinfo->reclen;
+               if (openinfo->flags & R_FIXEDLEN) {
+                       /* A record length only matters for fixed records. */
+                       if (t->bt_reclen == 0) {
+                               errno = EINVAL;
+                               goto err;
+                       }
+                       SET(t, BTF_FIXEDLEN);
+               }
+               t->bt_bval = openinfo->bval;
+       } else
+               t->bt_bval = '\n';
+
+       /*
+        * In 4.4BSD stat(2) returns true for ISSOCK on pipes.  Until then,
+        * this is fairly close.  Pipes are read-only.
+        */
+       if (lseek(rfd, 0L, SEEK_CUR) == -1 && errno == ESPIPE) {
+               SET(t, BTF_RDONLY);
+               if ((t->bt_rfp = fdopen(rfd, "r")) == NULL)
+                       goto err;
+               t->bt_irec = ISSET(t, BTF_FIXEDLEN) ? __rec_fpipe : __rec_vpipe;
+       } else {
+               if (fstat(rfd, &sb))
+                       goto err;
+               if (!(flags & (O_RDWR | O_WRONLY)))
+                       SET(t, BTF_RDONLY);
+               /* mmap(2) reports failure as -1, not as NULL. */
+               if ((t->bt_smap = mmap(NULL, sb.st_size, PROT_READ, MAP_FILE,
+                   rfd, (off_t)0)) == (caddr_t)-1)
+                       goto err;
+               t->bt_emap = t->bt_smap + sb.st_size;
+               t->bt_rfd = rfd;
+               t->bt_irec = ISSET(t, BTF_FIXEDLEN) ? __rec_fmap : __rec_vmap;
+       }
+
+       /* Use the recno routines. */
+       dbp->close = __rec_close;
+       dbp->del = __rec_delete;
+       dbp->get = __rec_get;
+       dbp->put = __rec_put;
+       dbp->seq = __rec_seq;
+       dbp->sync = __rec_sync;
+
+       /* If the root page was created, reset the flags. */
+       if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL)
+               goto err;
+       if ((h->flags & P_TYPE) == P_BLEAF) {
+               h->flags = (h->flags & ~P_TYPE) | P_RLEAF;
+               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+       } else
+               mpool_put(t->bt_mp, h, 0);
+
+       /* A snapshot reads the whole input up front. */
+       if (openinfo && openinfo->flags & R_SNAPSHOT &&
+           t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
+               goto err;
+       return (dbp);
+
+       /* Report the original failure, not one from the cleanup calls. */
+err:   sverrno = errno;
+       (void)__bt_close(dbp);
+       (void)close(rfd);
+       errno = sverrno;
+       return (NULL);
+}
diff --git a/usr/src/lib/libc/db/recno/rec_put.c b/usr/src/lib/libc/db/recno/rec_put.c
new file mode 100644 (file)
index 0000000..3b70dc6
--- /dev/null
@@ -0,0 +1,197 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_put.c  5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <errno.h>
+#include <db.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_PUT -- Add a recno item to the tree.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    key; a one-based record number
+ *     data:   data
+ *     flags:  0, R_IAFTER, R_IBEFORE or R_NOOVERWRITE
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the
+ *     tree and R_NOOVERWRITE specified.  Any other flags value, or a
+ *     record number of zero, fails with EINVAL.
+ */
+int
+__rec_put(dbp, key, data, flags)
+       const DB *dbp;
+       const DBT *key, *data;
+       u_int flags;
+{
+       BTREE *t;
+       DBT tdata;
+       recno_t nrec;
+       int status;
+
+       if ((flags != 0 && flags != R_IAFTER &&
+           flags != R_IBEFORE && flags != R_NOOVERWRITE) ||
+           (nrec = *(recno_t *)key->data) == 0) {
+               errno = EINVAL;
+               return (RET_ERROR);
+       }
+
+       /*
+        * If skipping records, either get them from the original file or
+        * create empty ones.
+        */
+       t = dbp->internal;
+       if (nrec > t->bt_nrecs && t->bt_irec(t, nrec) == RET_ERROR)
+               return (RET_ERROR);
+       if (nrec > t->bt_nrecs + 1) {
+               /*
+                * Fill the gap below the target with empty records.  Each
+                * one is appended at the current end of the tree, which is
+                * the zero-based position bt_nrecs; __rec_iput() bumps
+                * bt_nrecs on success, so the loop terminates.  The target
+                * slot itself is left for the real insert below.
+                */
+               tdata.data = NULL;
+               tdata.size = 0;
+               while (nrec > t->bt_nrecs + 1) {
+                       status = __rec_iput(t, t->bt_nrecs, &tdata, 0);
+                       if (status != RET_SUCCESS)
+                               return (RET_ERROR);
+                       SET(t, BTF_MODIFIED);
+               }
+       }
+
+       /* __rec_iput() takes a zero-based record number. */
+       --nrec;
+       if ((status = __rec_iput(t, nrec, data, flags)) == RET_SUCCESS)
+               SET(t, BTF_MODIFIED);
+       return (status);
+}
+
+/*
+ * __REC_IPUT -- Add a recno item to the tree.
+ *
+ * Parameters:
+ *     t:      tree
+ *     nrec:   record number, as passed to __rec_search()
+ *     data:   data
+ *     flags:  0, R_IAFTER, R_IBEFORE or R_NOOVERWRITE
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the
+ *     tree and R_NOOVERWRITE specified.  R_IAFTER and R_IBEFORE fail with
+ *     EINVAL when the record does not exist.
+ */
+int
+__rec_iput(t, nrec, data, flags)
+       BTREE *t;
+       recno_t nrec;
+       const DBT *data;
+       u_int flags;
+{
+       DBT tdata;
+       EPG *e;
+       EPGNO *parent;
+       PAGE *h;
+       index_t index, nxtindex;
+       pgno_t pg;
+       size_t nbytes;
+       int dflags, exact;
+       char *dest, db[NOVFLSIZE];
+
+       /*
+        * If the data won't fit on a page, store it on indirect pages.
+        * What then goes on the leaf is a NOVFLSIZE stand-in holding the
+        * overflow page number followed by the real data size.
+        *
+        * XXX
+        * If the insert fails later on, these pages aren't recovered.
+        */
+       if (data->size >= t->bt_minkeypage) {
+               if (__ovfl_put(t, data, &pg) == RET_ERROR)
+                       return (RET_ERROR);
+               tdata.data = db;
+               tdata.size = NOVFLSIZE;
+               *(pgno_t *)db = pg;
+               *(size_t *)(db + sizeof(pgno_t)) = data->size;
+               dflags = P_BIGDATA;
+               data = &tdata;
+       } else
+               dflags = 0;
+
+       /* __rec_search pins the returned page. */
+       if ((e = __rec_search(t, nrec, &exact)) == NULL)
+               return (RET_ERROR);
+
+       h = e->page;
+       index = e->index;
+
+       /*
+        * Add the specified key/data pair to the tree.  If an identical key
+        * is already in the tree, and R_NOOVERWRITE is set, an error is
+        * returned.  If R_NOOVERWRITE is not set, the key is either added (if
+        * duplicates are permitted) or an error is returned.  The R_IAFTER
+        * and R_IBEFORE flags insert the key after/before the specified key.
+        *
+        * Pages are split as required.
+        *
+        * Every early exit from the switch clears the parent stack with
+        * BT_CLR() and unpins the page without marking it dirty.
+        */
+       switch (flags) {
+       case R_IAFTER:
+               if (!exact) {
+                       errno = EINVAL;
+                       goto err;
+               }
+               ++index;
+               break;
+       case R_IBEFORE:
+               if (!exact) {
+                       errno = EINVAL;
+                       goto err;
+               }
+               break;
+       case R_NOOVERWRITE:
+               if (!exact)
+                       break;
+               BT_CLR(t);
+               mpool_put(t->bt_mp, h, 0);
+               return (RET_SPECIAL);
+       default:
+               /*
+                * An existing record is only removed first when BTF_NODUPS
+                * is set; otherwise the new record is inserted at this index
+                * and the old one stays.
+                * NOTE(review): when the old record is removed here, the
+                * counts below are still incremented as for a fresh insert
+                * -- confirm the record counts stay correct on replacement.
+                */
+               if (!exact || NOTSET(t, BTF_NODUPS))
+                       break;
+               if (__rec_dleaf(t, h, index) == RET_ERROR) {
+                       /* Shared error exit; also reached by goto from above. */
+err:                   BT_CLR(t);
+                       mpool_put(t->bt_mp, h, 0);
+                       return (RET_ERROR);
+               }
+               break;
+       }
+
+       /*
+        * If not enough room, split the page.  The split code will insert
+        * the key and data and unpin the current page.  If inserting into
+        * the offset array, shift the pointers up.
+        *
+        * NOTE(review): the split path returns directly, skipping both the
+        * parent count loop and the ++t->bt_nrecs at the bottom of this
+        * function -- confirm __bt_split() accounts for them.
+        */
+       nbytes = NRLEAFDBT(data->size);
+       if (h->upper - h->lower < nbytes + sizeof(index_t))
+               return (__bt_split(t, h, NULL, data, dflags, nbytes, index));
+
+       /* Open a slot in the index array at the insertion point. */
+       if (index < (nxtindex = NEXTINDEX(h)))
+               bcopy(h->linp + index, h->linp + index + 1,
+                   (nxtindex - index) * sizeof(index_t));
+       h->lower += sizeof(index_t);
+
+       /* Carve nbytes off the data area and write the record there. */
+       h->linp[index] = h->upper -= nbytes;
+       dest = (char *)h + h->upper;
+       WR_RLEAF(dest, data, dflags);
+
+       mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+
+       /* Increment the count on all parent pages. */
+       while  ((parent = BT_POP(t)) != NULL) {
+               if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+                       return (RET_ERROR);
+               ++GETRINTERNAL(h, parent->index)->nrecs;
+               mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+       }
+       ++t->bt_nrecs;
+       return (RET_SUCCESS);
+}
diff --git a/usr/src/lib/libc/db/recno/rec_search.c b/usr/src/lib/libc/db/recno/rec_search.c
new file mode 100644 (file)
index 0000000..dd00a66
--- /dev/null
@@ -0,0 +1,79 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_search.c       5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/types.h>
+#include <errno.h>
+#include <db.h>
+#include <stdio.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_SEARCH -- Search a recno tree for a record number.
+ *
+ * Parameters:
+ *     t:      tree to search
+ *     recno:  zero-based record number to find
+ *     exactp: pointer to exact match flag
+ *
+ * Returns:
+ *     EPG for matching record, if any, or the EPG for the location of the
+ *     key, if it were inserted into the tree.  The page in the EPG is
+ *     pinned and must be released by the caller.  NULL is returned on
+ *     error; errno is set to EINVAL if the record number lies more than
+ *     one position past the end of the leaf reached.
+ *
+ * The internal pages visited on the way down are pushed on the tree's
+ * parent stack.
+ *
+ * Warnings:
+ *     The EPG returned is in static memory, and will be overwritten by the
+ *     next search of any kind in any tree.
+ */
+EPG *
+__rec_search(t, recno, exactp)
+       BTREE *t;
+       recno_t recno;
+       int *exactp;
+{
+       static EPG e;
+       register index_t index;
+       register PAGE *h;
+       RINTERNAL *r;
+       pgno_t pg;
+       index_t top;
+       recno_t total;
+
+       /* total is the number of records in subtrees already skipped. */
+       for (pg = P_ROOT, total = 0;;) {
+               if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+                       return (NULL);
+               if (h->flags & P_RLEAF) {
+                       e.page = h;
+                       e.index = recno - total;
+                       top = NEXTINDEX(h);
+
+                       if (e.index > top) {
+                               mpool_put(t->bt_mp, h, 0);
+                               errno = EINVAL;
+                               return (NULL);
+                       }
+
+                       /* An index equal to top is the append position. */
+                       *exactp = e.index < top ? 1 : 0;
+                       return (&e);
+               }
+
+               /*
+                * A child holding nrecs records covers the record numbers
+                * total .. total + nrecs - 1, so it is the one to descend
+                * into only if total + nrecs is strictly greater than recno.
+                * Numbers past everything fall through to the last child.
+                */
+               for (index = 0, top = NEXTINDEX(h);;) {
+                       r = GETRINTERNAL(h, index);
+                       if (++index == top || total + r->nrecs > recno)
+                               break;
+                       total += r->nrecs;
+               }
+
+               /* Don't leave the page pinned if the push fails. */
+               if (bt_push(t, h->pgno, index - 1) == RET_ERROR) {
+                       mpool_put(t->bt_mp, h, 0);
+                       return (NULL);
+               }
+
+               pg = r->pgno;
+               mpool_put(t->bt_mp, h, 0);
+       }
+}
diff --git a/usr/src/lib/libc/db/recno/rec_seq.c b/usr/src/lib/libc/db/recno/rec_seq.c
new file mode 100644 (file)
index 0000000..f63f4c7
--- /dev/null
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#ifndef lint
+static char sccsid[] = "@(#)rec_seq.c  5.1 (Berkeley) %G%";
+#endif /* not lint */
+
+#include <sys/types.h>
+#include <errno.h>
+#include <db.h>
+#include <limits.h>
+#include <stdio.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_SEQ -- Recno sequential scan interface.
+ *
+ * Parameters:
+ *     dbp:    pointer to access method
+ *     key:    record number to position at; read for R_CURSOR only.
+ *             NOTE(review): nothing here stores the record number found
+ *             back through key.
+ *     data:   data return value
+ *     flags:  R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
+ *
+ * Returns:
+ *     RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
+ *     errno is set to EINVAL for a zero R_CURSOR record number or an
+ *     unknown flags value.
+ */
+int
+__rec_seq(dbp, key, data, flags)
+       const DB *dbp;
+       DBT *key, *data;
+       u_int flags;
+{
+       BTREE *t;
+       EPG *e;
+       recno_t nrec;
+       int exact, status;
+
+       t = dbp->internal;
+
+       /* Work out the (one-based) record number being asked for. */
+       switch(flags) {
+       case R_CURSOR:
+               if ((nrec = *(recno_t *)key->data) == 0) {
+                       errno = EINVAL;
+                       return (RET_ERROR);
+               }
+               break;
+       case R_NEXT:
+               if (ISSET(t, BTF_SEQINIT)) {
+                       nrec = t->bt_rcursor + 1;
+                       break;
+               }
+               /* No scan in progress: behave like R_FIRST. */
+               /* FALLTHROUGH */
+       case R_FIRST:
+               nrec = 1;
+               SET(t, BTF_SEQINIT);
+               break;
+       case R_PREV:
+               if (ISSET(t, BTF_SEQINIT)) {
+                       nrec = t->bt_rcursor - 1;
+                       break;
+               }
+               /* No scan in progress: behave like R_LAST. */
+               /* FALLTHROUGH */
+       case R_LAST:
+               /*
+                * Presumably bt_irec reads input records up to the given
+                * number so that bt_nrecs is final -- TODO confirm.
+                */
+               if (t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
+                       return (RET_ERROR);
+               nrec = t->bt_nrecs;
+               SET(t, BTF_SEQINIT);
+               break;
+       default:
+               errno = EINVAL;
+               return (RET_ERROR);
+       }
+
+       /*
+        * Stepping back from record 1, or asking for the last record of an
+        * empty tree, yields record 0.  There is no such record, and the
+        * nrec - 1 passed to the search below would wrap around.
+        */
+       if (nrec == 0)
+               return (RET_SPECIAL);
+
+       if (nrec > t->bt_nrecs && (status = t->bt_irec(t, nrec)) != RET_SUCCESS)
+               return (status);
+
+       /* __rec_search takes a zero-based record number. */
+       if ((e = __rec_search(t, nrec - 1, &exact)) == NULL)
+               return (RET_ERROR);
+
+       if (!exact) {
+               mpool_put(t->bt_mp, e->page, 0);
+               return (RET_SPECIAL);
+       }
+
+       /* The cursor only moves when a record was actually returned. */
+       if ((status = __rec_ret(t, e, data)) == RET_SUCCESS)
+               t->bt_rcursor = nrec;
+       mpool_put(t->bt_mp, e->page, 0);
+       return (status);
+}
diff --git a/usr/src/lib/libc/db/recno/rec_utils.c b/usr/src/lib/libc/db/recno/rec_utils.c
new file mode 100644 (file)
index 0000000..1a7a7cc
--- /dev/null
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * %sccs.include.redist.c%
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+static char sccsid[] = "@(#)rec_utils.c        5.1 (Berkeley) %G%";
+#endif /* LIBC_SCCS and not lint */
+
+#include <sys/param.h>
+#include <db.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../btree/btree.h"
+
+/*
+ * __REC_RET -- Build return data as a result of search or scan.
+ *
+ * Parameters:
+ *     t:      tree
+ *     e:      page/index pair locating the RLEAF to be returned to the user
+ *     data:   user's data structure
+ *
+ * Returns:
+ *     RET_SUCCESS, RET_ERROR.
+ *
+ * Warnings:
+ *     data->data is set to the tree's bt_dbuf buffer, which this routine
+ *     reuses on every call.
+ */
+int
+__rec_ret(t, e, data)
+       BTREE *t;
+       EPG *e;
+       DBT *data;
+{
+       register RLEAF *rl;
+       void *p;
+
+       rl = GETRLEAF(e->page, e->index);
+       if (rl->flags & P_BIGDATA) {
+               if (__ovfl_get(t, rl->bytes,
+                   &data->size, &t->bt_dbuf, &t->bt_dbufsz))
+                       return (RET_ERROR);
+       } else {
+               if (rl->dsize > t->bt_dbufsz) {
+                       /*
+                        * Go through a temporary so that a failed realloc
+                        * leaves bt_dbuf and bt_dbufsz consistent.
+                        */
+                       if ((p = realloc(t->bt_dbuf, rl->dsize)) == NULL)
+                               return (RET_ERROR);
+                       t->bt_dbuf = p;
+                       t->bt_dbufsz = rl->dsize;
+               }
+               /* Copy this record only; the buffer may be larger than it. */
+               bcopy(rl->bytes, t->bt_dbuf, rl->dsize);
+               data->size = rl->dsize;
+       }
+       data->data = t->bt_dbuf;
+
+       return (RET_SUCCESS);
+}
index 390bde3..f6dbabc 100644 (file)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)main.c     5.1 (Berkeley) %G%";
+static char sccsid[] = "@(#)main.c     5.2 (Berkeley) %G%";
 #endif /* LIBC_SCCS and not lint */
 
 #endif /* LIBC_SCCS and not lint */
 
-/*
- *  test1.c -- simple btree test program.
- */
-
-#include <stdio.h>
-#include <ctype.h>
 #include <sys/param.h>
 #include <sys/param.h>
-#include <sys/types.h>
-#include <sys/file.h>
+#include <fcntl.h>
 #include <db.h>
 #include <db.h>
-#include <btree.h>
-
-#define        DICTIONARY      "/usr/share/dict/words"
+#include <errno.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include "btree.h"
 
 typedef struct cmd_table {
        char *cmd;
        int nargs;
 
 typedef struct cmd_table {
        char *cmd;
        int nargs;
-       int (*func)();
-       char *descrip;
+       int rconv;
+       void (*func) __P((DB *, char **));
+       char *usage, *descrip;
 } cmd_table;
 
 } cmd_table;
 
-extern int cursor(), delcur(), delete(), first(), help(), insert();
-extern int last(), lookup(), next(), previous();
-
-cmd_table Commands[] = {
-       "cursor", 2, cursor,
-               "cursor <word>:  point the scan cursor at <word>",
-       "delcur", 1, delcur,
-               "delcur:  delete the word under the scan cursor",
-       "delete", 2, delete,
-               "delete <word>:  delete <word> from the dictionary",
-       "first", 1, first,
-               "first: point the scan cursor at the first dictionary entry",
-       "help", 1, help,
-               "help:  print this command summary",
-       "insert", 3, insert,
-               "insert <word> <def>:  insert <word> into the dictionary with definition <def>",
-       "last", 1, last,
-               "last:  point the scan cursor at the last dictionary entry",
-       "lookup", 2, lookup,
-               "lookup <word>:  look up <word> in the dictionary",
-       "next", 1, next,
-               "next:  move the scan cursor forward one word",
-       "previous", 1, previous,
-               "previous:  move the scan cursor back one word",
-       (char *) NULL, 0, NULL, (char *) NULL,
+int stopstop;
+DB *globaldb;
+
+void bstat     __P((DB *, char **));
+void cursor    __P((DB *, char **));
+void delcur    __P((DB *, char **));
+void delete    __P((DB *, char **));
+void dump      __P((DB *, char **));
+void first     __P((DB *, char **));
+void get       __P((DB *, char **));
+void help      __P((DB *, char **));
+void iafter    __P((DB *, char **));
+void ibefore   __P((DB *, char **));
+void insert    __P((DB *, char **));
+void keydata   __P((DBT *, DBT *));
+void last      __P((DB *, char **));
+void list      __P((DB *, char **));
+void load      __P((DB *, char **));
+void mstat     __P((DB *, char **));
+void next      __P((DB *, char **));
+int  parse     __P((char *, char **, int));
+void previous  __P((DB *, char **));
+void show      __P((DB *, char **));
+void usage     __P((void));
+void user      __P((DB *));
+
+cmd_table commands[] = {
+       "?",    0, 0, help, "help", NULL,
+       "b",    0, 0, bstat, "bstat", "stat btree",
+       "c",    1, 1, cursor,  "cursor word", "move cursor to word",
+       "delc", 0, 0, delcur, "delcur", "delete key the cursor references",
+       "dele", 1, 1, delete, "delete word", "delete word",
+       "d",    0, 0, dump, "dump", "dump database",
+       "f",    0, 0, first, "first", "move cursor to first record",
+       "g",    1, 1, get, "get word", "locate word",
+       "h",    0, 0, help, "help", "print command summary",
+       "ia",   2, 1, iafter, "iafter key data", "insert data after key",
+       "ib",   2, 1, ibefore, "ibefore key data", "insert data before key",
+       "in",   2, 1, insert, "insert word def", "insert key with data def",
+       "la",   0, 0, last, "last", "move cursor to last record",
+       "li",   1, 1, list, "list file", "list to a file",
+       "loa",  1, 1, load, "load file", NULL,
+       "loc",  1, 1, get, "get word", NULL,
+       "m",    0, 0, mstat, "mstat", "stat memory pool",
+       "n",    0, 0, next, "next", "move cursor forward one record",
+       "p",    0, 0, previous, "previous", "move cursor back one record",
+       "q",    0, 0, NULL, "quit", "quit",
+       "sh",   1, 0, show, "show page", "dump a page",
+       { NULL },
 };
 
 };
 
-char *Usage = "[-p pagesize] [-c cachesize] [-u] [-l|b|n] [dbname]";
+int recno;                                     /* use record numbers */
+char *dict = "words";                          /* default dictionary */
+char *progname;
 
 
+int
 main(argc, argv)
        int argc;
        char **argv;
 {
 main(argc, argv)
        int argc;
        char **argv;
 {
-       char *dbname;
        int c;
        int c;
-       char *progname;
-       extern int strcmp();
-       extern char *optarg;
-       extern int optind;
-       DB *t;
+       DB *db;
        BTREEINFO b;
 
        progname = *argv;
 
        BTREEINFO b;
 
        progname = *argv;
 
-       b.psize = 0;
+       b.flags = 0;
        b.cachesize = 0;
        b.cachesize = 0;
+       b.maxkeypage = 0;
+       b.minkeypage = 0;
+       b.psize = 0;
+       b.compare = NULL;
+       b.prefix = NULL;
        b.lorder = 0;
        b.lorder = 0;
-       b.flags = R_DUP;
-       b.compare = strcmp;
 
 
-       while ((c = getopt(argc, argv, "p:c:ulb")) != EOF) {
+       while ((c = getopt(argc, argv, "bc:di:lp:ru")) != EOF) {
                switch (c) {
                switch (c) {
-                 case 'p':
-                       b.psize = atoi(optarg);
+               case 'b':
+                       b.lorder = BIG_ENDIAN;
                        break;
                        break;
-
-                 case 'c':
+               case 'c':
                        b.cachesize = atoi(optarg);
                        break;
                        b.cachesize = atoi(optarg);
                        break;
-
-                 case 'u':
-                       b.flags = 0;
+               case 'd':
+                       b.flags |= R_DUP;
                        break;
                        break;
-
-                 case 'l':
+               case 'i':
+                       dict = optarg;
+                       break;
+               case 'l':
                        b.lorder = LITTLE_ENDIAN;
                        break;
                        b.lorder = LITTLE_ENDIAN;
                        break;
-
-                 case 'b':
-                       b.lorder = BIG_ENDIAN;
+               case 'p':
+                       b.psize = atoi(optarg);
                        break;
                        break;
-
-                 default:
-                       fprintf(stderr, "%s: usage: %s\n", progname, Usage);
-                       exit (1);
+               case 'r':
+                       recno = 1;
+                       break;
+               case 'u':
+                       b.flags = 0;
+                       break;
+               default:
+                       usage();
                }
        }
                }
        }
+       argc -= optind;
+       argv += optind;
 
 
-       if (argv[optind] != (char *) NULL)
-               dbname = argv[optind];
+       if (recno)
+               db = dbopen(*argv == NULL ? NULL : *argv, O_RDWR,
+                   0, DB_RECNO, NULL);
+       else
+               db = dbopen(*argv == NULL ? NULL : *argv, O_CREAT|O_RDWR,
+                   0600, DB_BTREE, &b);
 
 
-       if ((t = btree_open(dbname, O_CREAT|O_RDWR, 0600, &b)) == (DB *) NULL) {
-               perror(progname);
-               exit (1);
+       if (db == NULL) {
+               (void)fprintf(stderr, "dbopen: %s\n", strerror(errno));
+               exit(1);
        }
        }
-
-       load(t);
-
-       user(t);
+       globaldb = db;
+       user(db);
+       exit(0);
+       /* NOTREACHED */
 }
 
 }
 
-load(t)
-       DB *t;
+void
+user(db)
+       DB *db;
 {
 {
-       char *lbuf;
-       int i, l;
-       int status;
-       FILE *fp;
-       DBT key;
-       DBT data;
-       char word[64];
-       char drow[64];
-
-       printf("loading %s...\n", DICTIONARY);
-       fflush(stdout);
-       if ((fp = fopen(DICTIONARY, "r")) == (FILE *) NULL) {
-               perror("/usr/dict/words");
-               (void) (*(t->close))(t->internal);
-               exit (1);
+       FILE *ifp;
+       int argc, i, last;
+       char *lbuf, *argv[4], buf[512];
+
+       if ((ifp = fopen("/dev/tty", "r")) == NULL) {
+               (void)fprintf(stderr,
+                   "/dev/tty: %s\n", strerror(errno));
+               exit(1);
        }
        }
-
-       key.data = &word[0];
-       data.data = &drow[0];
-       while ((lbuf = fgets(word, 64, fp)) != (char *) NULL) {
-               l = strlen(lbuf) - 1;
-               lbuf[l] = '\0';
-               for (i = 0; i < l; i++)
-                       drow[l - (i + 1)] = word[i];
-               drow[l] = '\0';
-
-               key.size = data.size = l + 1;
-
-               status = (*(t->put))(t->internal, &key, &data, R_NOOVERWRITE);
-
-               switch (status) {
-                 case RET_SUCCESS:
-                       break;
-
-                 case RET_ERROR:
-                       perror("put");
-                       break;
-
-                 case RET_SPECIAL:
-                       fprintf(stderr, "%s is a duplicate key!\n", lbuf);
-                       fflush(stderr);
+       for (last = 0;;) {
+               (void)printf("> ");
+               (void)fflush(stdout);
+               if ((lbuf = fgets(&buf[0], 512, ifp)) == NULL)
                        break;
                        break;
+               if (lbuf[0] == '\n') {
+                       i = last;
+                       goto uselast;
                }
                }
-       }
-
-       (void) fclose(fp);
-       printf("done\n");
-       fflush(stdout);
-}
-
-user(t)
-       DB *t;
-{
-       char *lbuf;
-       int argc;
-       int i;
-       char *argv[4];
-       char buf[512];
-
-       for (;;) {
-               printf("> ");
-               fflush(stdout);
-               if ((lbuf = fgets(&buf[0], 512, stdin)) == (char *) NULL)
-                       break;
                lbuf[strlen(lbuf) - 1] = '\0';
 
                lbuf[strlen(lbuf) - 1] = '\0';
 
-               if (strcmp(lbuf, "quit") == 0)
+               if (lbuf[0] == 'q')
                        break;
 
                argc = parse(lbuf, &argv[0], 3);
                if (argc == 0)
                        continue;
 
                        break;
 
                argc = parse(lbuf, &argv[0], 3);
                if (argc == 0)
                        continue;
 
-               for (i = 0; Commands[i].cmd != (char *) NULL; i++) {
-                       if (strcmp(Commands[i].cmd, argv[0]) == 0)
+               for (i = 0; commands[i].cmd != NULL; i++)
+                       if (strncmp(commands[i].cmd, argv[0],
+                           strlen(commands[i].cmd)) == 0)
                                break;
                                break;
-               }
 
 
-               if (Commands[i].cmd == (char *) NULL) {
-                       fprintf(stderr,
-                               "%s: command unknown ('help' for help)\n",
-                               lbuf);
-                       fflush(stderr);
+               if (commands[i].cmd == NULL) {
+                       (void)fprintf(stderr,
+                           "%s: command unknown ('help' for help)\n", lbuf);
                        continue;
                }
 
                        continue;
                }
 
-               if (Commands[i].nargs != argc) {
-                       fprintf(stderr, "arg count\n");
-                       fflush(stderr);
+               if (commands[i].nargs != argc - 1) {
+                       (void)fprintf(stderr, "usage: %s\n", commands[i].usage);
                        continue;
                }
 
                        continue;
                }
 
-               switch (argc) {
-                 case 1:
-                       (*(Commands[i].func))(t);
-                       break;
-                 case 2:
-                       (*(Commands[i].func))(t, argv[1]);
-                       break;
-                 case 3:
-                       (*(Commands[i].func))(t, argv[1], argv[2]);
-                       break;
-                 case 4:
-                       (*(Commands[i].func))(t, argv[1], argv[2], argv[3]);
-                       break;
+               if (recno && commands[i].rconv) {
+                       static recno_t nlong;
+                       nlong = atoi(argv[1]);
+                       argv[1] = (char *)&nlong;
                }
                }
+uselast:       last = i;
+               (*commands[i].func)(db, argv);
        }
        }
-       (void) (*(t->close))(t->internal);
-       exit (0);
+       if ((db->sync)(db) == RET_ERROR)
+               perror("dbsync");
+       else if ((db->close)(db) == RET_ERROR)
+               perror("dbclose");
 }
 
 int
 parse(lbuf, argv, maxargc)
 }
 
 int
 parse(lbuf, argv, maxargc)
-       char *lbuf;
-       char **argv;
+       char *lbuf, **argv;
        int maxargc;
 {
        int argc = 0;
        int maxargc;
 {
        int argc = 0;
@@ -262,241 +239,425 @@ parse(lbuf, argv, maxargc)
        return (argc);
 }
 
        return (argc);
 }
 
-int
-cursor(t, arg)
-       DB *t;
-       char *arg;
+void
+cursor(db, argv)
+       DB *db;
+       char **argv;
 {
 {
+       DBT data, key;
        int status;
        int status;
-       DBT key;
-       DBT data;
 
 
-       key.data = arg;
-       key.size = strlen(arg + 1);
-       status = (*(t->seq))(t->internal, &key, &data, R_CURSOR);
-       if (status == RET_SUCCESS)
-               show(&key, &data);
+       key.data = argv[1];
+       if (recno)
+               key.size = sizeof(recno_t);
        else
        else
+               key.size = strlen(argv[1]) + 1;
+       status = (*db->seq)(db, &key, &data, R_CURSOR);
+       switch (status) {
+       case RET_ERROR:
                perror("cursor");
                perror("cursor");
+               break;
+       case RET_SPECIAL:
+               (void)printf("key not found\n");
+               break;
+       case RET_SUCCESS:
+               keydata(&key, &data);
+               break;
+       }
 }
 
 }
 
-int
-delcur(t)
-       DB *t;
+void
+delcur(db, argv)
+       DB *db;
+       char **argv;
 {
        int status;
 
 {
        int status;
 
-       status = (*(t->delete))(t->internal, (DBT *) NULL, R_CURSOR);
+       status = (*db->del)(db, NULL, R_CURSOR);
 
        if (status == RET_ERROR)
                perror("delcur");
 }
 
 
        if (status == RET_ERROR)
                perror("delcur");
 }
 
-int
-delete(t, arg)
-       DB *t;
-       char *arg;
+void
+delete(db, argv)
+       DB *db;
+       char **argv;
 {
 {
-       int status;
        DBT key;
        DBT key;
+       int status;
 
 
-       key.data = arg;
-       key.size = strlen(arg) + 1;
+       key.data = argv[1];
+       if (recno)
+               key.size = sizeof(recno_t);
+       else
+               key.size = strlen(argv[1]) + 1;
 
 
-       status = (*(t->delete))(t->internal, &key, 0);
+       status = (*db->del)(db, &key, 0);
        switch (status) {
        switch (status) {
-         case RET_SUCCESS:
-               break;
-
-         case RET_ERROR:
+       case RET_ERROR:
                perror("delete");
                break;
                perror("delete");
                break;
-
-         case RET_SPECIAL:
-               fprintf(stderr, "%s not found\n", arg);
-               fflush(stderr);
+       case RET_SPECIAL:
+               (void)printf("key not found\n");
+               break;
+       case RET_SUCCESS:
                break;
        }
 }
 
                break;
        }
 }
 
-int
-first(t)
-       DB *t;
+void
+dump(db, argv)
+       DB *db;
+       char **argv;
+{
+       __bt_dump(db);
+}
+
+void
+first(db, argv)
+       DB *db;
+       char **argv;
 {
 {
+       DBT data, key;
        int status;
        int status;
-       DBT key;
-       DBT data;
 
 
-       status = (*(t->seq))(t->internal, &key, &data, R_FIRST);
+       status = (*db->seq)(db, &key, &data, R_FIRST);
 
        switch (status) {
 
        switch (status) {
-         case RET_ERROR:
+       case RET_ERROR:
                perror("first");
                break;
                perror("first");
                break;
-
-         case RET_SPECIAL:
-               printf("no more keys");
+       case RET_SPECIAL:
+               (void)printf("no more keys\n");
+               break;
+       case RET_SUCCESS:
+               keydata(&key, &data);
                break;
                break;
+       }
+}
+
+void
+get(db, argv)
+       DB *db;
+       char **argv;
+{
+       DBT data, key;
+       int status;
+
+       key.data = argv[1];
+       if (recno)
+               key.size = sizeof(recno_t);
+       else
+               key.size = strlen(argv[1]) + 1;
 
 
-         case RET_SUCCESS:
-               show(&key, &data);
+       status = (*db->get)(db, &key, &data, 0);
+
+       switch (status) {
+       case RET_ERROR:
+               perror("get");
+               break;
+       case RET_SPECIAL:
+               (void)printf("key not found\n");
+               break;
+       case RET_SUCCESS:
+               keydata(&key, &data);
                break;
        }
 }
                break;
        }
 }
-int
-help(t)
-       DB *t;
+
+void
+help(db, argv)
+       DB *db;
+       char **argv;
 {
        int i;
 
 {
        int i;
 
-#ifdef lint
-       t = t;
-#endif /* lint */
-       for (i = 0; Commands[i].cmd != (char *) NULL; i++)
-               printf("%s\n", Commands[i].descrip);
-       printf("type 'quit' to quit\n");
+       for (i = 0; commands[i].cmd; i++)
+               if (commands[i].descrip)
+                       (void)printf("%s: %s\n",
+                           commands[i].usage, commands[i].descrip);
 }
 
 }
 
-int
-insert(t, arg, def)
-       DB *t;
-       char *arg;
-       char *def;
+void
+iafter(db, argv)
+       DB *db;
+       char **argv;
 {
 {
+       DBT key, data;
        int status;
        int status;
-       DBT key;
-       DBT data;
-
-       key.data = arg;
-       key.size = strlen(arg) + 1;
-       data.data = def;
-       data.size = strlen(def) + 1;
 
 
-       status = (*(t->put))(t->internal, &key, &data, R_NOOVERWRITE);
+       if (!recno) {
+               (void)fprintf(stderr,
+                   "iafter only available for recno db's.\n");
+               return;
+       }
+       key.data = argv[1];
+       key.size = sizeof(recno_t);
+       data.data = argv[2];
+       data.size = strlen(data.data);
+       status = (db->put)(db, &key, &data, R_IAFTER);
        switch (status) {
        switch (status) {
-         case RET_SUCCESS:
+       case RET_ERROR:
+               perror("iafter");
                break;
                break;
-
-         case RET_ERROR:
-               perror("put");
+       case RET_SPECIAL:
+               (void)printf("%s (duplicate key)\n", argv[1]);
                break;
                break;
+       case RET_SUCCESS:
+               break;
+       }
+}
 
 
-         case RET_SPECIAL:
-               fprintf(stderr, "%s is a duplicate key!\n", arg);
-               fflush(stderr);
+void
+ibefore(db, argv)
+       DB *db;
+       char **argv;
+{
+       DBT key, data;
+       int status;
+
+       if (!recno) {
+               (void)fprintf(stderr,
+                   "ibefore only available for recno db's.\n");
+               return;
+       }
+       key.data = argv[1];
+       key.size = sizeof(recno_t);
+       data.data = argv[2];
+       data.size = strlen(data.data);
+       status = (db->put)(db, &key, &data, R_IBEFORE);
+       switch (status) {
+       case RET_ERROR:
+               perror("ibefore");
+               break;
+       case RET_SPECIAL:
+               (void)printf("%s (duplicate key)\n", argv[1]);
+               break;
+       case RET_SUCCESS:
                break;
        }
 }
 
                break;
        }
 }
 
-int
-last(t)
-       DB *t;
+void
+insert(db, argv)
+       DB *db;
+       char **argv;
 {
        int status;
 {
        int status;
-       DBT key;
-       DBT data;
+       DBT data, key;
 
 
-       status = (*(t->seq))(t->internal, &key, &data, R_LAST);
+       key.data = argv[1];
+       if (recno)
+               key.size = sizeof(recno_t);
+       else
+               key.size = strlen(argv[1]) + 1;
+       data.data = argv[2];
+       data.size = strlen(argv[2]) + 1;
 
 
+       status = (*db->put)(db, &key, &data, R_NOOVERWRITE);
        switch (status) {
        switch (status) {
-         case RET_ERROR:
-               perror("last");
+       case RET_ERROR:
+               perror("put");
                break;
                break;
-
-         case RET_SPECIAL:
-               printf("no more keys");
+       case RET_SPECIAL:
+               (void)printf("%s (duplicate key)\n", argv[1]);
                break;
                break;
-
-         case RET_SUCCESS:
-               show(&key, &data);
+       case RET_SUCCESS:
                break;
        }
 }
 
                break;
        }
 }
 
-int
-lookup(t, arg)
-       DB *t;
-       char *arg;
+void
+last(db, argv)
+       DB *db;
+       char **argv;
 {
 {
+       DBT data, key;
        int status;
        int status;
-       DBT key;
-       DBT data;
 
 
-       key.data = arg;
-       key.size = strlen(arg) + 1;
-
-       status = (*(t->get))(t->internal, &key, &data, 0);
+       status = (*db->seq)(db, &key, &data, R_LAST);
 
        switch (status) {
 
        switch (status) {
-         case RET_SPECIAL:
-               printf("not found\n");
+       case RET_ERROR:
+               perror("last");
                break;
                break;
-         case RET_SUCCESS:
-               show(&key, &data);
+       case RET_SPECIAL:
+               (void)printf("no more keys\n");
                break;
                break;
-         case RET_ERROR:
-               perror("get");
+       case RET_SUCCESS:
+               keydata(&key, &data);
                break;
        }
 }
 
                break;
        }
 }
 
-int
-next(t)
-       DB *t;
+void
+list(db, argv)
+       DB *db;
+       char **argv;
 {
 {
+       DBT data, key;
+       FILE *fp;
        int status;
        int status;
-       DBT key;
-       DBT data;
 
 
-       status = (*(t->seq))(t->internal, &key, &data, R_NEXT);
+       if ((fp = fopen(argv[1], "w")) == NULL) {
+               (void)fprintf(stderr, "%s: %s\n", argv[1], strerror(errno));
+               return;
+       }
+       status = (*db->seq)(db, &key, &data, R_FIRST);
+       while (status == RET_SUCCESS) {
+               (void)fprintf(fp, "%s\n", key.data);
+               status = (*db->seq)(db, &key, &data, R_NEXT);
+       }
+       if (status == RET_ERROR)
+               perror("list");
+}
+
+/*
+ * load -- Store every line of the file named by argv[1] as a key, with the
+ * same line spelled backwards as its data.  Exits on the first put error or
+ * duplicate key.
+ */
+void
+load(db, argv)
+       DB *db;
+       char **argv;
+{
+       register size_t i, len;
+       FILE *fp;
+       DBT data, key;
+       int status;
+       char b1[256], b2[256];
+
+       if ((fp = fopen(argv[1], "r")) == NULL) {
+               perror(argv[1]);
+               return;
+       }
+       /* Report the file actually being read, not the global default. */
+       (void)printf("loading %s...\n", argv[1]);
+
+       key.data = b1;
+       data.data = b2;
+       while (fgets(b1, sizeof(b1), fp) != NULL) {
+               /*
+                * Strip the newline only if fgets stored one; an over-long
+                * line or an unterminated last line keeps all its characters.
+                */
+               len = strlen(b1);
+               if (len > 0 && b1[len - 1] == '\n')
+                       b1[--len] = '\0';
+
+               /* The data is the key reversed. */
+               for (i = 0; i < len; i++)
+                       b2[i] = b1[len - 1 - i];
+               b2[len] = '\0';
+
+               /* Both sizes count the terminating NUL. */
+               key.size = data.size = len + 1;
+
+               status = (*db->put)(db, &key, &data, R_NOOVERWRITE);
+               switch (status) {
+               case RET_ERROR:
+                       perror("load/put");
+                       exit(1);
+               case RET_SPECIAL:
+                       (void)fprintf(stderr, "duplicate: %s\n", key.data);
+                       exit(1);
+               case RET_SUCCESS:
+                       break;
+               }
+       }
+       (void)fclose(fp);
+}
+
+void
+next(db, argv)
+       DB *db;
+       char **argv;
+{
+       DBT data, key;
+       int status;
+
+       status = (*db->seq)(db, &key, &data, R_NEXT);
 
        switch (status) {
 
        switch (status) {
-         case RET_ERROR:
+       case RET_ERROR:
                perror("next");
                break;
                perror("next");
                break;
-
-         case RET_SPECIAL:
-               printf("no more keys");
+       case RET_SPECIAL:
+               (void)printf("no more keys\n");
                break;
                break;
-
-         case RET_SUCCESS:
-               show(&key, &data);
+       case RET_SUCCESS:
+               keydata(&key, &data);
                break;
        }
 }
 
                break;
        }
 }
 
-int
-previous(t)
-       DB *t;
+void
+previous(db, argv)
+       DB *db;
+       char **argv;
 {
 {
+       DBT data, key;
        int status;
        int status;
-       DBT key;
-       DBT data;
 
 
-       status = (*(t->seq))(t->internal, &key, &data, R_PREV);
+       status = (*db->seq)(db, &key, &data, R_PREV);
 
        switch (status) {
 
        switch (status) {
-         case RET_ERROR:
+       case RET_ERROR:
                perror("previous");
                break;
                perror("previous");
                break;
-
-         case RET_SPECIAL:
-               printf("no more keys");
+       case RET_SPECIAL:
+               (void)printf("no more keys\n");
                break;
                break;
-
-         case RET_SUCCESS:
-               show(&key, &data);
+       case RET_SUCCESS:
+               keydata(&key, &data);
                break;
        }
 }
 
                break;
        }
 }
 
-show(key, data)
-       DBT *key;
-       DBT *data;
+/*
+ * show -- Dump the page whose number is given by argv[1]; page 0 is
+ * reported as the meta-data page rather than dumped.
+ */
+void
+show(db, argv)
+       DB *db;
+       char **argv;
+{
+       BTREE *tree;
+       PAGE *page;
+       pgno_t pgno;
+
+       if ((pgno = atoi(argv[1])) == 0) {
+               (void)printf("page 0 is meta-data page.\n");
+               return;
+       }
+
+       tree = db->internal;
+       page = mpool_get(tree->bt_mp, pgno, 0);
+       if (page == NULL) {
+               (void)printf("getpage of %ld failed\n", pgno);
+               return;
+       }
+
+       /* Print the page, then hand it back to the pool. */
+       __bt_dpage(page);
+       mpool_put(tree->bt_mp, page, 0);
+}
+
+/*
+ * bstat -- Print a "BTREE" heading, then call __bt_stat() on the database.
+ * argv is unused.
+ */
+void
+bstat(db, argv)
+       DB *db;
+       char **argv;
+{
+       (void)fputs("BTREE\n", stdout);
+       __bt_stat(db);
+}
+
+void
+mstat(db, argv)
+       DB *db;
+       char **argv;
 {
 {
-       if (key->size > 0)
-               printf("%s", key->data);
+       (void)printf("MPOOL\n");
+       mpool_stat(((BTREE *)db->internal)->bt_mp);
+}
+
+void
+keydata(key, data)
+       DBT *key, *data;
+{
+       if (!recno && key->size > 0)
+               (void)printf("%s/", key->data);
        if (data->size > 0)
        if (data->size > 0)
-               printf("/%s", data->data);
-       printf("\n");
+               (void)printf("%s", data->data);
+       (void)printf("\n");
+}
+
+void
+usage()
+{
+       (void)fprintf(stderr,
+           "usage: %s [-bdlu] [-c cache] [-i file] [-p page] [file]\n",
+           progname);
+       exit (1);
 }
 }