diff options
Diffstat (limited to 'db2')
140 files changed, 12435 insertions, 9935 deletions
diff --git a/db2/Makefile b/db2/Makefile index da1c622642..9020ce5f6a 100644 --- a/db2/Makefile +++ b/db2/Makefile @@ -45,9 +45,8 @@ distribute = db_int.h config.h compat.h clib/getlong.c btree/btree.src \ mp.h mp_ext.h mutex_ext.h os_ext.h queue.h \ shqueue.h txn.h txn_auto.h txn_ext.h \ os.h os_jump.h xa.h xa_ext.h) \ - $(addprefix mutex/,x86.gcc uts4_cc.s sparc.gcc parisc.hp \ - parisc.gcc alpha.gcc alpha.dec README \ - 68020.gcc tsl_parisc.s sco.cc) + $(addprefix mutex/,x86.gcc uts4_cc.s sparc.gcc parisc.gcc \ + README 68020.gcc tsl_parisc.s sco.cc) vpath %.c $(subdir-dirs) diff --git a/db2/btree/bt_close.c b/db2/btree/bt_close.c deleted file mode 100644 index 9df5c717e6..0000000000 --- a/db2/btree/bt_close.c +++ /dev/null @@ -1,177 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998 - * Sleepycat Software. All rights reserved. - */ -/* - * Copyright (c) 1990, 1993, 1994, 1995, 1996 - * Keith Bostic. All rights reserved. - */ -/* - * Copyright (c) 1990, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "config.h" - -#ifndef lint -static const char sccsid[] = "@(#)bt_close.c 10.32 (Sleepycat) 5/6/98"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "db_page.h" -#include "btree.h" - -static void __bam_upstat __P((DB *dbp)); - -/* - * __bam_close -- - * Close a btree. - * - * PUBLIC: int __bam_close __P((DB *)); - */ -int -__bam_close(dbp) - DB *dbp; -{ - BTREE *t; - - DEBUG_LWRITE(dbp, NULL, "bam_close", NULL, NULL, 0); - - t = dbp->internal; - - /* Update tree statistics. */ - __bam_upstat(dbp); - - /* Free any allocated memory. */ - if (t->bt_rkey.data) - FREE(t->bt_rkey.data, t->bt_rkey.size); - if (t->bt_rdata.data) - FREE(t->bt_rdata.data, t->bt_rdata.ulen); - if (t->bt_sp != t->bt_stack) - FREE(t->bt_sp, (t->bt_esp - t->bt_sp) * sizeof(EPG)); - - FREE(t, sizeof(BTREE)); - dbp->internal = NULL; - - return (0); -} - -/* - * __bam_sync -- - * Sync the btree to disk. 
- * - * PUBLIC: int __bam_sync __P((DB *, u_int32_t)); - */ -int -__bam_sync(argdbp, flags) - DB *argdbp; - u_int32_t flags; -{ - DB *dbp; - int ret; - - DEBUG_LWRITE(argdbp, NULL, "bam_sync", NULL, NULL, flags); - - /* Check for invalid flags. */ - if ((ret = __db_syncchk(argdbp, flags)) != 0) - return (ret); - - /* If it wasn't possible to modify the file, we're done. */ - if (F_ISSET(argdbp, DB_AM_INMEM | DB_AM_RDONLY)) - return (0); - - GETHANDLE(argdbp, NULL, &dbp, ret); - - /* Flush any dirty pages from the cache to the backing file. */ - if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) - ret = 0; - - PUTHANDLE(dbp); - return (ret); -} - -/* - * __bam_upstat -- - * Update tree statistics. - */ -static void -__bam_upstat(dbp) - DB *dbp; -{ - BTREE *t; - BTMETA *meta; - DB_LOCK metalock; - db_pgno_t pgno; - u_int32_t flags; - - /* - * We use a no-op log call to log the update of the statistics onto the - * metadata page. The Db->close call isn't transaction protected to - * start with, and I'm not sure what undoing a statistics update means, - * anyway. - */ - if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY)) - return; - - flags = 0; - pgno = PGNO_METADATA; - - /* Lock and retrieve the page. */ - if (__bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock) != 0) - return; - if (__bam_pget(dbp, (PAGE **)&meta, &pgno, 0) == 0) { - /* Log the change. */ - if (DB_LOGGING(dbp) && - __db_noop_log(dbp->dbenv->lg_info, dbp->txn, &LSN(meta), 0, - dbp->log_fileid, PGNO_METADATA, &LSN(meta)) != 0) - goto err; - - /* Update the statistics. 
*/ - t = dbp->internal; - __bam_add_mstat(&t->lstat, &meta->stat); - - flags = DB_MPOOL_DIRTY; - } - -err: (void)memp_fput(dbp->mpf, (PAGE *)meta, flags); - (void)__BT_LPUT(dbp, metalock); -} diff --git a/db2/btree/bt_compare.c b/db2/btree/bt_compare.c index 5c6d1e38ca..c60f920612 100644 --- a/db2/btree/bt_compare.c +++ b/db2/btree/bt_compare.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_compare.c 10.9 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_compare.c 10.14 (Sleepycat) 10/9/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -64,93 +64,76 @@ static const char sccsid[] = "@(#)bt_compare.c 10.9 (Sleepycat) 5/6/98"; * __bam_cmp -- * Compare a key to a given record. * - * PUBLIC: int __bam_cmp __P((DB *, const DBT *, EPG *)); + * PUBLIC: int __bam_cmp __P((DB *, const DBT *, + * PUBLIC: PAGE *, u_int32_t, int (*)(const DBT *, const DBT *))); */ int -__bam_cmp(dbp, k1, e) +__bam_cmp(dbp, dbt, h, indx, func) DB *dbp; - const DBT *k1; - EPG *e; + const DBT *dbt; + PAGE *h; + u_int32_t indx; + int (*func)__P((const DBT *, const DBT *)); { BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; - BTREE *t; - DBT k2; - PAGE *h; - - t = dbp->internal; + DBT pg_dbt; + int ret; /* * Returns: - * < 0 if k1 is < record - * = 0 if k1 is = record - * > 0 if k1 is > record + * < 0 if dbt is < page record + * = 0 if dbt is = page record + * > 0 if dbt is > page record * - * The left-most key on internal pages, at any level of the tree, is - * guaranteed, by the following code, to be less than any user key. - * This saves us from having to update the leftmost key on an internal - * page when the user inserts a new key in the tree smaller than - * anything we've yet seen. + * !!! + * We do not clear the pg_dbt DBT even though it's likely to contain + * random bits. That should be okay, because the app's comparison + * routine had better not be looking at fields other than data/size. 
+ * We don't clear it because we go through this path a lot and it's + * expensive. */ - h = e->page; - if (e->indx == 0 && - h->prev_pgno == PGNO_INVALID && TYPE(h) != P_LBTREE) - return (1); - - bo = NULL; - if (TYPE(h) == P_LBTREE) { - bk = GET_BKEYDATA(h, e->indx); + if (TYPE(h) == P_LBTREE || TYPE(h) == P_DUPLICATE) { + bk = GET_BKEYDATA(h, indx); if (B_TYPE(bk->type) == B_OVERFLOW) bo = (BOVERFLOW *)bk; else { - k2.data = bk->data; - k2.size = bk->len; + pg_dbt.data = bk->data; + pg_dbt.size = bk->len; + return (func(dbt, &pg_dbt)); } } else { - bi = GET_BINTERNAL(h, e->indx); - if (B_TYPE(bi->type) == B_OVERFLOW) - bo = (BOVERFLOW *)(bi->data); - else { - k2.data = bi->data; - k2.size = bi->len; - } - } - - /* - * XXX - * We ignore system errors; the only recoverable one is ENOMEM, and we - * don't want to require that comparison routines handle random errors. - * We don't want to return a valid comparison, either, so we stop. - */ - if (bo != NULL) { /* - * If using the default comparison routine, use __db_moff(), - * which compares the overflow key a page at a time. + * The following code guarantees that the left-most key on an + * internal page at any level of the btree is less than any + * user specified key. This saves us from having to update the + * leftmost key on an internal page when the user inserts a new + * key in the tree smaller than anything we've seen before. */ - if (t->bt_compare == __bam_defcmp) - return (__db_moff(dbp, k1, bo->pgno)); + if (indx == 0 && h->prev_pgno == PGNO_INVALID) + return (1); - /* - * Otherwise, we need a contiguous record so we can hand it - * to the user's routine. 
- */ - memset(&k2, 0, sizeof(k2)); - if (__db_goff(dbp, &k2, bo->tlen, - bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen) != 0) { - (void)__db_panic(dbp); - return (0); + bi = GET_BINTERNAL(h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW) + bo = (BOVERFLOW *)(bi->data); + else { + pg_dbt.data = bi->data; + pg_dbt.size = bi->len; + return (func(dbt, &pg_dbt)); } } /* + * Overflow. + * * XXX - * Note, we have not cleared the k2 DBT in this path. This should - * be okay, because the user's comparison routine had better not be - * looking at any fields other than the data/size. We don't clear - * it because we go through this path a lot and it's expensive. + * We ignore __db_moff() errors, because we have no way of returning + * them. */ - return ((*t->bt_compare)(k1, &k2)); + (void) __db_moff(dbp, + dbt, bo->pgno, bo->tlen, func == __bam_defcmp ? NULL : func, &ret); + return (ret); } /* diff --git a/db2/btree/bt_conv.c b/db2/btree/bt_conv.c index 3da4507723..a3069082ae 100644 --- a/db2/btree/bt_conv.c +++ b/db2/btree/bt_conv.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_conv.c 10.6 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)bt_conv.c 10.7 (Sleepycat) 9/20/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -90,18 +90,5 @@ __bam_mswap(pg) SWAP32(p); /* free */ SWAP32(p); /* flags */ - /* Swap the statistics. */ - p = (u_int8_t *)&((BTMETA *)pg)->stat; - SWAP32(p); /* bt_freed */ - SWAP32(p); /* bt_pfxsaved */ - SWAP32(p); /* bt_split */ - SWAP32(p); /* bt_rootsplit */ - SWAP32(p); /* bt_fastsplit */ - SWAP32(p); /* bt_added */ - SWAP32(p); /* bt_deleted */ - SWAP32(p); /* bt_get */ - SWAP32(p); /* bt_cache_hit */ - SWAP32(p); /* bt_cache_miss */ - return (0); } diff --git a/db2/btree/bt_curadj.c b/db2/btree/bt_curadj.c new file mode 100644 index 0000000000..9b86fbb6d7 --- /dev/null +++ b/db2/btree/bt_curadj.c @@ -0,0 +1,272 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1996, 1997, 1998 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)bt_curadj.c 10.69 (Sleepycat) 12/2/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +#ifdef DEBUG +/* + * __bam_cprint -- + * Display the current cursor list. + * + * PUBLIC: int __bam_cprint __P((DB *)); + */ +int +__bam_cprint(dbp) + DB *dbp; +{ + CURSOR *cp; + DBC *dbc; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + fprintf(stderr, + "%#0x->%#0x: page: %lu index: %lu dpage %lu dindex: %lu recno: %lu", + (u_int)dbc, (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx, + (u_long)cp->dpgno, (u_long)cp->dindx, (u_long)cp->recno); + if (F_ISSET(cp, C_DELETED)) + fprintf(stderr, " (deleted)"); + fprintf(stderr, "\n"); + } + DB_THREAD_UNLOCK(dbp); + + return (0); +} +#endif /* DEBUG */ + +/* + * __bam_ca_delete -- + * Update the cursors when items are deleted and when already deleted + * items are overwritten. Return the number of relevant cursors found. + * + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int)); + */ +int +__bam_ca_delete(dbp, pgno, indx, delete) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int delete; +{ + DBC *dbc; + CURSOR *cp; + int count; /* !!!: Has to contain max number of cursors. */ + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return (0); + + /* + * Adjust the cursors. We don't have to review the cursors for any + * thread of control other than the current one, because we have the + * page write locked at this point, and any other thread of control + * had better be using a different locker ID, meaning only cursors in + * our thread of control can be on the page. 
+ * + * It's possible for multiple cursors within the thread to have write + * locks on the same page, but, cursors within a thread must be single + * threaded, so all we're locking here is the cursor linked list. + */ + DB_THREAD_LOCK(dbp); + for (count = 0, dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + + if ((cp->pgno == pgno && cp->indx == indx) || + (cp->dpgno == pgno && cp->dindx == indx)) { + if (delete) + F_SET(cp, C_DELETED); + else + F_CLR(cp, C_DELETED); + ++count; + } + } + DB_THREAD_UNLOCK(dbp); + + return (count); +} + +/* + * __bam_ca_di -- + * Adjust the cursors during a delete or insert. + * + * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_di(dbp, pgno, indx, adjust) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int adjust; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == pgno && cp->indx >= indx) + cp->indx += adjust; + if (cp->dpgno == pgno && cp->dindx >= indx) + cp->dindx += adjust; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_dup -- + * Adjust the cursors when moving items from a leaf page to a duplicates + * page. + * + * PUBLIC: void __bam_ca_dup __P((DB *, + * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +void +__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti) + DB *dbp; + db_pgno_t fpgno, tpgno; + u_int32_t first, fi, ti; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). 
+ */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + /* + * Ignore matching entries that have already been moved, + * we move from the same location on the leaf page more + * than once. + */ + if (cp->dpgno == PGNO_INVALID && + cp->pgno == fpgno && cp->indx == fi) { + cp->indx = first; + cp->dpgno = tpgno; + cp->dindx = ti; + } + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_rsplit -- + * Adjust the cursors when doing reverse splits. + * + * PUBLIC: void __bam_ca_rsplit __P((DB *, db_pgno_t, db_pgno_t)); + */ +void +__bam_ca_rsplit(dbp, fpgno, tpgno) + DB *dbp; + db_pgno_t fpgno, tpgno; +{ + CURSOR *cp; + DBC *dbc; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == fpgno) + cp->pgno = tpgno; + } + DB_THREAD_UNLOCK(dbp); +} + +/* + * __bam_ca_split -- + * Adjust the cursors when splitting a page. + * + * PUBLIC: void __bam_ca_split __P((DB *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); + */ +void +__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft) + DB *dbp; + db_pgno_t ppgno, lpgno, rpgno; + u_int32_t split_indx; + int cleft; +{ + DBC *dbc; + CURSOR *cp; + + /* Recno is responsible for its own adjustments. */ + if (dbp->type == DB_RECNO) + return; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If splitting the page that a cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. Most + * of the time we don't adjust pointers to the left page, because + * we're going to copy its contents back over the original page. 
If + * the cursor is on the right page, it is decremented by the number of + * records split to the left page. + */ + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (CURSOR *)dbc->internal; + if (cp->pgno == ppgno) { + if (cp->indx < split_indx) { + if (cleft) + cp->pgno = lpgno; + } else { + cp->pgno = rpgno; + cp->indx -= split_indx; + } + } + if (cp->dpgno == ppgno) { + if (cp->dindx < split_indx) { + if (cleft) + cp->dpgno = lpgno; + } else { + cp->dpgno = rpgno; + cp->dindx -= split_indx; + } + } + } + DB_THREAD_UNLOCK(dbp); +} diff --git a/db2/btree/bt_cursor.c b/db2/btree/bt_cursor.c index 5d3366a3a1..10bc095c9d 100644 --- a/db2/btree/bt_cursor.c +++ b/db2/btree/bt_cursor.c @@ -8,148 +8,219 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_cursor.c 10.53 (Sleepycat) 5/25/98"; +static const char sccsid[] = "@(#)bt_cursor.c 10.81 (Sleepycat) 12/16/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <stdlib.h> #include <string.h> #endif #include "db_int.h" #include "db_page.h" #include "btree.h" +#include "shqueue.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" static int __bam_c_close __P((DBC *)); static int __bam_c_del __P((DBC *, u_int32_t)); -static int __bam_c_first __P((DB *, CURSOR *)); +static int __bam_c_destroy __P((DBC *)); +static int __bam_c_first __P((DBC *, CURSOR *)); static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __bam_c_getstack __P((DB *, CURSOR *)); -static int __bam_c_last __P((DB *, CURSOR *)); -static int __bam_c_next __P((DB *, CURSOR *, int)); -static int __bam_c_physdel __P((DB *, CURSOR *, PAGE *)); -static int __bam_c_prev __P((DB *, CURSOR *)); +static int __bam_c_getstack __P((DBC *, CURSOR *)); +static int __bam_c_last __P((DBC *, CURSOR *)); +static int __bam_c_next __P((DBC *, CURSOR *, int)); +static int __bam_c_physdel __P((DBC *, 
CURSOR *, PAGE *)); +static int __bam_c_prev __P((DBC *, CURSOR *)); static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __bam_c_rget __P((DB *, CURSOR *, DBT *, u_int32_t)); -static int __bam_c_search - __P((DB *, CURSOR *, const DBT *, u_int32_t, int, int *)); +static void __bam_c_reset __P((CURSOR *)); +static int __bam_c_rget __P((DBC *, DBT *, u_int32_t)); +static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *)); +static int __bam_dsearch __P((DBC *, CURSOR *, DBT *, u_int32_t *)); /* Discard the current page/lock held by a cursor. */ #undef DISCARD -#define DISCARD(dbp, cp) { \ +#define DISCARD(dbc, cp) { \ if ((cp)->page != NULL) { \ - (void)memp_fput(dbp->mpf, (cp)->page, 0); \ + (void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0); \ (cp)->page = NULL; \ } \ if ((cp)->lock != LOCK_INVALID) { \ - (void)__BT_TLPUT((dbp), (cp)->lock); \ + (void)__BT_TLPUT((dbc), (cp)->lock); \ (cp)->lock = LOCK_INVALID; \ } \ } +/* If the cursor references a deleted record. */ +#undef IS_CUR_DELETED +#define IS_CUR_DELETED(cp) \ + (((cp)->dpgno == PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, \ + (cp)->indx + O_INDX)->type)) || \ + ((cp)->dpgno != PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type))) + +/* If the cursor and index combination references a deleted record. */ +#undef IS_DELETED +#define IS_DELETED(cp, indx) \ + (((cp)->dpgno == PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) || \ + ((cp)->dpgno != PGNO_INVALID && \ + B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type))) + /* - * __bam_cursor -- - * Interface to the cursor functions. + * Test to see if two cursors could point to duplicates of the same key, + * whether on-page or off-page. The leaf page numbers must be the same + * in both cases. In the case of off-page duplicates, the key indices + * on the leaf page will be the same. 
In the case of on-page duplicates, + * the duplicate page number must not be set, and the key index offsets + * must be the same. For the last test, as the saved copy of the cursor + * will not have a valid page pointer, we use the cursor's. + */ +#undef POSSIBLE_DUPLICATE +#define POSSIBLE_DUPLICATE(cursor, saved_copy) \ + ((cursor)->pgno == (saved_copy).pgno && \ + ((cursor)->indx == (saved_copy).indx || \ + ((cursor)->dpgno == PGNO_INVALID && \ + (saved_copy).dpgno == PGNO_INVALID && \ + (cursor)->page->inp[(cursor)->indx] == \ + (cursor)->page->inp[(saved_copy).indx]))) + +/* + * __bam_c_reset -- + * Initialize internal cursor structure. + */ +static void +__bam_c_reset(cp) + CURSOR *cp; +{ + cp->sp = cp->csp = cp->stack; + cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); + cp->page = NULL; + cp->pgno = PGNO_INVALID; + cp->indx = 0; + cp->dpgno = PGNO_INVALID; + cp->dindx = 0; + cp->lock = LOCK_INVALID; + cp->mode = DB_LOCK_NG; + cp->recno = RECNO_OOB; + cp->flags = 0; +} + +/* + * __bam_c_init -- + * Initialize the access private portion of a cursor * - * PUBLIC: int __bam_cursor __P((DB *, DB_TXN *, DBC **)); + * PUBLIC: int __bam_c_init __P((DBC *)); */ int -__bam_cursor(dbp, txn, dbcp) - DB *dbp; - DB_TXN *txn; - DBC **dbcp; +__bam_c_init(dbc) + DBC *dbc; { + DB *dbp; CURSOR *cp; - DBC *dbc; - - DEBUG_LWRITE(dbp, txn, "bam_cursor", NULL, NULL, 0); + int ret; - if ((dbc = (DBC *)__db_calloc(1, sizeof(DBC))) == NULL) - return (ENOMEM); - if ((cp = (CURSOR *)__db_calloc(1, sizeof(CURSOR))) == NULL) { - __db_free(dbc); - return (ENOMEM); - } + if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0) + return (ret); + dbp = dbc->dbp; cp->dbc = dbc; - cp->pgno = cp->dpgno = PGNO_INVALID; - cp->lock = LOCK_INVALID; - - dbc->dbp = dbp; - dbc->txn = txn; - dbc->internal = cp; - dbc->c_close = __bam_c_close; - dbc->c_del = __bam_c_del; - dbc->c_get = __bam_c_get; - dbc->c_put = __bam_c_put; /* - * All cursors are queued from the master DB structure. 
Add the - * cursor to that queue. + * Logical record numbers are always the same size, and we don't want + * to have to check for space every time we return one. Allocate it + * in advance. */ - CURSOR_SETUP(dbp); - TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); + if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { + if ((ret = __os_malloc(sizeof(db_recno_t), + NULL, &dbc->rkey.data)) != 0) { + __os_free(cp, sizeof(CURSOR)); + return (ret); + } + dbc->rkey.ulen = sizeof(db_recno_t); + } + + /* Initialize methods. */ + dbc->internal = cp; + if (dbp->type == DB_BTREE) { + dbc->c_am_close = __bam_c_close; + dbc->c_am_destroy = __bam_c_destroy; + dbc->c_del = __bam_c_del; + dbc->c_get = __bam_c_get; + dbc->c_put = __bam_c_put; + } else { + dbc->c_am_close = __bam_c_close; + dbc->c_am_destroy = __bam_c_destroy; + dbc->c_del = __ram_c_del; + dbc->c_get = __ram_c_get; + dbc->c_put = __ram_c_put; + } + + /* Initialize dynamic information. */ + __bam_c_reset(cp); - *dbcp = dbc; return (0); } /* * __bam_c_close -- - * Close a single cursor. + * Close down the cursor from a single use. */ static int __bam_c_close(dbc) DBC *dbc; { + CURSOR *cp; DB *dbp; int ret; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_close", NULL, NULL, 0); + dbp = dbc->dbp; + cp = dbc->internal; + ret = 0; - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); + /* + * If a cursor deleted a btree key, perform the actual deletion. + * (Recno keys are either deleted immediately or never deleted.) + */ + if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED)) + ret = __bam_c_physdel(dbc, cp, NULL); - ret = __bam_c_iclose(dbp, dbc); + /* Discard any locks not acquired inside of a transaction. */ + if (cp->lock != LOCK_INVALID) { + (void)__BT_TLPUT(dbc, cp->lock); + cp->lock = LOCK_INVALID; + } + + /* Sanity checks. */ +#ifdef DIAGNOSTIC + if (cp->csp != cp->stack) + __db_err(dbp->dbenv, "btree cursor close: stack not empty"); +#endif + + /* Initialize dynamic information. 
*/ + __bam_c_reset(cp); - PUTHANDLE(dbp); return (ret); } /* - * __bam_c_iclose -- + * __bam_c_destroy -- * Close a single cursor -- internal version. - * - * PUBLIC: int __bam_c_iclose __P((DB *, DBC *)); */ -int -__bam_c_iclose(dbp, dbc) - DB *dbp; +static int +__bam_c_destroy(dbc) DBC *dbc; { - CURSOR *cp; - int ret; - - /* If a cursor key was deleted, perform the actual deletion. */ - cp = dbc->internal; - ret = F_ISSET(cp, C_DELETED) ? __bam_c_physdel(dbp, cp, NULL) : 0; - - /* Discard any lock if we're not inside a transaction. */ - if (cp->lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, cp->lock); - - /* Remove the cursor from the queue. */ - CURSOR_SETUP(dbp); - TAILQ_REMOVE(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); - /* Discard the structures. */ - FREE(dbc->internal, sizeof(CURSOR)); - FREE(dbc, sizeof(DBC)); + __os_free(dbc->internal, sizeof(CURSOR)); - return (ret); + return (0); } /* @@ -161,7 +232,6 @@ __bam_c_del(dbc, flags) DBC *dbc; u_int32_t flags; { - BTREE *t; CURSOR *cp; DB *dbp; DB_LOCK lock; @@ -170,23 +240,31 @@ __bam_c_del(dbc, flags) db_indx_t indx; int ret; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_del", NULL, NULL, flags); - + dbp = dbc->dbp; cp = dbc->internal; h = NULL; + DB_PANIC_CHECK(dbp); + /* Check for invalid flags. */ - if ((ret = __db_cdelchk(dbc->dbp, flags, - F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + if ((ret = __db_cdelchk(dbp, flags, + F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) return (ret); + /* + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. + */ + if (F_ISSET(dbp, DB_AM_CDB)) + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags); + /* If already deleted, return failure. 
*/ - if (F_ISSET(cp, C_DELETED | C_REPLACE)) + if (F_ISSET(cp, C_DELETED)) return (DB_KEYEMPTY); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; - /* * We don't physically delete the record until the cursor moves, * so we have to have a long-lived write lock on the page instead @@ -194,10 +272,10 @@ __bam_c_del(dbc, flags) * to even get here, so we simply discard it. */ if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) { - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) goto err; - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); cp->lock = lock; cp->mode = DB_LOCK_WRITE; } @@ -216,85 +294,50 @@ __bam_c_del(dbc, flags) indx = cp->dindx; } - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) goto err; /* Log the change. */ - if (DB_LOGGING(dbp) && - (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + if (DB_LOGGING(dbc) && + (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) { (void)memp_fput(dbp->mpf, h, 0); goto err; } - /* Set the intent-to-delete flag on the page and in all cursors. */ + /* + * Set the intent-to-delete flag on the page and update all cursors. */ if (cp->dpgno == PGNO_INVALID) B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); else B_DSET(GET_BKEYDATA(h, indx)->type); - (void)__bam_ca_delete(dbp, pgno, indx, NULL, 0); + (void)__bam_ca_delete(dbp, pgno, indx, 1); ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); h = NULL; /* - * If it's a btree with record numbers, we have to adjust the - * counts. + * If the tree has record numbers, we have to adjust the counts. + * + * !!! + * This test is right -- we don't yet support duplicates and record + * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM + * set. 
*/ - if (F_ISSET(dbp, DB_BT_RECNUM) && - (ret = __bam_c_getstack(dbp, cp)) == 0) { - ret = __bam_adjust(dbp, t, -1); - (void)__bam_stkrel(dbp); + if (F_ISSET(dbp, DB_BT_RECNUM)) { + if ((ret = __bam_c_getstack(dbc, cp)) != 0) + goto err; + if ((ret = __bam_adjust(dbc, -1)) != 0) + goto err; + (void)__bam_stkrel(dbc, 0); } err: if (h != NULL) (void)memp_fput(dbp->mpf, h, 0); - PUTHANDLE(dbp); return (ret); } /* - * __bam_get -- - * Retrieve a key/data pair from the tree. - * - * PUBLIC: int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); - */ -int -__bam_get(argdbp, txn, key, data, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key, *data; - u_int32_t flags; -{ - DBC dbc; - CURSOR cp; - int ret; - - DEBUG_LREAD(argdbp, txn, "bam_get", key, NULL, flags); - - /* Check for invalid flags. */ - if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) - return (ret); - - /* Build an internal cursor. */ - memset(&cp, 0, sizeof(cp)); - cp.dbc = &dbc; - cp.pgno = cp.dpgno = PGNO_INVALID; - cp.lock = LOCK_INVALID; - cp.flags = C_INTERNAL; - - /* Build an external cursor. */ - memset(&dbc, 0, sizeof(dbc)); - dbc.dbp = argdbp; - dbc.txn = txn; - dbc.internal = &cp; - - /* Get the key. */ - return(__bam_c_get(&dbc, - key, data, LF_ISSET(DB_SET_RECNO) ? DB_SET_RECNO : DB_SET)); -} - -/* * __bam_c_get -- * Get using a cursor (btree). */ @@ -304,91 +347,197 @@ __bam_c_get(dbc, key, data, flags) DBT *key, *data; u_int32_t flags; { - BTREE *t; - CURSOR *cp, copy; + CURSOR *cp, copy, start; DB *dbp; PAGE *h; - int exact, ret; - - DEBUG_LREAD(dbc->dbp, dbc->txn, "bam_c_get", - flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); + int exact, ret, tmp_rmw; + dbp = dbc->dbp; cp = dbc->internal; + DB_PANIC_CHECK(dbp); + /* Check for invalid flags. 
*/ - if ((ret = __db_cgetchk(dbc->dbp, + if ((ret = __db_cgetchk(dbp, key, data, flags, cp->pgno != PGNO_INVALID)) != 0) return (ret); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + /* Clear OR'd in additional bits so we can check for flag equality. */ + tmp_rmw = 0; + if (LF_ISSET(DB_RMW)) { + if (!F_ISSET(dbp, DB_AM_CDB)) { + tmp_rmw = 1; + F_SET(dbc, DBC_RMW); + } + LF_CLR(DB_RMW); + } + + DEBUG_LREAD(dbc, dbc->txn, "bam_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); /* - * Break out the code to return a cursor's record number. It - * has nothing to do with the cursor get code except that it's - * been rammed into the interface. + * Return a cursor's record number. It has nothing to do with the + * cursor get code except that it's been rammed into the interface. */ - if (LF_ISSET(DB_GET_RECNO)) { - ret = __bam_c_rget(dbp, cp, data, flags); - PUTHANDLE(dbp); + if (flags == DB_GET_RECNO) { + ret = __bam_c_rget(dbc, data, flags); + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); return (ret); } - /* Initialize the cursor for a new retrieval. */ - copy = *cp; + /* + * Initialize the cursor for a new retrieval. Clear the cursor's + * page pointer, it was set before this operation, and no longer + * has any meaning. + */ cp->page = NULL; + copy = *cp; cp->lock = LOCK_INVALID; switch (flags) { case DB_CURRENT: /* It's not possible to return a deleted record. */ - if (F_ISSET(cp, C_DELETED | C_REPLACE)) { - PUTHANDLE(dbp); - return (DB_KEYEMPTY); + if (F_ISSET(cp, C_DELETED)) { + ret = DB_KEYEMPTY; + goto err; } - /* Get the page with the current item on it. */ - if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + /* Acquire the current page. */ + if ((ret = __bam_lget(dbc, + 0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0) + ret = memp_fget(dbp->mpf, + cp->dpgno == PGNO_INVALID ? 
&cp->pgno : &cp->dpgno, + 0, &cp->page); + if (ret != 0) goto err; break; + case DB_NEXT_DUP: + if (cp->pgno == PGNO_INVALID) { + ret = EINVAL; + goto err; + } + if ((ret = __bam_c_next(dbc, cp, 1)) != 0) + goto err; + + /* Make sure we didn't go past the end of the duplicates. */ + if (!POSSIBLE_DUPLICATE(cp, copy)) { + ret = DB_NOTFOUND; + goto err; + } + break; case DB_NEXT: if (cp->pgno != PGNO_INVALID) { - if ((ret = __bam_c_next(dbp, cp, 1)) != 0) + if ((ret = __bam_c_next(dbc, cp, 1)) != 0) goto err; break; } /* FALLTHROUGH */ case DB_FIRST: - if ((ret = __bam_c_first(dbp, cp)) != 0) + if ((ret = __bam_c_first(dbc, cp)) != 0) goto err; break; case DB_PREV: if (cp->pgno != PGNO_INVALID) { - if ((ret = __bam_c_prev(dbp, cp)) != 0) + if ((ret = __bam_c_prev(dbc, cp)) != 0) goto err; break; } /* FALLTHROUGH */ case DB_LAST: - if ((ret = __bam_c_last(dbp, cp)) != 0) + if ((ret = __bam_c_last(dbc, cp)) != 0) goto err; break; + case DB_SET: + if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) + goto err; + + /* + * We cannot currently be referencing a deleted record, but we + * may be referencing off-page duplicates. + * + * If we're referencing off-page duplicates, move off-page. + * If we moved off-page, move to the next non-deleted record. + * If we moved to the next non-deleted record, check to make + * sure we didn't switch records because our current record + * had no non-deleted data items. 
+ */ + start = *cp; + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) + goto err; + if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) { + if ((ret = __bam_c_next(dbc, cp, 0)) != 0) + goto err; + if (!POSSIBLE_DUPLICATE(cp, start)) { + ret = DB_NOTFOUND; + goto err; + } + } + break; case DB_SET_RECNO: - exact = 1; - if ((ret = - __bam_c_search(dbp, cp, key, S_FIND, 1, &exact)) != 0) + if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) goto err; break; - case DB_SET: - exact = 1; - if ((ret = - __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + case DB_GET_BOTH: + if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) { + /* Acquire the current page. */ + if ((ret = memp_fget(dbp->mpf, + cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno, + 0, &cp->page)) != 0) + goto err; + + /* If DBC_CONTINUE, move to the next item. */ + if (F_ISSET(dbc, DBC_CONTINUE) && + (ret = __bam_c_next(dbc, cp, 1)) != 0) + goto err; + } else { + if ((ret = + __bam_c_search(dbc, cp, key, flags, &exact)) != 0) + goto err; + + /* + * We may be referencing a duplicates page. Move to + * the first duplicate. + */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) + goto err; + } + + /* Search for a matching entry. */ + if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0) goto err; + + /* Ignore deleted entries. */ + if (IS_CUR_DELETED(cp)) { + ret = DB_NOTFOUND; + goto err; + } break; case DB_SET_RANGE: - exact = 0; - if ((ret = - __bam_c_search(dbp, cp, key, S_FIND, 0, &exact)) != 0) + if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) + goto err; + + /* + * As we didn't require an exact match, the search function + * may have returned an entry past the end of the page. If + * so, move to the next entry. + */ + if (cp->indx == NUM_ENT(cp->page) && + (ret = __bam_c_next(dbc, cp, 0)) != 0) + goto err; + + /* + * We may be referencing off-page duplicates, if so, move + * off-page. 
+ */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) + goto err; + + /* + * We may be referencing a deleted record, if so, move to + * the next non-deleted record. + */ + if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0) goto err; break; } @@ -401,12 +550,12 @@ __bam_c_get(dbc, key, data, flags) */ if (flags != DB_SET) { if (cp->dpgno != PGNO_INVALID) { - if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) goto err; } else h = cp->page; ret = __db_ret(dbp, - h, cp->indx, key, &t->bt_rkey.data, &t->bt_rkey.ulen); + h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen); if (cp->dpgno != PGNO_INVALID) (void)memp_fput(dbp->mpf, h, 0); if (ret) @@ -416,62 +565,163 @@ __bam_c_get(dbc, key, data, flags) /* Return the data. */ if ((ret = __db_ret(dbp, cp->page, cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx, - data, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) goto err; /* - * If the previous cursor record has been deleted, delete it. The - * returned key isn't a deleted key, so clear the flag. + * If the previous cursor record has been deleted, physically delete + * the entry from the page. We clear the deleted flag before we call + * the underlying delete routine so that, if an error occurs, and we + * restore the cursor, the deleted flag is cleared. This is because, + * if we manage to physically modify the page, and then restore the + * cursor, we might try to repeat the page modification when closing + * the cursor. */ - if (F_ISSET(&copy, C_DELETED) && __bam_c_physdel(dbp, &copy, cp->page)) - goto err; - F_CLR(cp, C_DELETED | C_REPLACE); + if (F_ISSET(&copy, C_DELETED)) { + F_CLR(&copy, C_DELETED); + if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0) + goto err; + } + F_CLR(cp, C_DELETED); - /* Release the previous lock, if any. */ + /* Release the previous lock, if any; the current lock is retained.
*/ if (copy.lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, copy.lock); - - /* Release the pinned page. */ - ret = memp_fput(dbp->mpf, cp->page, 0); + (void)__BT_TLPUT(dbc, copy.lock); - /* Internal cursors don't hold locks. */ - if (F_ISSET(cp, C_INTERNAL) && cp->lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, cp->lock); - - ++t->lstat.bt_get; + /* Release the current page. */ + if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) + goto err; if (0) { err: if (cp->page != NULL) (void)memp_fput(dbp->mpf, cp->page, 0); if (cp->lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); *cp = copy; } - PUTHANDLE(dbp); + /* Release temporary lock upgrade. */ + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); + return (ret); } /* + * __bam_dsearch -- + * Search for a matching data item (or the first data item that's + * equal to or greater than the one we're searching for). + */ +static int +__bam_dsearch(dbc, cp, data, iflagp) + DBC *dbc; + CURSOR *cp; + DBT *data; + u_int32_t *iflagp; +{ + DB *dbp; + CURSOR copy, last; + int cmp, ret; + + dbp = dbc->dbp; + + /* + * If iflagp is non-NULL, we're doing an insert. + * + * If the duplicates are off-page, use the duplicate search routine. + */ + if (cp->dpgno != PGNO_INVALID) { + if ((ret = __db_dsearch(dbc, iflagp != NULL, + data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0) + return (ret); + cp->dpgno = cp->page->pgno; + + if (iflagp == NULL) { + if (cmp != 0) + return (DB_NOTFOUND); + return (0); + } + *iflagp = DB_BEFORE; + return (0); + } + + /* Otherwise, do the search ourselves. */ + copy = *cp; + for (;;) { + /* Save the last interesting cursor position. */ + last = *cp; + + /* See if the data item matches the one we're looking for. */ + if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX, + dbp->dup_compare == NULL ? 
+ __bam_defcmp : dbp->dup_compare)) == 0) { + if (iflagp != NULL) + *iflagp = DB_AFTER; + return (0); + } + + /* + * If duplicate entries are sorted, we're done if we find a + * page entry that sorts greater than the application item. + * If doing an insert, return success, otherwise DB_NOTFOUND. + */ + if (dbp->dup_compare != NULL && cmp < 0) { + if (iflagp == NULL) + return (DB_NOTFOUND); + *iflagp = DB_BEFORE; + return (0); + } + + /* + * Move to the next item. If we reach the end of the page and + * we're doing an insert, set the cursor to the last item and + * set the referenced memory location so callers know to insert + * after the item, instead of before it. If not inserting, we + * return DB_NOTFOUND. + */ + if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) { + if (iflagp == NULL) + return (DB_NOTFOUND); + goto use_last; + } + + /* + * Make sure we didn't go past the end of the duplicates. The + * error conditions are the same as above. + */ + if (!POSSIBLE_DUPLICATE(cp, copy)) { + if (iflagp == NULL) + return (DB_NOTFOUND); +use_last: *cp = last; + *iflagp = DB_AFTER; + return (0); + } + } + /* NOTREACHED */ +} + +/* * __bam_c_rget -- * Return the record number for a cursor. */ static int -__bam_c_rget(dbp, cp, data, flags) - DB *dbp; - CURSOR *cp; +__bam_c_rget(dbc, data, flags) + DBC *dbc; DBT *data; u_int32_t flags; { - BTREE *t; + CURSOR *cp; + DB *dbp; DBT dbt; db_recno_t recno; int exact, ret; COMPQUIET(flags, 0); + dbp = dbc->dbp; + cp = dbc->internal; /* Get the page with the current item on it. */ - if ((ret = __bam_pget(dbp, &cp->page, &cp->pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) return (ret); /* Get a copy of the key. */ @@ -481,18 +731,19 @@ __bam_c_rget(dbp, cp, data, flags) goto err; exact = 1; - if ((ret = __bam_search(dbp, &dbt, S_FIND, 1, &recno, &exact)) != 0) + if ((ret = __bam_search(dbc, &dbt, + F_ISSET(dbc, DBC_RMW) ? 
S_FIND_WR : S_FIND, + 1, &recno, &exact)) != 0) goto err; - t = dbp->internal; ret = __db_retcopy(data, &recno, sizeof(recno), - &t->bt_rdata.data, &t->bt_rdata.ulen, dbp->db_malloc); + &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc); /* Release the stack. */ - __bam_stkrel(dbp); + __bam_stkrel(dbc, 0); err: (void)memp_fput(dbp->mpf, cp->page, 0); - __db_free(dbt.data); + __os_free(dbt.data, dbt.size); return (ret); } @@ -506,62 +757,97 @@ __bam_c_put(dbc, key, data, flags) DBT *key, *data; u_int32_t flags; { - BTREE *t; CURSOR *cp, copy; DB *dbp; DBT dbt; db_indx_t indx; db_pgno_t pgno; - u_int32_t iiflags; + u_int32_t iiflags, iiop; int exact, needkey, ret, stack; void *arg; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "bam_c_put", - flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, - data, flags); - + dbp = dbc->dbp; cp = dbc->internal; - if ((ret = __db_cputchk(dbc->dbp, key, data, flags, - F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) - return (ret); + DB_PANIC_CHECK(dbp); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put", + flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, + data, flags); - /* Initialize the cursor for a new retrieval. */ - copy = *cp; - cp->page = NULL; - cp->lock = LOCK_INVALID; + if ((ret = __db_cputchk(dbp, key, data, flags, + F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) + return (ret); /* - * To split, we need a valid key for the page. Since it's a cursor, - * we have to build one. + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. If it's a regular writer, + * that means we have an IWRITE lock and we need to upgrade + * it to a write lock. 
*/ - stack = 0; + if (F_ISSET(dbp, DB_AM_CDB)) { + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } + if (0) { -split: /* Acquire a copy of a key from the page. */ +split: /* + * To split, we need a valid key for the page. Since it's a + * cursor, we have to build one. + * + * Acquire a copy of a key from the page. + */ if (needkey) { memset(&dbt, 0, sizeof(DBT)); if ((ret = __db_ret(dbp, cp->page, indx, - &dbt, &t->bt_rkey.data, &t->bt_rkey.ulen)) != 0) + &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) goto err; arg = &dbt; } else arg = key; - /* Discard any pinned pages. */ + /* + * Discard any locks and pinned pages (the locks are discarded + * even if we're running with transactions, as they lock pages + * that we're sorry we ever acquired). If stack is set and the + * cursor entries are valid, they point to the same entries as + * the stack, don't free them twice. + */ if (stack) { - (void)__bam_stkrel(dbp); + (void)__bam_stkrel(dbc, 1); stack = 0; } else - DISCARD(dbp, cp); + DISCARD(dbc, cp); - if ((ret = __bam_split(dbp, arg)) != 0) + /* + * Restore the cursor to its original value. This is necessary + * for two reasons. First, we are about to copy it in case of + * error, again. Second, we adjust cursors during the split, + * and we have to ensure this cursor is adjusted appropriately, + * along with all the other cursors. + */ + *cp = copy; + + if ((ret = __bam_split(dbc, arg)) != 0) goto err; } - ret = 0; + /* + * Initialize the cursor for a new retrieval. Clear the cursor's + * page pointer, it was set before this operation, and no longer + * has any meaning. 
+ */ + cp->page = NULL; + copy = *cp; + cp->lock = LOCK_INVALID; + + iiflags = needkey = ret = stack = 0; switch (flags) { case DB_AFTER: case DB_BEFORE: @@ -574,64 +860,148 @@ split: /* Acquire a copy of a key from the page. */ pgno = cp->dpgno; indx = cp->dindx; } + /* - * XXX - * This test is right -- we don't currently support duplicates - * in the presence of record numbers, so we don't worry about - * them if DB_BT_RECNUM is set. + * !!! + * This test is right -- we don't yet support duplicates and + * record numbers in the same tree, so ignore duplicates if + * DB_BT_RECNUM set. */ if (F_ISSET(dbp, DB_BT_RECNUM) && (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) { /* Acquire a complete stack. */ - if ((ret = __bam_c_getstack(dbp, cp)) != 0) + if ((ret = __bam_c_getstack(dbc, cp)) != 0) goto err; - cp->page = t->bt_csp->page; + cp->page = cp->csp->page; stack = 1; iiflags = BI_DOINCR; } else { /* Acquire the current page. */ - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0) - ret = __bam_pget(dbp, &cp->page, &pgno, 0); + ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page); if (ret != 0) goto err; iiflags = 0; } - if ((ret = __bam_iitem(dbp, &cp->page, - &indx, key, data, flags, iiflags)) == DB_NEEDSPLIT) - goto split; - break; - case DB_KEYFIRST: - exact = needkey = 0; - if ((ret = - __bam_c_search(dbp, cp, key, S_KEYFIRST, 0, &exact)) != 0) - goto err; - stack = 1; - indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; - if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, - data, DB_BEFORE, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) - goto split; + /* + * If the user has specified a duplicate comparison function, + * we return an error if DB_CURRENT was specified and the + * replacement data doesn't compare equal to the current data. + * This stops apps from screwing up the duplicate sort order. 
+ */ + if (flags == DB_CURRENT && dbp->dup_compare != NULL) + if (__bam_cmp(dbp, data, + cp->page, indx, dbp->dup_compare) != 0) { + ret = EINVAL; + goto err; + } + + iiop = flags; break; + case DB_KEYFIRST: case DB_KEYLAST: - exact = needkey = 0; - if ((ret = - __bam_c_search(dbp, cp, key, S_KEYLAST, 0, &exact)) != 0) + /* + * If we have a duplicate comparison function, we position to + * the first of any on-page duplicates, and use __bam_dsearch + * to search for the right slot. Otherwise, we position to + * the first/last of any on-page duplicates based on the flag + * value. + */ + if ((ret = __bam_c_search(dbc, cp, key, + flags == DB_KEYFIRST || dbp->dup_compare != NULL ? + DB_KEYFIRST : DB_KEYLAST, &exact)) != 0) goto err; stack = 1; - indx = cp->dpgno == PGNO_INVALID ? cp->indx : cp->dindx; - if ((ret = __bam_iitem(dbp, &cp->page, &indx, key, - data, DB_AFTER, exact ? 0 : BI_NEWKEY)) == DB_NEEDSPLIT) - goto split; + /* + * If an exact match: + * If duplicates aren't supported, replace the current + * item. (When implementing the DB->put function, our + * caller has already checked the DB_NOOVERWRITE flag.) + * + * If there's a duplicate comparison function, find the + * correct slot for this duplicate item. + * + * If there's no duplicate comparison function, set the + * insert flag based on the argument flags. + * + * If there's no match, the search function returned the + * smallest slot greater than the key, use it. + */ + if (exact) { + if (F_ISSET(dbp, DB_AM_DUP)) { + /* + * If at off-page duplicate page, move to the + * first or last entry -- if a comparison + * function was specified, start searching at + * the first entry. Otherwise, move based on + * the DB_KEYFIRST/DB_KEYLAST flags. + */ + if ((ret = __bam_dup(dbc, cp, cp->indx, + dbp->dup_compare == NULL && + flags != DB_KEYFIRST)) != 0) + goto err; + + /* + * If there's a comparison function, search for + * the correct slot. Otherwise, set the insert + * flag based on the argment flag. 
+ */ + if (dbp->dup_compare == NULL) + iiop = flags == DB_KEYFIRST ? + DB_BEFORE : DB_AFTER; + else + if ((ret = __bam_dsearch(dbc, + cp, data, &iiop)) != 0) + goto err; + } else + iiop = DB_CURRENT; + iiflags = 0; + } else { + iiop = DB_BEFORE; + iiflags = BI_NEWKEY; + } + + if (cp->dpgno == PGNO_INVALID) { + pgno = cp->pgno; + indx = cp->indx; + } else { + pgno = cp->dpgno; + indx = cp->dindx; + } break; } - if (ret) + + ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags); + + if (ret == DB_NEEDSPLIT) + goto split; + if (ret != 0) goto err; /* + * Reset any cursors referencing this item that might have the item + * marked for deletion. + */ + if (iiop == DB_CURRENT) { + (void)__bam_ca_delete(dbp, pgno, indx, 0); + + /* + * It's also possible that we are the cursor that had the + * item marked for deletion, in which case we want to make + * sure that we don't delete it because we had the delete + * flag set already. + */ + if (cp->pgno == copy.pgno && cp->indx == copy.indx && + cp->dpgno == copy.dpgno && cp->dindx == copy.dindx) + F_CLR(&copy, C_DELETED); + } + + /* * Update the cursor to point to the new entry. The new entry was * stored on the current page, because we split pages until it was * possible. @@ -642,17 +1012,24 @@ split: /* Acquire a copy of a key from the page. */ cp->dindx = indx; /* - * If the previous cursor record has been deleted, delete it. The - * returned key isn't a deleted key, so clear the flag. + * If the previous cursor record has been deleted, physically delete + * the entry from the page. We clear the deleted flag before we call + * the underlying delete routine so that, if an error occurs, and we + * restore the cursor, the deleted flag is cleared. This is because, + * if we manage to physically modify the page, and then restore the + * cursor, we might try to repeat the page modification when closing + * the cursor.
*/ - if (F_ISSET(&copy, C_DELETED) && - (ret = __bam_c_physdel(dbp, &copy, cp->page)) != 0) - goto err; - F_CLR(cp, C_DELETED | C_REPLACE); + if (F_ISSET(&copy, C_DELETED)) { + F_CLR(&copy, C_DELETED); + if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0) + goto err; + } + F_CLR(cp, C_DELETED); - /* Release the previous lock, if any. */ + /* Release the previous lock, if any; the current lock is retained. */ if (copy.lock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, copy.lock); + (void)__BT_TLPUT(dbc, copy.lock); /* * Discard any pages pinned in the tree and their locks, except for @@ -662,23 +1039,26 @@ split: /* Acquire a copy of a key from the page. */ * we have to adjust the stack as necessary. If there was only a * single page on the stack, we don't have to free further stack pages. */ + if (stack && BT_STK_POP(cp) != NULL) + (void)__bam_stkrel(dbc, 0); - if (stack && BT_STK_POP(t) != NULL) - (void)__bam_stkrel(dbp); - + /* Release the current page. */ if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) goto err; if (0) { err: /* Discard any pinned pages. */ if (stack) - (void)__bam_stkrel(dbp); + (void)__bam_stkrel(dbc, 0); else - DISCARD(dbp, cp); + DISCARD(dbc, cp); *cp = copy; } - PUTHANDLE(dbp); + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); + return (ret); } @@ -687,19 +1067,22 @@ err: /* Discard any pinned pages. */ * Return the first record. */ static int -__bam_c_first(dbp, cp) - DB *dbp; +__bam_c_first(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + /* Walk down the left-hand side of the tree. */ for (pgno = PGNO_ROOT;;) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); /* If we find a leaf page, we're done.
*/ @@ -707,28 +1090,22 @@ __bam_c_first(dbp, cp) break; pgno = GET_BINTERNAL(cp->page, 0)->pgno; - DISCARD(dbp, cp); + DISCARD(dbc, cp); } cp->pgno = cp->page->pgno; cp->indx = 0; cp->dpgno = PGNO_INVALID; - /* If it's an empty page or a deleted record, go to the next one. */ - if (NUM_ENT(cp->page) == 0 || - B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type)) - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); - - /* If it's a duplicate reference, go to the first entry. */ - if ((ret = __bam_ovfl_chk(dbp, cp, O_INDX, 0)) != 0) + /* Check for duplicates. */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) return (ret); - /* If it's a deleted record, go to the next one. */ - if (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) + /* If on an empty page or a deleted record, move to the next one. */ + if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp)) + if ((ret = __bam_c_next(dbc, cp, 0)) != 0) return (ret); + return (0); } @@ -737,19 +1114,22 @@ __bam_c_first(dbp, cp) * Return the last record. */ static int -__bam_c_last(dbp, cp) - DB *dbp; +__bam_c_last(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + /* Walk down the right-hand side of the tree. */ for (pgno = PGNO_ROOT;;) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); /* If we find a leaf page, we're done. */ @@ -758,28 +1138,22 @@ __bam_c_last(dbp, cp) pgno = GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; - DISCARD(dbp, cp); + DISCARD(dbc, cp); } cp->pgno = cp->page->pgno; cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX; cp->dpgno = PGNO_INVALID; - /* If it's an empty page or a deleted record, go to the previous one. 
*/ - if (NUM_ENT(cp->page) == 0 || - B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type)) - if ((ret = __bam_c_prev(dbp, cp)) != 0) - return (ret); - - /* If it's a duplicate reference, go to the last entry. */ - if ((ret = __bam_ovfl_chk(dbp, cp, cp->indx + O_INDX, 1)) != 0) + /* Check for duplicates. */ + if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0) return (ret); - /* If it's a deleted record, go to the previous one. */ - if (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) - if ((ret = __bam_c_prev(dbp, cp)) != 0) + /* If on an empty page or a deleted record, move to the next one. */ + if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp)) + if ((ret = __bam_c_prev(dbc, cp)) != 0) return (ret); + return (0); } @@ -788,15 +1162,18 @@ __bam_c_last(dbp, cp) * Move to the next record. */ static int -__bam_c_next(dbp, cp, initial_move) - DB *dbp; +__bam_c_next(dbc, cp, initial_move) + DBC *dbc; CURSOR *cp; int initial_move; { + DB *dbp; db_indx_t adjust, indx; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + /* * We're either moving through a page of duplicates or a btree leaf * page. @@ -812,9 +1189,9 @@ __bam_c_next(dbp, cp, initial_move) } if (cp->page == NULL) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); } @@ -832,15 +1209,13 @@ __bam_c_next(dbp, cp, initial_move) indx += adjust; for (;;) { if (indx >= NUM_ENT(cp->page)) { - pgno = cp->page->next_pgno; - DISCARD(dbp, cp); - /* * If we're in a btree leaf page, we've reached the end * of the tree. If we've reached the end of a page of * duplicates, continue from the btree leaf page where * we found this page of duplicates. */ + pgno = cp->page->next_pgno; if (pgno == PGNO_INVALID) { /* If in a btree leaf page, it's EOF. 
*/ if (cp->dpgno == PGNO_INVALID) @@ -855,20 +1230,18 @@ __bam_c_next(dbp, cp, initial_move) } else indx = 0; - if ((ret = __bam_lget(dbp, + DISCARD(dbc, cp); + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = + memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); continue; } /* Ignore deleted records. */ - if (dbp->type == DB_BTREE && - ((cp->dpgno == PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx + O_INDX)->type)) || - (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx)->type)))) { + if (IS_DELETED(cp, indx)) { indx += adjust; continue; } @@ -882,8 +1255,7 @@ __bam_c_next(dbp, cp, initial_move) cp->pgno = cp->page->pgno; cp->indx = indx; - if ((ret = - __bam_ovfl_chk(dbp, cp, indx + O_INDX, 0)) != 0) + if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0) return (ret); if (cp->dpgno != PGNO_INVALID) { indx = cp->dindx; @@ -904,14 +1276,17 @@ __bam_c_next(dbp, cp, initial_move) * Move to the previous record. */ static int -__bam_c_prev(dbp, cp) - DB *dbp; +__bam_c_prev(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; db_indx_t indx, adjust; db_pgno_t pgno; int ret, set_indx; + dbp = dbc->dbp; + /* * We're either moving through a page of duplicates or a btree leaf * page. @@ -927,9 +1302,9 @@ __bam_c_prev(dbp, cp) } if (cp->page == NULL) { if ((ret = - __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) + __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); } @@ -941,15 +1316,13 @@ __bam_c_prev(dbp, cp) */ for (;;) { if (indx == 0) { - pgno = cp->page->prev_pgno; - DISCARD(dbp, cp); - /* * If we're in a btree leaf page, we've reached the * beginning of the tree. 
If we've reached the first * of a page of duplicates, continue from the btree * leaf page where we found this page of duplicates. */ + pgno = cp->page->prev_pgno; if (pgno == PGNO_INVALID) { /* If in a btree leaf page, it's SOF. */ if (cp->dpgno == PGNO_INVALID) @@ -965,10 +1338,12 @@ __bam_c_prev(dbp, cp) } else set_indx = 1; - if ((ret = __bam_lget(dbp, + DISCARD(dbc, cp); + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = + memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); if (set_indx) @@ -979,11 +1354,7 @@ __bam_c_prev(dbp, cp) /* Ignore deleted records. */ indx -= adjust; - if (dbp->type == DB_BTREE && - ((cp->dpgno == PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx + O_INDX)->type)) || - (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, indx)->type)))) + if (IS_DELETED(cp, indx)) continue; /* @@ -995,8 +1366,7 @@ __bam_c_prev(dbp, cp) cp->pgno = cp->page->pgno; cp->indx = indx; - if ((ret = - __bam_ovfl_chk(dbp, cp, indx + O_INDX, 1)) != 0) + if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0) return (ret); if (cp->dpgno != PGNO_INVALID) { indx = cp->dindx + O_INDX; @@ -1017,499 +1387,261 @@ __bam_c_prev(dbp, cp) * Move to a specified record. */ static int -__bam_c_search(dbp, cp, key, flags, isrecno, exactp) - DB *dbp; +__bam_c_search(dbc, cp, key, flags, exactp) + DBC *dbc; CURSOR *cp; const DBT *key; u_int32_t flags; - int isrecno, *exactp; + int *exactp; { BTREE *t; + DB *dbp; + DB_LOCK lock; + PAGE *h; db_recno_t recno; - int needexact, ret; + db_indx_t indx; + u_int32_t sflags; + int cmp, needexact, ret; + dbp = dbc->dbp; t = dbp->internal; - needexact = *exactp; - /* - * Find any matching record; the search function pins the page. Make - * sure it's a valid key (__bam_search may return an index just past - * the end of a page) and return it. 
- */ - if (isrecno) { - if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) + /* Find an entry in the database. */ + switch (flags) { + case DB_SET_RECNO: + if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) return (ret); - ret = __bam_rsearch(dbp, &recno, flags, 1, exactp); - } else - ret = __bam_search(dbp, key, flags, 1, NULL, exactp); + sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; + needexact = *exactp = 1; + ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp); + break; + case DB_SET: + case DB_GET_BOTH: + sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; + needexact = *exactp = 1; + goto search; + case DB_SET_RANGE: + sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; + needexact = *exactp = 0; + goto search; + case DB_KEYFIRST: + sflags = S_KEYFIRST; + goto fast_search; + case DB_KEYLAST: + sflags = S_KEYLAST; +fast_search: needexact = *exactp = 0; + /* + * If the application has a history of inserting into the first + * or last pages of the database, we check those pages first to + * avoid doing a full search. + * + * Record numbers can't be fast-tracked, the entire tree has to + * be locked. + */ + h = NULL; + lock = LOCK_INVALID; + if (F_ISSET(dbp, DB_BT_RECNUM)) + goto search; + + /* Check if the application has a history of sorted input. */ + if (t->bt_lpgno == PGNO_INVALID) + goto search; + + /* + * Lock and retrieve the page on which we did the last insert. + * It's okay if it doesn't exist, or if it's not the page type + * we expected, it just means that the world changed. + */ + if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) + goto fast_miss; + if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h)) + goto fast_miss; + if (TYPE(h) != P_LBTREE) + goto fast_miss; + if (NUM_ENT(h) == 0) + goto fast_miss; + + /* + * What we do here is test to see if we're at the beginning or + * end of the tree and if the new item sorts before/after the + * first/last page entry. 
We don't try and catch inserts into + * the middle of the tree (although we could, as long as there + * were two keys on the page and we saved both the index and + * the page number of the last insert). + */ + if (h->next_pgno == PGNO_INVALID) { + indx = NUM_ENT(h) - P_INDX; + if ((cmp = + __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0) + goto try_begin; + if (cmp > 0) { + indx += P_INDX; + goto fast_hit; + } + + /* + * Found a duplicate. If doing DB_KEYLAST, we're at + * the correct position, otherwise, move to the first + * of the duplicates. + */ + if (flags == DB_KEYLAST) + goto fast_hit; + for (; + indx > 0 && h->inp[indx - P_INDX] == h->inp[indx]; + indx -= P_INDX) + ; + goto fast_hit; + } +try_begin: if (h->prev_pgno == PGNO_INVALID) { + indx = 0; + if ((cmp = + __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0) + goto fast_miss; + if (cmp < 0) + goto fast_hit; + /* + * Found a duplicate. If doing DB_KEYFIRST, we're at + * the correct position, otherwise, move to the last + * of the duplicates. + */ + if (flags == DB_KEYFIRST) + goto fast_hit; + for (; + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]; + indx += P_INDX) + ; + goto fast_hit; + } + goto fast_miss; + +fast_hit: /* Set the exact match flag, we may have found a duplicate. */ + *exactp = cmp == 0; + + /* Enter the entry in the stack. */ + BT_STK_CLR(cp); + BT_STK_ENTER(cp, h, indx, lock, ret); + break; + +fast_miss: if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + if (lock != LOCK_INVALID) + (void)__BT_LPUT(dbc, lock); + +search: ret = __bam_search(dbc, key, sflags, 1, NULL, exactp); + break; + default: /* XXX: Impossible. 
*/ + abort(); + /* NOTREACHED */ + } if (ret != 0) return (ret); - cp->page = t->bt_csp->page; - cp->pgno = cp->page->pgno; - cp->indx = t->bt_csp->indx; - cp->lock = t->bt_csp->lock; - cp->dpgno = PGNO_INVALID; - /* - * If we have an exact match, make sure that we're not looking at a - * chain of duplicates -- if so, move to an entry in that chain. + * Initialize the cursor to reference it. This has to be done + * before we return (even with DB_NOTFOUND) because we have to + * free the page(s) we locked in __bam_search. */ - if (*exactp) { - if ((ret = __bam_ovfl_chk(dbp, - cp, cp->indx + O_INDX, LF_ISSET(S_DUPLAST))) != 0) - return (ret); - } else - if (needexact) - return (DB_NOTFOUND); - - /* If past the end of a page, find the next entry. */ - if (cp->indx == NUM_ENT(cp->page) && - (ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + cp->lock = cp->csp->lock; + cp->dpgno = PGNO_INVALID; - /* If it's a deleted record, go to the next or previous one. */ - if (cp->dpgno != PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->dindx)->type)) { - if (flags == S_KEYLAST) { - if ((ret = __bam_c_prev(dbp, cp)) != 0) - return (ret); - } else - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); - } /* - * If we don't specify an exact match (the DB_KEYFIRST/DB_KEYLAST or - * DB_SET_RANGE flags were set) __bam_search() may return a deleted - * item. For DB_KEYFIRST/DB_KEYLAST, we don't care since we're only - * using it for a tree position. For DB_SET_RANGE, we're returning - * the key, so we have to adjust it. + * If we inserted a key into the first or last slot of the tree, + * remember where it was so we can do it more quickly next time. 
*/ - if (LF_ISSET(S_DELNO) && cp->dpgno == PGNO_INVALID && - B_DISSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type)) - if ((ret = __bam_c_next(dbp, cp, 0)) != 0) - return (ret); + if (flags == DB_KEYFIRST || flags == DB_KEYLAST) + t->bt_lpgno = + ((cp->page->next_pgno == PGNO_INVALID && + cp->indx >= NUM_ENT(cp->page)) || + (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ? + cp->pgno : PGNO_INVALID; + + /* If we need an exact match and didn't find one, we're done. */ + if (needexact && *exactp == 0) + return (DB_NOTFOUND); return (0); } /* - * __bam_ovfl_chk -- - * Check for an overflow record, and if found, move to the correct - * record. + * __bam_dup -- + * Check for an off-page duplicates entry, and if found, move to the + * first or last entry. * - * PUBLIC: int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int)); + * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int)); */ int -__bam_ovfl_chk(dbp, cp, indx, to_end) - DB *dbp; +__bam_dup(dbc, cp, indx, last_dup) + DBC *dbc; CURSOR *cp; u_int32_t indx; - int to_end; + int last_dup; { BOVERFLOW *bo; + DB *dbp; db_pgno_t pgno; int ret; - /* Check for an overflow entry. */ - bo = GET_BOVERFLOW(cp->page, indx); - if (B_TYPE(bo->type) != B_DUPLICATE) - return (0); + dbp = dbc->dbp; /* - * If we find one, go to the duplicates page, and optionally move - * to the last record on that page. + * Check for an overflow entry. If we find one, move to the + * duplicates page, and optionally move to the last record on + * that page. * - * XXX + * !!! * We don't lock duplicates pages, we've already got the correct * lock on the main page. 
*/ + bo = GET_BOVERFLOW(cp->page, indx + O_INDX); + if (B_TYPE(bo->type) != B_DUPLICATE) + return (0); + pgno = bo->pgno; if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) return (ret); cp->page = NULL; - if (to_end) { - if ((ret = __db_dend(dbp, pgno, &cp->page)) != 0) + if (last_dup) { + if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0) return (ret); indx = NUM_ENT(cp->page) - O_INDX; } else { - if ((ret = __bam_pget(dbp, &cp->page, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) return (ret); indx = 0; } - /* Update the duplicate entry in the cursor. */ + /* Update the cursor's duplicate information. */ cp->dpgno = cp->page->pgno; cp->dindx = indx; return (0); } -#ifdef DEBUG -/* - * __bam_cprint -- - * Display the current btree cursor list. - * - * PUBLIC: int __bam_cprint __P((DB *)); - */ -int -__bam_cprint(dbp) - DB *dbp; -{ - CURSOR *cp; - DBC *dbc; - - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - fprintf(stderr, - "%#0x: page: %lu index: %lu dpage %lu dindex: %lu", - (u_int)cp, (u_long)cp->pgno, (u_long)cp->indx, - (u_long)cp->dpgno, (u_long)cp->dindx); - if (F_ISSET(cp, C_DELETED)) - fprintf(stderr, "(deleted)"); - fprintf(stderr, "\n"); - } - CURSOR_TEARDOWN(dbp); - - return (0); -} -#endif /* DEBUG */ - -/* - * __bam_ca_delete -- - * Check if any of the cursors refer to the item we are about to delete, - * returning the number of cursors that refer to the item in question. - * - * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int)); - */ -int -__bam_ca_delete(dbp, pgno, indx, curs, key_delete) - DB *dbp; - db_pgno_t pgno; - u_int32_t indx; - CURSOR *curs; - int key_delete; -{ - DBC *dbc; - CURSOR *cp; - int count; /* !!!: Has to contain max number of cursors. */ - - /* - * Adjust the cursors. 
We don't have to review the cursors for any - * process other than the current one, because we have the page write - * locked at this point, and any other process had better be using a - * different locker ID, meaning that only cursors in our process can - * be on the page. - * - * It's possible for multiple cursors within the thread to have write - * locks on the same page, but, cursors within a thread must be single - * threaded, so all we're locking here is the cursor linked list. - */ - CURSOR_SETUP(dbp); - for (count = 0, dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - - /* - * Optionally, a cursor passed in is the one initiating the - * delete, so we don't want to count it or set its deleted - * flag. Otherwise, if a cursor refers to the item, then we - * set its deleted flag. - */ - if (curs == cp) - continue; - - /* - * If we're deleting the key itself and not just one of its - * duplicates, repoint the cursor to the main-page key/data - * pair, everything else is about to be discarded. - */ - if (key_delete || cp->dpgno == PGNO_INVALID) { - if (cp->pgno == pgno && cp->indx == indx) { - cp->dpgno = PGNO_INVALID; - ++count; - F_SET(cp, C_DELETED); - } - } else - if (cp->dpgno == pgno && cp->dindx == indx) { - ++count; - F_SET(cp, C_DELETED); - } - } - CURSOR_TEARDOWN(dbp); - - return (count); -} - -/* - * __bam_ca_di -- - * Adjust the cursors during a delete or insert. - * - * PUBLIC: void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); - */ -void -__bam_ca_di(dbp, pgno, indx, adjust) - DB *dbp; - db_pgno_t pgno; - u_int32_t indx; - int adjust; -{ - CURSOR *cp; - DBC *dbc; - - /* Recno is responsible for its own adjustments. */ - if (dbp->type == DB_RECNO) - return; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). 
- */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (cp->pgno == pgno && cp->indx >= indx) - cp->indx += adjust; - if (cp->dpgno == pgno && cp->dindx >= indx) - cp->dindx += adjust; - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_dup -- - * Adjust the cursors when moving data items to a duplicates page. - * - * PUBLIC: void __bam_ca_dup __P((DB *, - * PUBLIC: db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); - */ -void -__bam_ca_dup(dbp, fpgno, first, fi, tpgno, ti) - DB *dbp; - db_pgno_t fpgno, tpgno; - u_int32_t first, fi, ti; -{ - CURSOR *cp; - DBC *dbc; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * No need to test duplicates, this only gets called when moving - * leaf page data items onto a duplicates page. - */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - /* - * Ignore matching entries that have already been moved, - * we move from the same location on the leaf page more - * than once. - */ - if (cp->dpgno == PGNO_INVALID && - cp->pgno == fpgno && cp->indx == fi) { - cp->indx = first; - cp->dpgno = tpgno; - cp->dindx = ti; - } - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_move -- - * Adjust the cursors when moving data items to another page. - * - * PUBLIC: void __bam_ca_move __P((DB *, db_pgno_t, db_pgno_t)); - */ -void -__bam_ca_move(dbp, fpgno, tpgno) - DB *dbp; - db_pgno_t fpgno, tpgno; -{ - CURSOR *cp; - DBC *dbc; - - /* Recno is responsible for its own adjustments. */ - if (dbp->type == DB_RECNO) - return; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * No need to test duplicates, this only gets called when copying - * over the root page with a leaf or internal page. 
- */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (cp->pgno == fpgno) - cp->pgno = tpgno; - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_replace -- - * Check if any of the cursors refer to the item we are about to replace. - * If so, their flags should be changed from deleted to replaced. - * - * PUBLIC: void __bam_ca_replace - * PUBLIC: __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg)); - */ -void -__bam_ca_replace(dbp, pgno, indx, pass) - DB *dbp; - db_pgno_t pgno; - u_int32_t indx; - ca_replace_arg pass; -{ - CURSOR *cp; - DBC *dbc; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * Find any cursors that have logically deleted a record we're about - * to overwrite. - * - * Pass == REPLACE_SETUP: - * Set C_REPLACE_SETUP so we can find the cursors again. - * - * Pass == REPLACE_SUCCESS: - * Clear C_DELETED and C_REPLACE_SETUP, set C_REPLACE, the - * overwrite was successful. - * - * Pass == REPLACE_FAILED: - * Clear C_REPLACE_SETUP, the overwrite failed. - * - * For REPLACE_SUCCESS and REPLACE_FAILED, we reset the indx value - * for the cursor as it may have been changed by other cursor update - * routines as the item was deleted/inserted. - */ - CURSOR_SETUP(dbp); - switch (pass) { - case REPLACE_SETUP: /* Setup. */ - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if ((cp->pgno == pgno && cp->indx == indx) || - (cp->dpgno == pgno && cp->dindx == indx)) - F_SET(cp, C_REPLACE_SETUP); - } - break; - case REPLACE_SUCCESS: /* Overwrite succeeded. 
*/ - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (F_ISSET(cp, C_REPLACE_SETUP)) { - if (cp->dpgno == pgno) - cp->dindx = indx; - if (cp->pgno == pgno) - cp->indx = indx; - F_SET(cp, C_REPLACE); - F_CLR(cp, C_DELETED | C_REPLACE_SETUP); - } - } - break; - case REPLACE_FAILED: /* Overwrite failed. */ - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (F_ISSET(cp, C_REPLACE_SETUP)) { - if (cp->dpgno == pgno) - cp->dindx = indx; - if (cp->pgno == pgno) - cp->indx = indx; - F_CLR(cp, C_REPLACE_SETUP); - } - } - break; - } - CURSOR_TEARDOWN(dbp); -} - -/* - * __bam_ca_split -- - * Adjust the cursors when splitting a page. - * - * PUBLIC: void __bam_ca_split __P((DB *, - * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); - */ -void -__bam_ca_split(dbp, ppgno, lpgno, rpgno, split_indx, cleft) - DB *dbp; - db_pgno_t ppgno, lpgno, rpgno; - u_int32_t split_indx; - int cleft; -{ - DBC *dbc; - CURSOR *cp; - - /* Recno is responsible for its own adjustments. */ - if (dbp->type == DB_RECNO) - return; - - /* - * Adjust the cursors. See the comment in __bam_ca_delete(). - * - * If splitting the page that a cursor was on, the cursor has to be - * adjusted to point to the same record as before the split. Most - * of the time we don't adjust pointers to the left page, because - * we're going to copy its contents back over the original page. If - * the cursor is on the right page, it is decremented by the number of - * records split to the left page. 
- */ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (CURSOR *)dbc->internal; - if (cp->pgno == ppgno) { - if (cp->indx < split_indx) { - if (cleft) - cp->pgno = lpgno; - } else { - cp->pgno = rpgno; - cp->indx -= split_indx; - } - } - if (cp->dpgno == ppgno) { - if (cp->dindx < split_indx) { - if (cleft) - cp->dpgno = lpgno; - } else { - cp->dpgno = rpgno; - cp->dindx -= split_indx; - } - } - } - CURSOR_TEARDOWN(dbp); -} - /* * __bam_c_physdel -- * Actually do the cursor deletion. */ static int -__bam_c_physdel(dbp, cp, h) - DB *dbp; +__bam_c_physdel(dbc, cp, h) + DBC *dbc; CURSOR *cp; PAGE *h; { enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd; BOVERFLOW bo; - BTREE *t; + DB *dbp; DBT dbt; DB_LOCK lock; db_indx_t indx; db_pgno_t pgno, next_pgno, prev_pgno; int delete_page, local_page, ret; - t = dbp->internal; + dbp = dbc->dbp; + delete_page = ret = 0; /* Figure out what we're deleting. */ @@ -1522,20 +1654,37 @@ __bam_c_physdel(dbp, cp, h) } /* - * If the item is referenced by another cursor, leave it up to that - * cursor to do the delete. + * If the item is referenced by another cursor, set that cursor's + * delete flag and leave it up to it to do the delete. + * + * !!! + * This test for > 0 is a tricky. There are two ways that we can + * be called here. Either we are closing the cursor or we've moved + * off the page with the deleted entry. In the first case, we've + * already removed the cursor from the active queue, so we won't see + * it in __bam_ca_delete. In the second case, it will be on a different + * item, so we won't bother with it in __bam_ca_delete. */ - if (__bam_ca_delete(dbp, pgno, indx, cp, 0) != 0) + if (__bam_ca_delete(dbp, pgno, indx, 1) > 0) return (0); /* + * If this is concurrent DB, upgrade the lock if necessary. 
+ */ + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, + dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + + /* * If we don't already have the page locked, get it and delete the * items. */ if ((h == NULL || h->pgno != pgno)) { - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) return (ret); local_page = 1; } else @@ -1581,7 +1730,7 @@ __bam_c_physdel(dbp, cp, h) cmd = DELETE_ITEM; /* Delete the duplicate. */ - if ((ret = __db_drem(dbp, &h, indx, __bam_free)) != 0) + if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0) goto err; /* @@ -1610,7 +1759,7 @@ __bam_c_physdel(dbp, cp, h) if (local_page) { if (h != NULL) (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); local_page = 0; } @@ -1619,10 +1768,10 @@ __bam_c_physdel(dbp, cp, h) /* Acquire the parent page and switch the index to its entry. 
*/ if ((ret = - __bam_lget(dbp, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) + __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, &h, &cp->pgno, 0)) != 0) { - (void)__BT_TLPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) { + (void)__BT_TLPUT(dbc, lock); goto err; } local_page = 1; @@ -1641,12 +1790,12 @@ __bam_c_physdel(dbp, cp, h) */ indx += O_INDX; bo = *GET_BOVERFLOW(h, indx); - (void)__db_ditem(dbp, h, indx, BOVERFLOW_SIZE); + (void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE); bo.pgno = next_pgno; memset(&dbt, 0, sizeof(dbt)); dbt.data = &bo; dbt.size = BOVERFLOW_SIZE; - (void)__db_pitem(dbp, h, indx, BOVERFLOW_SIZE, &dbt, NULL); + (void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL); (void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); goto done; } @@ -1661,7 +1810,7 @@ btd: /* * set them is because we're (potentially) about to do a reverse split, * which would make our saved page information useless. * - * XXX + * !!! * The following operations to delete a page might deadlock. I think * that's OK. The problem is if we're deleting an item because we're * closing cursors because we've already deadlocked and want to call @@ -1680,37 +1829,44 @@ btd: /* /* * Do a normal btree delete. * - * XXX + * !!! * Delete the key item first, otherwise the duplicate checks in * __bam_ditem() won't work! */ - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + if ((ret = __bam_ditem(dbc, h, indx)) != 0) goto err; - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + if ((ret = __bam_ditem(dbc, h, indx)) != 0) goto err; /* Discard any remaining locks/pages. */ if (local_page) { (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); local_page = 0; } /* Delete the page if it was emptied. 
*/ if (delete_page) - ret = __bam_dpage(dbp, &dbt); + ret = __bam_dpage(dbc, &dbt); err: done: if (delete_page) - __db_free(dbt.data); + __os_free(dbt.data, dbt.size); if (local_page) { - (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + /* + * It's possible for h to be NULL, as __db_drem may have + * been relinking pages by the time that it deadlocked. + */ + if (h != NULL) + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_TLPUT(dbc, lock); } - if (ret == 0) - ++t->lstat.bt_deleted; + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); + return (ret); } @@ -1719,22 +1875,24 @@ done: if (delete_page) * Acquire a full stack for a cursor. */ static int -__bam_c_getstack(dbp, cp) - DB *dbp; +__bam_c_getstack(dbc, cp) + DBC *dbc; CURSOR *cp; { + DB *dbp; DBT dbt; PAGE *h; db_pgno_t pgno; int exact, ret; - ret = 0; + dbp = dbc->dbp; h = NULL; memset(&dbt, 0, sizeof(DBT)); + ret = 0; /* Get the page with the current item on it. */ pgno = cp->pgno; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) return (ret); /* Get a copy of a key from the page. */ @@ -1744,12 +1902,12 @@ __bam_c_getstack(dbp, cp) /* Get a write-locked stack for that page. */ exact = 0; - ret = __bam_search(dbp, &dbt, S_KEYFIRST, 1, NULL, &exact); + ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact); /* We no longer need the key or the page. 
*/ err: if (h != NULL) (void)memp_fput(dbp->mpf, h, 0); if (dbt.data != NULL) - __db_free(dbt.data); + __os_free(dbt.data, dbt.size); return (ret); } diff --git a/db2/btree/bt_delete.c b/db2/btree/bt_delete.c index 7e71037e46..d623bd8a6f 100644 --- a/db2/btree/bt_delete.c +++ b/db2/btree/bt_delete.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_delete.c 10.31 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_delete.c 10.43 (Sleepycat) 12/7/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -60,8 +60,6 @@ static const char sccsid[] = "@(#)bt_delete.c 10.31 (Sleepycat) 5/6/98"; #include "db_page.h" #include "btree.h" -static int __bam_dpages __P((DB *, BTREE *)); - /* * __bam_delete -- * Delete the items referenced by a key. @@ -69,182 +67,67 @@ static int __bam_dpages __P((DB *, BTREE *)); * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); */ int -__bam_delete(argdbp, txn, key, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key; - u_int32_t flags; -{ - BTREE *t; +__bam_delete(dbp, txn, key, flags) DB *dbp; - PAGE *h; - db_indx_t cnt, i, indx; - int dpage, exact, ret, stack; - - DEBUG_LWRITE(argdbp, txn, "bam_delete", key, NULL, flags); - - stack = 0; - - /* Check for invalid flags. */ - if ((ret = __db_delchk(argdbp, - key, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0) - return (ret); - - GETHANDLE(argdbp, txn, &dbp, ret); - t = dbp->internal; - - /* Search the tree for the key; delete only deletes exact matches. */ - if ((ret = __bam_search(dbp, key, S_DELETE, 1, NULL, &exact)) != 0) - goto err; - stack = 1; - h = t->bt_csp->page; - indx = t->bt_csp->indx; - - /* Delete the key/data pair, including any on-or-off page duplicates. 
*/ - for (cnt = 1, i = indx;; ++cnt) - if ((i += P_INDX) >= NUM_ENT(h) || h->inp[i] != h->inp[indx]) - break; - for (; cnt > 0; --cnt, ++t->lstat.bt_deleted) - if (__bam_ca_delete(dbp, h->pgno, indx, NULL, 1) == 0) { - /* - * XXX - * Delete the key item first, otherwise the duplicate - * checks in __bam_ditem() won't work! - */ - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - } else { - B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); - indx += P_INDX; - } - - /* If we're using record numbers, update internal page record counts. */ - if (F_ISSET(dbp, DB_BT_RECNUM) && (ret = __bam_adjust(dbp, t, -1)) != 0) - goto err; - - /* If the page is now empty, delete it. */ - dpage = NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT; - - __bam_stkrel(dbp); - stack = 0; - - ret = dpage ? __bam_dpage(dbp, key) : 0; - -err: if (stack) - __bam_stkrel(dbp); - PUTHANDLE(dbp); - return (ret); -} - -/* - * __ram_delete -- - * Delete the items referenced by a key. - * - * PUBLIC: int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); - */ -int -__ram_delete(argdbp, txn, key, flags) - DB *argdbp; DB_TXN *txn; DBT *key; u_int32_t flags; { - BKEYDATA bk; - BTREE *t; - DB *dbp; - DBT hdr, data; - PAGE *h; - db_indx_t indx; - db_recno_t recno; - int exact, ret, stack; + DBC *dbc; + DBT data; + u_int32_t f_init, f_next; + int ret, t_ret; - stack = 0; + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ - if ((ret = __db_delchk(argdbp, - key, flags, F_ISSET(argdbp, DB_AM_RDONLY))) != 0) + if ((ret = + __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) return (ret); - GETHANDLE(argdbp, txn, &dbp, ret); - t = dbp->internal; - - /* Check the user's record number and fill in as necessary. */ - if ((ret = __ram_getno(argdbp, key, &recno, 0)) != 0) - goto err; - - /* Search the tree for the key; delete only deletes exact matches. 
*/ - if ((ret = __bam_rsearch(dbp, &recno, S_DELETE, 1, &exact)) != 0) - goto err; - if (!exact) { - ret = DB_NOTFOUND; - goto err; - } - - h = t->bt_csp->page; - indx = t->bt_csp->indx; - stack = 1; + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); - /* If the record has already been deleted, we couldn't have found it. */ - if (B_DISSET(GET_BKEYDATA(h, indx)->type)) { - ret = DB_KEYEMPTY; - goto done; - } + DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags); /* - * If we're not renumbering records, replace the record with a marker - * and return. + * Walk a cursor through the key/data pairs, deleting as we go. Set + * the DB_DBT_USERMEM flag, as this might be a threaded application + * and the flags checking will catch us. We don't actually want the + * keys or data, so request a partial of length 0. */ - if (!F_ISSET(dbp, DB_RE_RENUMBER)) { - if ((ret = __bam_ditem(dbp, h, indx)) != 0) - goto err; - - B_TSET(bk.type, B_KEYDATA, 1); - bk.len = 0; - memset(&hdr, 0, sizeof(hdr)); - hdr.data = &bk; - hdr.size = SSZA(BKEYDATA, data); - memset(&data, 0, sizeof(data)); - data.data = (char *)""; - data.size = 0; - if ((ret = __db_pitem(dbp, - h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) - goto err; - - ++t->lstat.bt_deleted; - goto done; + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* If locking, set read-modify-write flag. */ + f_init = DB_SET; + f_next = DB_NEXT_DUP; + if (dbp->dbenv != NULL && dbp->dbenv->lk_info != NULL) { + f_init |= DB_RMW; + f_next |= DB_RMW; } - /* Delete the item. */ - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + /* Walk through the set of key/data pairs, deleting as we go. */ + if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0) goto err; - - ++t->lstat.bt_deleted; - if (t->bt_recno != NULL) - F_SET(t->bt_recno, RECNO_MODIFIED); - - /* Adjust the counts. */ - __bam_adjust(dbp, t, -1); - - /* Adjust the cursors. 
*/ - __ram_ca(dbp, recno, CA_DELETE); - - /* - * If the page is now empty, delete it -- we have the whole tree - * locked, so there are no preparations to make. Else, release - * the pages. - */ - if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { - stack = 0; - ret = __bam_dpages(dbp, t); + for (;;) { + if ((ret = dbc->c_del(dbc, 0)) != 0) + goto err; + if ((ret = dbc->c_get(dbc, key, &data, f_next)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } + goto err; + } } -done: -err: if (stack) - __bam_stkrel(dbp); +err: /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && + (ret == 0 || ret == DB_NOTFOUND)) + ret = t_ret; - PUTHANDLE(dbp); return (ret); } @@ -252,20 +135,23 @@ err: if (stack) * __bam_ditem -- * Delete one or more entries from a page. * - * PUBLIC: int __bam_ditem __P((DB *, PAGE *, u_int32_t)); + * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t)); */ int -__bam_ditem(dbp, h, indx) - DB *dbp; +__bam_ditem(dbc, h, indx) + DBC *dbc; PAGE *h; u_int32_t indx; { BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; + DB *dbp; u_int32_t nbytes; int ret; + dbp = dbc->dbp; + switch (TYPE(h)) { case P_IBTREE: bi = GET_BINTERNAL(h, indx); @@ -304,7 +190,7 @@ __bam_ditem(dbp, h, indx) */ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) && h->inp[indx] == h->inp[indx + P_INDX]) - return (__bam_adjindx(dbp, + return (__bam_adjindx(dbc, h, indx, indx + O_INDX, 0)); /* * Check for a duplicate before us on the page. It @@ -312,7 +198,7 @@ __bam_ditem(dbp, h, indx) * after the data item for the purposes of this one. */ if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) - return (__bam_adjindx(dbp, + return (__bam_adjindx(dbc, h, indx, indx - P_INDX, 0)); } /* FALLTHROUGH */ @@ -327,11 +213,11 @@ __bam_ditem(dbp, h, indx) offpage: /* Delete duplicate/offpage chains. 
*/ if (B_TYPE(bo->type) == B_DUPLICATE) { if ((ret = - __db_ddup(dbp, bo->pgno, __bam_free)) != 0) + __db_ddup(dbc, bo->pgno, __bam_free)) != 0) return (ret); } else if ((ret = - __db_doff(dbp, bo->pgno, __bam_free)) != 0) + __db_doff(dbc, bo->pgno, __bam_free)) != 0) return (ret); break; case B_KEYDATA: @@ -346,7 +232,7 @@ offpage: /* Delete duplicate/offpage chains. */ } /* Delete the item. */ - if ((ret = __db_ditem(dbp, h, indx, nbytes)) != 0) + if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0) return (ret); /* Mark the page dirty. */ @@ -357,21 +243,24 @@ offpage: /* Delete duplicate/offpage chains. */ * __bam_adjindx -- * Adjust an index on the page. * - * PUBLIC: int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int)); + * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int)); */ int -__bam_adjindx(dbp, h, indx, indx_copy, is_insert) - DB *dbp; +__bam_adjindx(dbc, h, indx, indx_copy, is_insert) + DBC *dbc; PAGE *h; u_int32_t indx, indx_copy; int is_insert; { + DB *dbp; db_indx_t copy; int ret; + dbp = dbc->dbp; + /* Log the change. */ - if (DB_LOGGING(dbp) && - (ret = __bam_adj_log(dbp->dbenv->lg_info, dbp->txn, &LSN(h), + if (DB_LOGGING(dbc) && + (ret = __bam_adj_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0) return (ret); @@ -402,22 +291,24 @@ __bam_adjindx(dbp, h, indx, indx_copy, is_insert) * __bam_dpage -- * Delete a page from the tree. * - * PUBLIC: int __bam_dpage __P((DB *, const DBT *)); + * PUBLIC: int __bam_dpage __P((DBC *, const DBT *)); */ int -__bam_dpage(dbp, key) - DB *dbp; +__bam_dpage(dbc, key) + DBC *dbc; const DBT *key; { - BTREE *t; + CURSOR *cp; + DB *dbp; DB_LOCK lock; PAGE *h; db_pgno_t pgno; int level; /* !!!: has to hold number of tree levels. 
*/ int exact, ret; + dbp = dbc->dbp; + cp = dbc->internal; ret = 0; - t = dbp->internal; /* * The locking protocol is that we acquire locks by walking down the @@ -433,40 +324,40 @@ __bam_dpage(dbp, key) for (level = LEAFLEVEL;; ++level) { /* Acquire a page and its parent, locked. */ if ((ret = - __bam_search(dbp, key, S_WRPAIR, level, NULL, &exact)) != 0) + __bam_search(dbc, key, S_WRPAIR, level, NULL, &exact)) != 0) return (ret); /* * If we reach the root or the page isn't going to be empty * when we delete one record, quit. */ - h = t->bt_csp[-1].page; + h = cp->csp[-1].page; if (h->pgno == PGNO_ROOT || NUM_ENT(h) != 1) break; /* Release the two locked pages. */ - (void)memp_fput(dbp->mpf, t->bt_csp[-1].page, 0); - (void)__BT_TLPUT(dbp, t->bt_csp[-1].lock); - (void)memp_fput(dbp->mpf, t->bt_csp[0].page, 0); - (void)__BT_TLPUT(dbp, t->bt_csp[0].lock); + (void)memp_fput(dbp->mpf, cp->csp[-1].page, 0); + (void)__BT_TLPUT(dbc, cp->csp[-1].lock); + (void)memp_fput(dbp->mpf, cp->csp[0].page, 0); + (void)__BT_TLPUT(dbc, cp->csp[0].lock); } /* * Leave the stack pointer one after the last entry, we may be about * to push more items on the stack. */ - ++t->bt_csp; + ++cp->csp; /* - * t->bt_csp[-2].page is the top page, which we're not going to delete, - * and t->bt_csp[-1].page is the first page we are going to delete. + * cp->csp[-2].page is the top page, which we're not going to delete, + * and cp->csp[-1].page is the first page we are going to delete. * * Walk down the chain, acquiring the rest of the pages until we've * retrieved the leaf page. If we find any pages that aren't going * to be emptied by the delete, someone else added something while we * were walking the tree, and we discontinue the delete. */ - for (h = t->bt_csp[-1].page;;) { + for (h = cp->csp[-1].page;;) { if (ISLEAF(h)) { if (NUM_ENT(h) != 0) goto release; @@ -482,45 +373,53 @@ __bam_dpage(dbp, key) pgno = TYPE(h) == P_IBTREE ? 
GET_BINTERNAL(h, 0)->pgno : GET_RINTERNAL(h, 0)->pgno; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) - goto release; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) goto release; - BT_STK_PUSH(t, h, 0, lock, ret); - if (ret != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) goto release; + BT_STK_PUSH(cp, h, 0, lock, ret); } - BT_STK_POP(t); - return (__bam_dpages(dbp, t)); + /* Adjust back to reference the last page on the stack. */ + BT_STK_POP(cp); + + /* Delete the pages. */ + return (__bam_dpages(dbc)); release: + /* Adjust back to reference the last page on the stack. */ + BT_STK_POP(cp); + /* Discard any locked pages and return. */ - BT_STK_POP(t); - __bam_stkrel(dbp); + __bam_stkrel(dbc, 0); + return (ret); } /* * __bam_dpages -- * Delete a set of locked pages. + * + * PUBLIC: int __bam_dpages __P((DBC *)); */ -static int -__bam_dpages(dbp, t) - DB *dbp; - BTREE *t; +int +__bam_dpages(dbc) + DBC *dbc; { + CURSOR *cp; + DB *dbp; DBT a, b; - DB_LOCK lock; + DB_LOCK c_lock, p_lock; EPG *epg; - PAGE *h; + PAGE *child, *parent; + db_indx_t nitems; db_pgno_t pgno; db_recno_t rcnt; - int ret; - - COMPQUIET(rcnt, 0); + int done, ret; - epg = t->bt_sp; + dbp = dbc->dbp; + cp = dbc->internal; + epg = cp->sp; /* * !!! @@ -533,45 +432,107 @@ __bam_dpages(dbp, t) * that we can never again access by walking down the tree. So, before * we unlink the subtree, we relink the leaf page chain. */ - if ((ret = __db_relink(dbp, t->bt_csp->page, NULL, 1)) != 0) + if ((ret = __db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0) goto release; /* - * We have the entire stack of deletable pages locked. Start from the - * top of the tree and move to the bottom, as it's better to release - * the inner pages as soon as possible. + * We have the entire stack of deletable pages locked. + * + * Delete the highest page in the tree's reference to the underlying + * stack of pages. 
Then, release that page, letting the rest of the + * tree get back to business. */ - if ((ret = __bam_ditem(dbp, epg->page, epg->indx)) != 0) - goto release; + if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) { +release: (void)__bam_stkrel(dbc, 0); + return (ret); + } + + pgno = epg->page->pgno; + nitems = NUM_ENT(epg->page); + + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__BT_TLPUT(dbc, epg->lock); + + /* + * Free the rest of the stack of pages. + * + * !!! + * Don't bother checking for errors. We've unlinked the subtree from + * the tree, and there's no possibility of recovery outside of doing + * TXN rollback. + */ + while (++epg <= cp->csp) { + /* + * Delete page entries so they will be restored as part of + * recovery. + */ + if (NUM_ENT(epg->page) != 0) + (void)__bam_ditem(dbc, epg->page, epg->indx); + + (void)__bam_free(dbc, epg->page); + (void)__BT_TLPUT(dbc, epg->lock); + } + BT_STK_CLR(cp); + + /* + * Try and collapse the tree a level -- this is only applicable + * if we've deleted the next-to-last element from the root page. + * + * There are two cases when collapsing a tree. + * + * If we've just deleted the last item from the root page, there is no + * further work to be done. The code above has emptied the root page + * and freed all pages below it. + */ + if (pgno != PGNO_ROOT || nitems != 1) + return (0); /* - * If we just deleted the last or next-to-last item from the root page, - * the tree can collapse a level. Write lock the last page referenced + * If we just deleted the next-to-last item from the root page, the + * tree can collapse one or more levels. While there remains only a + * single item on the root page, write lock the last page referenced * by the root page and copy it over the root page. If we can't get a - * write lock, that's okay, the tree just remains a level deeper than - * we'd like. + * write lock, that's okay, the tree just stays deeper than we'd like. 
*/ - h = epg->page; - if (h->pgno == PGNO_ROOT && NUM_ENT(h) <= 1) { - pgno = TYPE(epg->page) == P_IBTREE ? - GET_BINTERNAL(epg->page, 0)->pgno : - GET_RINTERNAL(epg->page, 0)->pgno; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) - goto release; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) - goto release; + for (done = 0; !done;) { + /* Initialize. */ + parent = child = NULL; + p_lock = c_lock = LOCK_INVALID; + + /* Lock the root. */ + pgno = PGNO_ROOT; + if ((ret = + __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &p_lock)) != 0) + goto stop; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0) + goto stop; + + if (NUM_ENT(parent) != 1 || + (TYPE(parent) != P_IBTREE && TYPE(parent) != P_IRECNO)) + goto stop; + + pgno = TYPE(parent) == P_IBTREE ? + GET_BINTERNAL(parent, 0)->pgno : + GET_RINTERNAL(parent, 0)->pgno; + + /* Lock the child page. */ + if ((ret = + __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &c_lock)) != 0) + goto stop; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &child)) != 0) + goto stop; /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { memset(&a, 0, sizeof(a)); - a.data = h; + a.data = child; a.size = dbp->pgsize; memset(&b, 0, sizeof(b)); - b.data = P_ENTRY(epg->page, 0); + b.data = P_ENTRY(parent, 0); b.size = BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); - __bam_rsplit_log(dbp->dbenv->lg_info, dbp->txn, - &h->lsn, 0, dbp->log_fileid, h->pgno, &a, - RE_NREC(epg->page), &b, &epg->page->lsn); + __bam_rsplit_log(dbp->dbenv->lg_info, dbc->txn, + &child->lsn, 0, dbp->log_fileid, child->pgno, &a, + RE_NREC(parent), &b, &parent->lsn); } /* @@ -579,69 +540,50 @@ __bam_dpages(dbp, t) * * One fixup -- if the tree has record numbers and we're not * converting to a leaf page, we have to preserve the total - * record count. + * record count. Note that we are about to overwrite everything + * on the parent, including its LSN. 
This is actually OK, + * because the above log message, which describes this update, + * stores its LSN on the child page. When the child is copied + * to the parent, the correct LSN is going to copied into + * place in the parent. */ - if (TYPE(h) == P_IRECNO || - (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) - rcnt = RE_NREC(epg->page); - memcpy(epg->page, h, dbp->pgsize); - epg->page->pgno = PGNO_ROOT; - if (TYPE(h) == P_IRECNO || - (TYPE(h) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) - RE_NREC_SET(epg->page, rcnt); - (void)memp_fset(dbp->mpf, epg->page, DB_MPOOL_DIRTY); + COMPQUIET(rcnt, 0); + if (TYPE(child) == P_IRECNO || + (TYPE(child) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) + rcnt = RE_NREC(parent); + memcpy(parent, child, dbp->pgsize); + parent->pgno = PGNO_ROOT; + if (TYPE(child) == P_IRECNO || + (TYPE(child) == P_IBTREE && F_ISSET(dbp, DB_BT_RECNUM))) + RE_NREC_SET(parent, rcnt); + + /* Mark the pages dirty. */ + memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY); + memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY); + + /* Adjust the cursors. */ + __bam_ca_rsplit(dbp, child->pgno, PGNO_ROOT); /* * Free the page copied onto the root page and discard its * lock. (The call to __bam_free() discards our reference * to the page.) - * - * It's possible that the reverse split we're doing involves - * pages from the stack of pages we're deleting. Don't free - * the page twice. */ - if (h->pgno == (epg + 1)->page->pgno) - (void)memp_fput(dbp->mpf, h, 0); - else { - (void)__bam_free(dbp, h); - ++t->lstat.bt_freed; - } - (void)__BT_TLPUT(dbp, lock); + (void)__bam_free(dbc, child); + child = NULL; - /* Adjust the cursors. 
*/ - __bam_ca_move(dbp, h->pgno, PGNO_ROOT); + if (0) { +stop: done = 1; + } + if (p_lock != LOCK_INVALID) + (void)__BT_TLPUT(dbc, p_lock); + if (parent != NULL) + memp_fput(dbp->mpf, parent, 0); + if (c_lock != LOCK_INVALID) + (void)__BT_TLPUT(dbc, c_lock); + if (child != NULL) + memp_fput(dbp->mpf, child, 0); } - /* Release the top page in the subtree. */ - (void)memp_fput(dbp->mpf, epg->page, 0); - (void)__BT_TLPUT(dbp, epg->lock); - - /* - * Free the rest of the pages. - * - * XXX - * Don't bother checking for errors. We've unlinked the subtree from - * the tree, and there's no possibility of recovery. - */ - while (++epg <= t->bt_csp) { - /* - * XXX - * Why do we need to do this? Isn't the page already empty? - */ - if (NUM_ENT(epg->page) != 0) - (void)__bam_ditem(dbp, epg->page, epg->indx); - - (void)__bam_free(dbp, epg->page); - (void)__BT_TLPUT(dbp, epg->lock); - ++t->lstat.bt_freed; - } return (0); - -release: - /* Discard any remaining pages and return. */ - for (; epg <= t->bt_csp; ++epg) { - (void)memp_fput(dbp->mpf, epg->page, 0); - (void)__BT_TLPUT(dbp, epg->lock); - } - return (ret); } diff --git a/db2/btree/bt_open.c b/db2/btree/bt_open.c index f5974ec61e..a89cfccb97 100644 --- a/db2/btree/bt_open.c +++ b/db2/btree/bt_open.c @@ -47,17 +47,9 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_open.c 10.27 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_open.c 10.39 (Sleepycat) 11/21/98"; #endif /* not lint */ -/* - * Implementation of btree access method for 4.4BSD. - * - * The design here was originally based on that of the btree access method - * used in the Postgres database system at UC Berkeley. This implementation - * is wholly independent of the Postgres code. 
- */ - #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> @@ -70,40 +62,34 @@ static const char sccsid[] = "@(#)bt_open.c 10.27 (Sleepycat) 5/6/98"; #include "db_page.h" #include "btree.h" -static int __bam_keyalloc __P((BTREE *)); -static int __bam_setmeta __P((DB *, BTREE *)); - /* * __bam_open -- * Open a btree. * - * PUBLIC: int __bam_open __P((DB *, DBTYPE, DB_INFO *)); + * PUBLIC: int __bam_open __P((DB *, DB_INFO *)); */ int -__bam_open(dbp, type, dbinfo) +__bam_open(dbp, dbinfo) DB *dbp; - DBTYPE type; DB_INFO *dbinfo; { BTREE *t; int ret; - /* Allocate the btree internal structure. */ - if ((t = (BTREE *)__db_calloc(1, sizeof(BTREE))) == NULL) - return (ENOMEM); - - t->bt_sp = t->bt_csp = t->bt_stack; - t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]); - - if ((type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) && - (ret = __bam_keyalloc(t)) != 0) - goto err; + /* Allocate and initialize the private btree structure. */ + if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0) + return (ret); + dbp->internal = t; /* * Intention is to make sure all of the user's selections are okay * here and then use them without checking. */ - if (dbinfo != NULL) { + if (dbinfo == NULL) { + t->bt_minkey = DEFMINKEYPAGE; + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; + } else { /* Minimum number of keys per page. */ if (dbinfo->bt_minkey == 0) t->bt_minkey = DEFMINKEYPAGE; @@ -126,152 +112,125 @@ __bam_open(dbp, type, dbinfo) * If no comparison, use default comparison. If no comparison * and no prefix, use default prefix. (We can't default the * prefix if the user supplies a comparison routine; shortening - * the keys may break their comparison algorithm.) + * the keys may break their comparison algorithm. We don't + * permit the user to specify a prefix routine if they didn't + * also specify a comparison routine, they can't know enough + * about our comparison routine to get it right.) */ - t->bt_compare = dbinfo->bt_compare == NULL ? 
- __bam_defcmp : dbinfo->bt_compare; - t->bt_prefix = dbinfo->bt_prefix == NULL ? - (dbinfo->bt_compare == NULL ? - __bam_defpfx : NULL) : dbinfo->bt_prefix; - } else { - t->bt_minkey = DEFMINKEYPAGE; - t->bt_compare = __bam_defcmp; - t->bt_prefix = __bam_defpfx; + if ((t->bt_compare = dbinfo->bt_compare) == NULL) { + if (dbinfo->bt_prefix != NULL) + goto einval; + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; + } else + t->bt_prefix = dbinfo->bt_prefix; } - /* Initialize the remaining fields of the DB. */ - dbp->type = type; - dbp->internal = t; - dbp->cursor = __bam_cursor; + /* Initialize the remaining fields/methods of the DB. */ + dbp->am_close = __bam_close; dbp->del = __bam_delete; - dbp->get = __bam_get; - dbp->put = __bam_put; dbp->stat = __bam_stat; - dbp->sync = __bam_sync; - - /* - * The btree data structure requires that at least two key/data pairs - * can fit on a page, but other than that there's no fixed requirement. - * Translate the minimum number of items into the bytes a key/data pair - * can use before being placed on an overflow page. We calculate for - * the worst possible alignment by assuming every item requires the - * maximum alignment for padding. - * - * Recno uses the btree bt_ovflsize value -- it's close enough. - */ - t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX) - - (BKEYDATA_PSIZE(0) + ALIGN(1, 4)); - /* Create a root page if new tree. */ - if ((ret = __bam_setmeta(dbp, t)) != 0) + /* Start up the tree. */ + if ((ret = __bam_read_root(dbp)) != 0) goto err; + /* Set the overflow page size. */ + __bam_setovflsize(dbp); + return (0); einval: ret = EINVAL; -err: if (t != NULL) { - /* If we allocated room for key/data return, discard it. */ - if (t->bt_rkey.data != NULL) - __db_free(t->bt_rkey.data); - - FREE(t, sizeof(BTREE)); - } +err: __os_free(t, sizeof(BTREE)); return (ret); } /* - * __bam_bdup -- - * Create a BTREE handle for a threaded DB handle. + * __bam_close -- + * Close a btree. 
* - * PUBLIC: int __bam_bdup __P((DB *, DB *)); + * PUBLIC: int __bam_close __P((DB *)); */ int -__bam_bdup(orig, new) - DB *orig, *new; +__bam_close(dbp) + DB *dbp; { - BTREE *t, *ot; - int ret; - - ot = orig->internal; - - if ((t = (BTREE *)__db_calloc(1, sizeof(*t))) == NULL) - return (ENOMEM); - - /* - * !!! - * Ignore the cursor queue, only the first DB has attached cursors. - */ + __os_free(dbp->internal, sizeof(BTREE)); + dbp->internal = NULL; - t->bt_sp = t->bt_csp = t->bt_stack; - t->bt_esp = t->bt_stack + sizeof(t->bt_stack) / sizeof(t->bt_stack[0]); + return (0); +} - if ((orig->type == DB_RECNO || F_ISSET(orig, DB_BT_RECNUM)) && - (ret = __bam_keyalloc(t)) != 0) { - FREE(t, sizeof(*t)); - return (ret); - } +/* + * __bam_setovflsize -- + * + * PUBLIC: void __bam_setovflsize __P((DB *)); + */ +void +__bam_setovflsize(dbp) + DB *dbp; +{ + BTREE *t; - t->bt_maxkey = ot->bt_maxkey; - t->bt_minkey = ot->bt_minkey; - t->bt_compare = ot->bt_compare; - t->bt_prefix = ot->bt_prefix; - t->bt_ovflsize = ot->bt_ovflsize; + t = dbp->internal; /* * !!! - * The entire RECNO structure is shared. If it breaks, the application - * was misusing it to start with. + * Correction for recno, which doesn't know anything about minimum + * keys per page. */ - t->bt_recno = ot->bt_recno; - - new->internal = t; - - return (0); -} + if (t->bt_minkey == 0) + t->bt_minkey = DEFMINKEYPAGE; -/* - * __bam_keyalloc -- - * Allocate return memory for recno keys. - */ -static int -__bam_keyalloc(t) - BTREE *t; -{ /* - * Recno keys are always the same size, and we don't want to have - * to check for space on each return. Allocate it now. + * The btree data structure requires that at least two key/data pairs + * can fit on a page, but other than that there's no fixed requirement. + * Translate the minimum number of items into the bytes a key/data pair + * can use before being placed on an overflow page. 
We calculate for + * the worst possible alignment by assuming every item requires the + * maximum alignment for padding. + * + * Recno uses the btree bt_ovflsize value -- it's close enough. */ - if ((t->bt_rkey.data = (void *)__db_malloc(sizeof(db_recno_t))) == NULL) - return (ENOMEM); - t->bt_rkey.ulen = sizeof(db_recno_t); - return (0); + t->bt_ovflsize = (dbp->pgsize - P_OVERHEAD) / (t->bt_minkey * P_INDX) + - (BKEYDATA_PSIZE(0) + ALIGN(1, 4)); } /* - * __bam_setmeta -- + * __bam_read_root -- * Check (and optionally create) a tree. + * + * PUBLIC: int __bam_read_root __P((DB *)); */ -static int -__bam_setmeta(dbp, t) +int +__bam_read_root(dbp) DB *dbp; - BTREE *t; { BTMETA *meta; - PAGE *root; + BTREE *t; + DBC *dbc; DB_LOCK metalock, rootlock; + PAGE *root; db_pgno_t pgno; - int ret; + int ret, t_ret; + + ret = 0; + t = dbp->internal; + + /* Get a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); /* Get, and optionally create the metadata page. */ pgno = PGNO_METADATA; if ((ret = - __bam_lget(dbp, 0, PGNO_METADATA, DB_LOCK_WRITE, &metalock)) != 0) - return (ret); + __bam_lget(dbc, 0, PGNO_METADATA, DB_LOCK_WRITE, &metalock)) != 0) + goto err; if ((ret = - __bam_pget(dbp, (PAGE **)&meta, &pgno, DB_MPOOL_CREATE)) != 0) { - (void)__BT_LPUT(dbp, metalock); - return (ret); + memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, (PAGE **)&meta)) != 0) { + (void)__BT_LPUT(dbc, metalock); + goto err; } /* @@ -284,8 +243,8 @@ __bam_setmeta(dbp, t) t->bt_minkey = meta->minkey; (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); - (void)__BT_LPUT(dbp, metalock); - return (0); + (void)__BT_LPUT(dbc, metalock); + goto done; } /* Initialize the tree structure metadata information. */ @@ -308,16 +267,16 @@ __bam_setmeta(dbp, t) F_SET(meta, BTM_RECNUM); if (F_ISSET(dbp, DB_RE_RENUMBER)) F_SET(meta, BTM_RENUMBER); - memcpy(meta->uid, dbp->lock.fileid, DB_FILE_ID_LEN); + memcpy(meta->uid, dbp->fileid, DB_FILE_ID_LEN); /* Create and initialize a root page. 
*/ pgno = PGNO_ROOT; if ((ret = - __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_WRITE, &rootlock)) != 0) - return (ret); - if ((ret = __bam_pget(dbp, &root, &pgno, DB_MPOOL_CREATE)) != 0) { - (void)__BT_LPUT(dbp, rootlock); - return (ret); + __bam_lget(dbc, 0, PGNO_ROOT, DB_LOCK_WRITE, &rootlock)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &root)) != 0) { + (void)__BT_LPUT(dbc, rootlock); + goto err; } P_INIT(root, dbp->pgsize, PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, 1, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE); @@ -325,9 +284,9 @@ __bam_setmeta(dbp, t) /* Release the metadata and root pages. */ if ((ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0) - return (ret); + goto err; if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0) - return (ret); + goto err; /* * Flush the metadata and root pages to disk -- since the user can't @@ -341,8 +300,11 @@ __bam_setmeta(dbp, t) ret = EINVAL; /* Release the locks. */ - (void)__BT_LPUT(dbp, metalock); - (void)__BT_LPUT(dbp, rootlock); + (void)__BT_LPUT(dbc, metalock); + (void)__BT_LPUT(dbc, rootlock); +err: +done: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } diff --git a/db2/btree/bt_page.c b/db2/btree/bt_page.c index 87f2811398..6ccd68a5ab 100644 --- a/db2/btree/bt_page.c +++ b/db2/btree/bt_page.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_page.c 10.12 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_page.c 10.17 (Sleepycat) 1/3/99"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -65,45 +65,47 @@ static const char sccsid[] = "@(#)bt_page.c 10.12 (Sleepycat) 5/6/98"; * __bam_new -- * Get a new page, preferably from the freelist. 
* - * PUBLIC: int __bam_new __P((DB *, u_int32_t, PAGE **)); + * PUBLIC: int __bam_new __P((DBC *, u_int32_t, PAGE **)); */ int -__bam_new(dbp, type, pagepp) - DB *dbp; +__bam_new(dbc, type, pagepp) + DBC *dbc; u_int32_t type; PAGE **pagepp; { BTMETA *meta; + DB *dbp; DB_LOCK metalock; PAGE *h; db_pgno_t pgno; int ret; + dbp = dbc->dbp; meta = NULL; h = NULL; metalock = LOCK_INVALID; pgno = PGNO_METADATA; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) goto err; if (meta->free == PGNO_INVALID) { - if ((ret = __bam_pget(dbp, &h, &pgno, DB_MPOOL_NEW)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_NEW, &h)) != 0) goto err; ZERO_LSN(h->lsn); h->pgno = pgno; } else { pgno = meta->free; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) goto err; meta->free = h->next_pgno; } /* Log the change. */ - if (DB_LOGGING(dbp)) { - if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbp->txn, + if (DB_LOGGING(dbc)) { + if ((ret = __bam_pg_alloc_log(dbp->dbenv->lg_info, dbc->txn, &meta->lsn, 0, dbp->log_fileid, &meta->lsn, &h->lsn, h->pgno, (u_int32_t)type, meta->free)) != 0) goto err; @@ -111,7 +113,7 @@ __bam_new(dbp, type, pagepp) } (void)memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, metalock); + (void)__BT_TLPUT(dbc, metalock); P_INIT(h, dbp->pgsize, h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type); *pagepp = h; @@ -122,28 +124,45 @@ err: if (h != NULL) if (meta != NULL) (void)memp_fput(dbp->mpf, meta, 0); if (metalock != LOCK_INVALID) - (void)__BT_TLPUT(dbp, metalock); + (void)__BT_TLPUT(dbc, metalock); return (ret); } /* + * __bam_lput -- + * The standard lock put call. 
+ * + * PUBLIC: int __bam_lput __P((DBC *, DB_LOCK)); + */ +int +__bam_lput(dbc, lock) + DBC *dbc; + DB_LOCK lock; +{ + return (__BT_LPUT(dbc, lock)); +} + +/* * __bam_free -- * Add a page to the head of the freelist. * - * PUBLIC: int __bam_free __P((DB *, PAGE *)); + * PUBLIC: int __bam_free __P((DBC *, PAGE *)); */ int -__bam_free(dbp, h) - DB *dbp; +__bam_free(dbc, h) + DBC *dbc; PAGE *h; { BTMETA *meta; + DB *dbp; DBT ldbt; DB_LOCK metalock; db_pgno_t pgno; u_int32_t dirty_flag; int ret, t_ret; + dbp = dbc->dbp; + /* * Retrieve the metadata page and insert the page at the head of * the free list. If either the lock get or page get routines @@ -152,23 +171,23 @@ __bam_free(dbp, h) */ dirty_flag = 0; pgno = PGNO_METADATA; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &metalock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) { - (void)__BT_TLPUT(dbp, metalock); + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) { + (void)__BT_TLPUT(dbc, metalock); goto err; } /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { memset(&ldbt, 0, sizeof(ldbt)); ldbt.data = h; ldbt.size = P_OVERHEAD; if ((ret = __bam_pg_free_log(dbp->dbenv->lg_info, - dbp->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno, + dbc->txn, &meta->lsn, 0, dbp->log_fileid, h->pgno, &meta->lsn, &ldbt, meta->free)) != 0) { (void)memp_fput(dbp->mpf, (PAGE *)meta, 0); - (void)__BT_TLPUT(dbp, metalock); + (void)__BT_TLPUT(dbc, metalock); return (ret); } LSN(h) = LSN(meta); @@ -182,7 +201,7 @@ __bam_free(dbp, h) { db_pgno_t __pgno; DB_LSN __lsn; __pgno = h->pgno; __lsn = h->lsn; - memset(h, 0xff, dbp->pgsize); + memset(h, 0xdb, dbp->pgsize); h->pgno = __pgno; h->lsn = __lsn; } @@ -194,7 +213,7 @@ __bam_free(dbp, h) /* Discard the metadata page. 
*/ ret = memp_fput(dbp->mpf, (PAGE *)meta, DB_MPOOL_DIRTY); - if ((t_ret = __BT_TLPUT(dbp, metalock)) != 0) + if ((t_ret = __BT_TLPUT(dbc, metalock)) != 0) ret = t_ret; /* Discard the caller's page reference. */ @@ -212,19 +231,21 @@ err: if ((t_ret = memp_fput(dbp->mpf, h, dirty_flag)) != 0 && ret == 0) #ifdef DEBUG /* * __bam_lt -- - * Print out the list of currently held locks. + * Print out the list of locks currently held by a cursor. * - * PUBLIC: int __bam_lt __P((DB *)); + * PUBLIC: int __bam_lt __P((DBC *)); */ int -__bam_lt(dbp) - DB *dbp; +__bam_lt(dbc) + DBC *dbc; { + DB *dbp; DB_LOCKREQ req; + dbp = dbc->dbp; if (F_ISSET(dbp, DB_AM_LOCKING)) { req.op = DB_LOCK_DUMP; - lock_vec(dbp->dbenv->lk_info, dbp->locker, 0, &req, 1, NULL); + lock_vec(dbp->dbenv->lk_info, dbc->locker, 0, &req, 1, NULL); } return (0); } @@ -234,27 +255,29 @@ __bam_lt(dbp) * __bam_lget -- * The standard lock get call. * - * PUBLIC: int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); + * PUBLIC: int __bam_lget + * PUBLIC: __P((DBC *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); */ int -__bam_lget(dbp, do_couple, pgno, mode, lockp) - DB *dbp; +__bam_lget(dbc, do_couple, pgno, mode, lockp) + DBC *dbc; int do_couple; db_pgno_t pgno; db_lockmode_t mode; DB_LOCK *lockp; { + DB *dbp; DB_LOCKREQ couple[2]; - u_int32_t locker; int ret; + dbp = dbc->dbp; + if (!F_ISSET(dbp, DB_AM_LOCKING)) { *lockp = LOCK_INVALID; return (0); } - locker = dbp->txn == NULL ? 
dbp->locker : dbp->txn->txnid; - dbp->lock.pgno = pgno; + dbc->lock.pgno = pgno; /* * If the object not currently locked, acquire the lock and return, @@ -263,54 +286,32 @@ __bam_lget(dbp, do_couple, pgno, mode, lockp) */ if (do_couple) { couple[0].op = DB_LOCK_GET; - couple[0].obj = &dbp->lock_dbt; + couple[0].obj = &dbc->lock_dbt; couple[0].mode = mode; couple[1].op = DB_LOCK_PUT; couple[1].lock = *lockp; - ret = lock_vec(dbp->dbenv->lk_info, locker, 0, couple, 2, NULL); + if (dbc->txn == NULL) + ret = lock_vec(dbp->dbenv->lk_info, + dbc->locker, 0, couple, 2, NULL); + else + ret = lock_tvec(dbp->dbenv->lk_info, + dbc->txn, 0, couple, 2, NULL); if (ret != 0) { /* If we fail, discard the lock we held. */ - __bam_lput(dbp, *lockp); + __BT_LPUT(dbc, *lockp); return (ret < 0 ? EAGAIN : ret); } *lockp = couple[0].lock; } else { - ret = lock_get(dbp->dbenv->lk_info, - locker, 0, &dbp->lock_dbt, mode, lockp); + if (dbc->txn == NULL) + ret = lock_get(dbp->dbenv->lk_info, + dbc->locker, 0, &dbc->lock_dbt, mode, lockp); + else + ret = lock_tget(dbp->dbenv->lk_info, + dbc->txn, 0, &dbc->lock_dbt, mode, lockp); return (ret < 0 ? EAGAIN : ret); } return (0); } - -/* - * __bam_lput -- - * The standard lock put call. - * - * PUBLIC: int __bam_lput __P((DB *, DB_LOCK)); - */ -int -__bam_lput(dbp, lock) - DB *dbp; - DB_LOCK lock; -{ - return (__BT_LPUT(dbp, lock)); -} - -/* - * __bam_pget -- - * The standard page get call. - * - * PUBLIC: int __bam_pget __P((DB *, PAGE **, db_pgno_t *, u_int32_t)); - */ -int -__bam_pget(dbp, hp, pgnop, mpool_flags) - DB *dbp; - PAGE **hp; - db_pgno_t *pgnop; - u_int32_t mpool_flags; -{ - return (memp_fget((dbp)->mpf, - pgnop, mpool_flags, hp) == 0 ? 
0 : __db_pgerr(dbp, *pgnop)); -} diff --git a/db2/btree/bt_put.c b/db2/btree/bt_put.c index a93faac98c..0d7a69889a 100644 --- a/db2/btree/bt_put.c +++ b/db2/btree/bt_put.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_put.c 10.45 (Sleepycat) 5/25/98"; +static const char sccsid[] = "@(#)bt_put.c 10.54 (Sleepycat) 12/6/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -61,372 +61,23 @@ static const char sccsid[] = "@(#)bt_put.c 10.45 (Sleepycat) 5/25/98"; #include "db_page.h" #include "btree.h" -static int __bam_fixed __P((BTREE *, DBT *)); -static int __bam_isdeleted __P((DB *, PAGE *, u_int32_t, int *)); -static int __bam_lookup __P((DB *, DBT *, int *)); -static int __bam_ndup __P((DB *, PAGE *, u_int32_t)); -static int __bam_ovput __P((DB *, PAGE *, u_int32_t, DBT *)); -static int __bam_partial __P((DB *, DBT *, PAGE *, u_int32_t, u_int32_t)); +static int __bam_fixed __P((DBC *, DBT *)); +static int __bam_ndup __P((DBC *, PAGE *, u_int32_t)); +static int __bam_ovput __P((DBC *, PAGE *, u_int32_t, DBT *)); +static int __bam_partial __P((DBC *, + DBT *, PAGE *, u_int32_t, u_int32_t, u_int32_t)); static u_int32_t __bam_partsize __P((DBT *, PAGE *, u_int32_t)); /* - * __bam_put -- - * Add a new key/data pair or replace an existing pair (btree). - * - * PUBLIC: int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); - */ -int -__bam_put(argdbp, txn, key, data, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key, *data; - u_int32_t flags; -{ - BTREE *t; - CURSOR c; - DB *dbp; - PAGE *h; - db_indx_t indx; - u_int32_t iitem_flags, insert_flags; - int exact, isdeleted, newkey, ret, stack; - - DEBUG_LWRITE(argdbp, txn, "bam_put", key, data, flags); - - /* Check flags. */ - if ((ret = __db_putchk(argdbp, key, data, flags, - F_ISSET(argdbp, DB_AM_RDONLY), F_ISSET(argdbp, DB_AM_DUP))) != 0) - return (ret); - - GETHANDLE(argdbp, txn, &dbp, ret); - t = dbp->internal; - -retry: /* - * Find the location at which to insert. 
The call to __bam_lookup - * leaves the returned page pinned. - */ - if ((ret = __bam_lookup(dbp, key, &exact)) != 0) { - PUTHANDLE(dbp); - return (ret); - } - h = t->bt_csp->page; - indx = t->bt_csp->indx; - stack = 1; - - /* - * If DB_NOOVERWRITE is set and there's an identical key in the tree, - * return an error unless the data item has already been marked for - * deletion, or, all the remaining data items have already been marked - * for deletion in the case of duplicates. If all the data items have - * been marked for deletion, we do a replace, otherwise, it has to be - * a set of duplicates, and we simply append a new one to the set. - */ - isdeleted = 0; - if (exact) { - if ((ret = __bam_isdeleted(dbp, h, indx, &isdeleted)) != 0) - goto err; - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); - else - if (flags == DB_NOOVERWRITE) { - ret = DB_KEYEXIST; - goto err; - } - } - - /* - * If we're inserting into the first or last page of the tree, - * remember where we did it so we can do fast lookup next time. - * - * XXX - * Does reverse order still work (did it ever!?!?) - */ - t->bt_lpgno = - h->next_pgno == PGNO_INVALID || h->prev_pgno == PGNO_INVALID ? - h->pgno : PGNO_INVALID; - - /* - * Select the arguments for __bam_iitem() and do the insert. If the - * key is an exact match, we're either adding a new duplicate at the - * end of the duplicate set, or we're replacing the data item with a - * new data item. If the key isn't an exact match, we're inserting - * a new key/data pair, before the search location. - */ - newkey = dbp->type == DB_BTREE && !exact; - if (exact) { - if (!isdeleted && F_ISSET(dbp, DB_AM_DUP)) { - /* - * Make sure that we're not looking at a page of - * duplicates -- if so, move to the last entry on - * that page. 
- */ - c.page = h; - c.pgno = h->pgno; - c.indx = indx; - c.dpgno = PGNO_INVALID; - c.dindx = 0; - if ((ret = - __bam_ovfl_chk(dbp, &c, indx + O_INDX, 1)) != 0) - goto err; - if (c.dpgno != PGNO_INVALID) { - /* - * XXX - * The __bam_ovfl_chk() routine memp_fput() the - * current page and acquired a new one, but did - * not do anything about the lock we're holding. - */ - t->bt_csp->page = h = c.page; - indx = c.dindx; - } - insert_flags = DB_AFTER; - } else - insert_flags = DB_CURRENT; - } else - insert_flags = DB_BEFORE; - - /* - * The pages we're using may be modified by __bam_iitem(), so make - * sure we reset the stack. - */ - iitem_flags = 0; - if (newkey) - iitem_flags |= BI_NEWKEY; - if (isdeleted) - iitem_flags |= BI_DOINCR; - ret = __bam_iitem(dbp, &h, &indx, key, data, insert_flags, iitem_flags); - t->bt_csp->page = h; - t->bt_csp->indx = indx; - - switch (ret) { - case 0: - /* Done. Clean up the cursor. */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS); - break; - case DB_NEEDSPLIT: - /* - * We have to split the page. Back out the cursor setup, - * discard the stack of pages, and do the split. - */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - - (void)__bam_stkrel(dbp); - stack = 0; - - if ((ret = __bam_split(dbp, key)) != 0) - break; - - goto retry; - /* NOTREACHED */ - default: - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - break; - } - -err: if (stack) - (void)__bam_stkrel(dbp); - - PUTHANDLE(dbp); - return (ret); -} - -/* - * __bam_isdeleted -- - * Return if the only remaining data item for the element has been - * deleted. 
- */ -static int -__bam_isdeleted(dbp, h, indx, isdeletedp) - DB *dbp; - PAGE *h; - u_int32_t indx; - int *isdeletedp; -{ - BKEYDATA *bk; - db_pgno_t pgno; - int ret; - - *isdeletedp = 1; - for (;;) { - bk = GET_BKEYDATA(h, indx + O_INDX); - switch (B_TYPE(bk->type)) { - case B_KEYDATA: - case B_OVERFLOW: - if (!B_DISSET(bk->type)) { - *isdeletedp = 0; - return (0); - } - break; - case B_DUPLICATE: - /* - * If the data item referencing the off-page duplicates - * is flagged as deleted, we're done. Else, we have to - * walk the chain of duplicate pages. - */ - if (B_DISSET(bk->type)) - return (0); - goto dupchk; - default: - return (__db_pgfmt(dbp, h->pgno)); - } - - /* - * If there are no more on-page duplicate items, then every - * data item for this key must have been deleted. - */ - if (indx + P_INDX >= (u_int32_t)NUM_ENT(h)) - return (0); - if (h->inp[indx] != h->inp[indx + P_INDX]) - return (0); - - /* Check the next item. */ - indx += P_INDX; - } - /* NOTREACHED */ - -dupchk: /* Check a chain of duplicate pages. */ - pgno = ((BOVERFLOW *)bk)->pgno; - for (;;) { - /* Acquire the next page in the duplicate chain. */ - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) - return (ret); - - /* Check each item for a delete flag. */ - for (indx = 0; indx < NUM_ENT(h); ++indx) - if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) { - *isdeletedp = 0; - goto done; - } - /* - * If we reach the end of the duplicate pages, then every - * item we reviewed must have been deleted. - */ - if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID) - goto done; - - (void)memp_fput(dbp->mpf, h, 0); - } - /* NOTREACHED */ - -done: (void)memp_fput(dbp->mpf, h, 0); - return (0); -} - -/* - * __bam_lookup -- - * Find the right location in the tree for the key. 
- */ -static int -__bam_lookup(dbp, key, exactp) - DB *dbp; - DBT *key; - int *exactp; -{ - BTREE *t; - DB_LOCK lock; - EPG e; - PAGE *h; - db_indx_t indx; - int cmp, ret; - - t = dbp->internal; - h = NULL; - - /* - * Record numbers can't be fast-tracked, we have to lock the entire - * tree. - */ - if (F_ISSET(dbp, DB_BT_RECNUM)) - goto slow; - - /* Check to see if we've been seeing sorted input. */ - if (t->bt_lpgno == PGNO_INVALID) - goto slow; - - /* - * Retrieve the page on which we did the last insert. It's okay if - * it doesn't exist, or if it's not the page type we expect, it just - * means that the world changed. - */ - if (__bam_lget(dbp, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) - goto miss; - if (__bam_pget(dbp, &h, &t->bt_lpgno, 0)) { - (void)__BT_LPUT(dbp, lock); - goto miss; - } - if (TYPE(h) != P_LBTREE) - goto miss; - if (NUM_ENT(h) == 0) - goto miss; - - /* - * We have to be at the end or beginning of the tree to know that - * we're inserting in a sort order. If that's the case and we're - * in the right order in comparison to the first/last key/data pair, - * we have the right position. - */ - if (h->next_pgno == PGNO_INVALID) { - e.page = h; - e.indx = NUM_ENT(h) - P_INDX; - if ((cmp = __bam_cmp(dbp, key, &e)) >= 0) { - if (cmp > 0) - e.indx += P_INDX; - goto fast; - } - } - if (h->prev_pgno == PGNO_INVALID) { - e.page = h; - e.indx = 0; - if ((cmp = __bam_cmp(dbp, key, &e)) <= 0) { - /* - * We're doing a put, so we want to insert as the last - * of any set of duplicates. - */ - if (cmp == 0) { - for (indx = 0; - indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && - h->inp[indx] == h->inp[indx + P_INDX]; - indx += P_INDX) - ; - e.indx = indx; - } - goto fast; - } - } - goto miss; - - /* Set the exact match flag in case we've already inserted this key. */ -fast: *exactp = cmp == 0; - - /* Enter the entry in the stack. 
*/ - BT_STK_CLR(t); - BT_STK_ENTER(t, e.page, e.indx, lock, ret); - if (ret != 0) - return (ret); - - ++t->lstat.bt_cache_hit; - return (0); - -miss: ++t->lstat.bt_cache_miss; - if (h != NULL) { - (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); - } - -slow: return (__bam_search(dbp, key, S_INSERT, 1, NULL, exactp)); -} - -/* * __bam_iitem -- * Insert an item into the tree. * - * PUBLIC: int __bam_iitem __P((DB *, + * PUBLIC: int __bam_iitem __P((DBC *, * PUBLIC: PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t)); */ int -__bam_iitem(dbp, hp, indxp, key, data, op, flags) - DB *dbp; +__bam_iitem(dbc, hp, indxp, key, data, op, flags) + DBC *dbc; PAGE **hp; db_indx_t *indxp; DBT *key, *data; @@ -434,6 +85,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) { BTREE *t; BKEYDATA *bk; + DB *dbp; DBT tdbt; PAGE *h; db_indx_t indx, nbytes; @@ -442,6 +94,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) COMPQUIET(bk, NULL); + dbp = dbc->dbp; t = dbp->internal; h = *hp; indx = *indxp; @@ -473,21 +126,21 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) default: return (__db_pgfmt(dbp, h->pgno)); } - if ((ret = __db_ditem(dbp, *hp, *indxp, nbytes)) != 0) + if ((ret = __db_ditem(dbc, *hp, *indxp, nbytes)) != 0) return (ret); } /* Put the new/replacement item onto the page. */ - if ((ret = __db_dput(dbp, data, hp, indxp, __bam_new)) != 0) + if ((ret = __db_dput(dbc, data, hp, indxp, __bam_new)) != 0) return (ret); goto done; } /* Handle fixed-length records: build the real record. */ - if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->bt_recno->re_len) { + if (F_ISSET(dbp, DB_RE_FIXEDLEN) && data->size != t->recno->re_len) { tdbt = *data; - if ((ret = __bam_fixed(t, &tdbt)) != 0) + if ((ret = __bam_fixed(dbc, &tdbt)) != 0) return (ret); data = &tdbt; } @@ -554,7 +207,8 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) /* Handle partial puts: build the real record. 
*/ if (F_ISSET(data, DB_DBT_PARTIAL)) { tdbt = *data; - if ((ret = __bam_partial(dbp, &tdbt, h, indx, data_size)) != 0) + if ((ret = __bam_partial(dbc, + &tdbt, h, indx, data_size, flags)) != 0) return (ret); data = &tdbt; } @@ -583,10 +237,10 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) /* Add the key. */ if (bigkey) { - if ((ret = __bam_ovput(dbp, h, indx, key)) != 0) + if ((ret = __bam_ovput(dbc, h, indx, key)) != 0) return (ret); } else - if ((ret = __db_pitem(dbp, h, indx, + if ((ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(key->size), NULL, key)) != 0) return (ret); ++indx; @@ -598,7 +252,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * Adjust the cursor and copy in the key for * the duplicate. */ - if ((ret = __bam_adjindx(dbp, + if ((ret = __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0) return (ret); @@ -620,7 +274,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * the duplicate. */ if ((ret = - __bam_adjindx(dbp, h, indx, indx, 1)) != 0) + __bam_adjindx(dbc, h, indx, indx, 1)) != 0) return (ret); ++indx; @@ -639,7 +293,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) * delete and then re-add the item. */ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) { - if ((ret = __bam_ditem(dbp, h, indx)) != 0) + if ((ret = __bam_ditem(dbc, h, indx)) != 0) return (ret); break; } @@ -654,7 +308,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) /* Add the data. 
*/ if (bigdata) { - if ((ret = __bam_ovput(dbp, h, indx, data)) != 0) + if ((ret = __bam_ovput(dbc, h, indx, data)) != 0) return (ret); } else { BKEYDATA __bk; @@ -665,12 +319,12 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) __bk.len = data->size; __hdr.data = &__bk; __hdr.size = SSZA(BKEYDATA, data); - ret = __db_pitem(dbp, h, indx, + ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(data->size), &__hdr, data); } else if (replace) - ret = __bam_ritem(dbp, h, indx, data); + ret = __bam_ritem(dbc, h, indx, data); else - ret = __db_pitem(dbp, h, indx, + ret = __db_pitem(dbc, h, indx, BKEYDATA_SIZE(data->size), NULL, data); if (ret != 0) return (ret); @@ -686,7 +340,7 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) */ if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) { --indx; - if ((ret = __bam_ndup(dbp, h, indx)) != 0) + if ((ret = __bam_ndup(dbc, h, indx)) != 0) return (ret); } @@ -700,14 +354,12 @@ __bam_iitem(dbp, hp, indxp, key, data, op, flags) done: if (LF_ISSET(BI_DOINCR) || (op != DB_CURRENT && (F_ISSET(dbp, DB_BT_RECNUM) || dbp->type == DB_RECNO))) - if ((ret = __bam_adjust(dbp, t, 1)) != 0) + if ((ret = __bam_adjust(dbc, 1)) != 0) return (ret); /* If we've modified a recno file, set the flag */ - if (t->bt_recno != NULL) - F_SET(t->bt_recno, RECNO_MODIFIED); - - ++t->lstat.bt_added; + if (t->recno != NULL) + F_SET(t->recno, RECNO_MODIFIED); return (ret); } @@ -770,7 +422,7 @@ __bam_partsize(data, h, indx) memset(&__hdr, 0, sizeof(__hdr)); \ __hdr.data = &bo; \ __hdr.size = BOVERFLOW_SIZE; \ - if ((ret = __db_pitem(dbp, \ + if ((ret = __db_pitem(dbc, \ h, indx, BOVERFLOW_SIZE, &__hdr, NULL)) != 0) \ return (ret); \ } while (0) @@ -780,8 +432,8 @@ __bam_partsize(data, h, indx) * Build an overflow item and put it on the page. 
*/ static int -__bam_ovput(dbp, h, indx, item) - DB *dbp; +__bam_ovput(dbc, h, indx, item) + DBC *dbc; PAGE *h; u_int32_t indx; DBT *item; @@ -789,10 +441,12 @@ __bam_ovput(dbp, h, indx, item) BOVERFLOW bo; int ret; + UMRW(bo.unused1); B_TSET(bo.type, B_OVERFLOW, 0); - bo.tlen = item->size; - if ((ret = __db_poff(dbp, item, &bo.pgno, __bam_new)) != 0) + UMRW(bo.unused2); + if ((ret = __db_poff(dbc, item, &bo.pgno, __bam_new)) != 0) return (ret); + bo.tlen = item->size; OVPUT(h, indx, bo); @@ -803,22 +457,25 @@ __bam_ovput(dbp, h, indx, item) * __bam_ritem -- * Replace an item on a page. * - * PUBLIC: int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *)); + * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *)); */ int -__bam_ritem(dbp, h, indx, data) - DB *dbp; +__bam_ritem(dbc, h, indx, data) + DBC *dbc; PAGE *h; u_int32_t indx; DBT *data; { BKEYDATA *bk; + DB *dbp; DBT orig, repl; db_indx_t cnt, lo, ln, min, off, prefix, suffix; int32_t nbytes; int ret; u_int8_t *p, *t; + dbp = dbc->dbp; + /* * Replace a single item onto a page. The logic figuring out where * to insert and whether it fits is handled in the caller. All we do @@ -827,7 +484,7 @@ __bam_ritem(dbp, h, indx, data) bk = GET_BKEYDATA(h, indx); /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { /* * We might as well check to see if the two data items share * a common prefix and suffix -- it can save us a lot of log @@ -851,7 +508,7 @@ __bam_ritem(dbp, h, indx, data) orig.size = bk->len - (prefix + suffix); repl.data = (u_int8_t *)data->data + prefix; repl.size = data->size - (prefix + suffix); - if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __bam_repl_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type), &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0) @@ -907,18 +564,21 @@ __bam_ritem(dbp, h, indx, data) * If it should, create it. 
*/ static int -__bam_ndup(dbp, h, indx) - DB *dbp; +__bam_ndup(dbc, h, indx) + DBC *dbc; PAGE *h; u_int32_t indx; { BKEYDATA *bk; BOVERFLOW bo; + DB *dbp; DBT hdr; PAGE *cp; db_indx_t cnt, cpindx, first, sz; int ret; + dbp = dbc->dbp; + while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) indx -= P_INDX; for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) { @@ -941,7 +601,7 @@ __bam_ndup(dbp, h, indx) return (0); /* Get a new page. */ - if ((ret = __bam_new(dbp, P_DUPLICATE, &cp)) != 0) + if ((ret = __bam_new(dbc, P_DUPLICATE, &cp)) != 0) return (ret); /* @@ -957,7 +617,7 @@ __bam_ndup(dbp, h, indx) hdr.size = B_TYPE(bk->type) == B_KEYDATA ? BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; if ((ret = - __db_pitem(dbp, cp, cpindx, hdr.size, &hdr, NULL)) != 0) + __db_pitem(dbc, cp, cpindx, hdr.size, &hdr, NULL)) != 0) goto err; /* @@ -970,18 +630,20 @@ __bam_ndup(dbp, h, indx) PGNO(h), first, indx - O_INDX, PGNO(cp), cpindx); /* Delete the data item. */ - if ((ret = __db_ditem(dbp, h, indx, hdr.size)) != 0) + if ((ret = __db_ditem(dbc, h, indx, hdr.size)) != 0) goto err; /* Delete all but the first reference to the key. */ if (--cnt == 0) break; - if ((ret = __bam_adjindx(dbp, h, indx, first, 0)) != 0) + if ((ret = __bam_adjindx(dbc, h, indx, first, 0)) != 0) goto err; } /* Put in a new data item that points to the duplicates page. */ + UMRW(bo.unused1); B_TSET(bo.type, B_DUPLICATE, 0); + UMRW(bo.unused2); bo.pgno = cp->pgno; bo.tlen = 0; @@ -989,7 +651,7 @@ __bam_ndup(dbp, h, indx) return (memp_fput(dbp->mpf, cp, DB_MPOOL_DIRTY)); -err: (void)__bam_free(dbp, cp); +err: (void)__bam_free(dbc, cp); return (ret); } @@ -998,13 +660,16 @@ err: (void)__bam_free(dbp, cp); * Build the real record for a fixed length put. 
*/ static int -__bam_fixed(t, dbt) - BTREE *t; +__bam_fixed(dbc, dbt) + DBC *dbc; DBT *dbt; { + DB *dbp; RECNO *rp; + int ret; - rp = t->bt_recno; + dbp = dbc->dbp; + rp = ((BTREE *)dbp->internal)->recno; /* * If database contains fixed-length records, and the record is long, @@ -1018,29 +683,27 @@ __bam_fixed(t, dbt) * short. Pad it out. We use the record data return memory, it's * only a short-term use. */ - if (t->bt_rdata.ulen < rp->re_len) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? - (void *)__db_malloc(rp->re_len) : - (void *)__db_realloc(t->bt_rdata.data, rp->re_len); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + if (dbc->rdata.ulen < rp->re_len) { + if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = rp->re_len; + dbc->rdata.ulen = rp->re_len; } - memcpy(t->bt_rdata.data, dbt->data, dbt->size); - memset((u_int8_t *)t->bt_rdata.data + dbt->size, + memcpy(dbc->rdata.data, dbt->data, dbt->size); + memset((u_int8_t *)dbc->rdata.data + dbt->size, rp->re_pad, rp->re_len - dbt->size); /* * Clean up our flags and other information just in case, and * change the caller's DBT to reference our created record. */ - t->bt_rdata.size = rp->re_len; - t->bt_rdata.dlen = 0; - t->bt_rdata.doff = 0; - t->bt_rdata.flags = 0; - *dbt = t->bt_rdata; + dbc->rdata.size = rp->re_len; + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; + *dbt = dbc->rdata; return (0); } @@ -1050,15 +713,15 @@ __bam_fixed(t, dbt) * Build the real record for a partial put. 
*/ static int -__bam_partial(dbp, dbt, h, indx, nbytes) - DB *dbp; +__bam_partial(dbc, dbt, h, indx, nbytes, flags) + DBC *dbc; DBT *dbt; PAGE *h; - u_int32_t indx, nbytes; + u_int32_t indx, nbytes, flags; { - BTREE *t; BKEYDATA *bk, tbk; BOVERFLOW *bo; + DB *dbp; DBT copy; u_int32_t len, tlen; u_int8_t *p; @@ -1066,18 +729,34 @@ __bam_partial(dbp, dbt, h, indx, nbytes) COMPQUIET(bo, NULL); - t = dbp->internal; + dbp = dbc->dbp; /* We use the record data return memory, it's only a short-term use. */ - if (t->bt_rdata.ulen < nbytes) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? - (void *)__db_malloc(nbytes) : - (void *)__db_realloc(t->bt_rdata.data, nbytes); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + if (dbc->rdata.ulen < nbytes) { + if ((ret = __os_realloc(&dbc->rdata.data, nbytes)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = nbytes; + dbc->rdata.ulen = nbytes; + } + + /* + * We use nul bytes for any part of the record that isn't specified; + * get it over with. + */ + memset(dbc->rdata.data, 0, nbytes); + + /* + * In the next clauses, we need to do three things: a) set p to point + * to the place at which to copy the user's data, b) set tlen to the + * total length of the record, not including the bytes contributed by + * the user, and c) copy any valid data from an existing record. + */ + if (LF_ISSET(BI_NEWKEY)) { + tlen = dbt->doff; + p = (u_int8_t *)dbc->rdata.data + dbt->doff; + goto ucopy; } /* Find the current record. */ @@ -1089,13 +768,6 @@ __bam_partial(dbp, dbt, h, indx, nbytes) B_TSET(bk->type, B_KEYDATA, 0); bk->len = 0; } - - /* - * We use nul bytes for any part of the record that isn't specified, - * get it over with. 
- */ - memset(t->bt_rdata.data, 0, nbytes); - if (B_TYPE(bk->type) == B_OVERFLOW) { /* * In the case of an overflow record, we shift things around @@ -1103,12 +775,12 @@ __bam_partial(dbp, dbt, h, indx, nbytes) */ memset(&copy, 0, sizeof(copy)); if ((ret = __db_goff(dbp, &copy, bo->tlen, - bo->pgno, &t->bt_rdata.data, &t->bt_rdata.ulen)) != 0) + bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) return (ret); /* Skip any leading data from the original record. */ tlen = dbt->doff; - p = (u_int8_t *)t->bt_rdata.data + dbt->doff; + p = (u_int8_t *)dbc->rdata.data + dbt->doff; /* * Copy in any trailing data from the original record. @@ -1127,20 +799,12 @@ __bam_partial(dbp, dbt, h, indx, nbytes) memmove(p + dbt->size, p + dbt->dlen, len); tlen += len; } - - /* Copy in the application provided data. */ - memcpy(p, dbt->data, dbt->size); - tlen += dbt->size; } else { /* Copy in any leading data from the original record. */ - memcpy(t->bt_rdata.data, + memcpy(dbc->rdata.data, bk->data, dbt->doff > bk->len ? bk->len : dbt->doff); tlen = dbt->doff; - p = (u_int8_t *)t->bt_rdata.data + dbt->doff; - - /* Copy in the application provided data. */ - memcpy(p, dbt->data, dbt->size); - tlen += dbt->size; + p = (u_int8_t *)dbc->rdata.data + dbt->doff; /* Copy in any trailing data from the original record. */ len = dbt->doff + dbt->dlen; @@ -1150,11 +814,18 @@ __bam_partial(dbp, dbt, h, indx, nbytes) } } +ucopy: /* + * Copy in the application provided data -- p and tlen must have been + * initialized above. + */ + memcpy(p, dbt->data, dbt->size); + tlen += dbt->size; + /* Set the DBT to reference our new record. 
*/ - t->bt_rdata.size = tlen; - t->bt_rdata.dlen = 0; - t->bt_rdata.doff = 0; - t->bt_rdata.flags = 0; - *dbt = t->bt_rdata; + dbc->rdata.size = tlen; + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; + *dbt = dbc->rdata; return (0); } diff --git a/db2/btree/bt_rec.c b/db2/btree/bt_rec.c index fe33825ec4..de6b3b7d0e 100644 --- a/db2/btree/bt_rec.c +++ b/db2/btree/bt_rec.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_rec.c 10.21 (Sleepycat) 4/28/98"; +static const char sccsid[] = "@(#)bt_rec.c 10.28 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -45,7 +45,8 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) BTMETA *meta; DB_MPOOLFILE *mpf; PAGE *pagep; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; db_pgno_t pgno; int cmp_n, cmp_p, modified, ret; @@ -101,7 +102,6 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) modified = 1; } if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); (void)memp_fput(mpf, meta, 0); goto out; } @@ -121,12 +121,10 @@ __bam_pg_alloc_recover(logp, dbtp, lsnp, redo, info) meta->free = argp->pgno; modified = 1; } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; ret = 0; out: REC_CLOSE; @@ -149,7 +147,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info) { __bam_pg_free_args *argp; BTMETA *meta; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; db_pgno_t pgno; @@ -192,10 +191,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info) modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; - } /* * Fix up the metadata page. If we're redoing or undoing the operation @@ -224,10 +221,8 @@ __bam_pg_free_recover(logp, dbtp, lsnp, redo, info) meta->lsn = argp->meta_lsn; modified = 1; } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } done: *lsnp = argp->prev_lsn; ret = 0; @@ -251,7 +246,8 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_split_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; db_pgno_t pgno; @@ -310,12 +306,9 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) goto done; /* Allocate and initialize new left/right child pages. */ - if ((_lp = (PAGE *)__db_malloc(file_dbp->pgsize)) == NULL || - (_rp = (PAGE *)__db_malloc(file_dbp->pgsize)) == NULL) { - ret = ENOMEM; - __db_err(file_dbp->dbenv, "%s", strerror(ret)); + if ((ret = __os_malloc(file_dbp->pgsize, NULL, &_lp)) != 0 || + (ret = __os_malloc(file_dbp->pgsize, NULL, &_rp)) != 0) goto out; - } if (rootsplit) { P_INIT(_lp, file_dbp->pgsize, argp->left, PGNO_INVALID, @@ -352,7 +345,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) memcpy(lp, _lp, file_dbp->pgsize); lp->lsn = *lsnp; if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; lp = NULL; } @@ -367,7 +360,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) memcpy(rp, _rp, file_dbp->pgsize); rp->lsn = *lsnp; if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; rp = NULL; } @@ -392,7 +385,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) __bam_total(_lp) + __bam_total(_rp) : 0); pp->lsn = *lsnp; if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; pp = NULL; } @@ -412,9 +405,9 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) if (log_compare(&LSN(np), &argp->nlsn) == 0) { 
PREV_PGNO(np) = argp->right; np->lsn = *lsnp; - if ((ret = memp_fput(mpf, - np, DB_MPOOL_DIRTY)) != 0) - goto fatal; + if ((ret = + memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0) + goto out; np = NULL; } } @@ -433,7 +426,7 @@ __bam_split_recover(logp, dbtp, lsnp, redo, info) if (log_compare(lsnp, &LSN(pp)) == 0) { memcpy(pp, argp->pg.data, argp->pg.size); if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; pp = NULL; } @@ -451,7 +444,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { lp->lsn = argp->llsn; if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; lp = NULL; } if (rp != NULL && @@ -459,7 +452,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { rp->lsn = argp->rlsn; if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) - goto fatal; + goto out; rp = NULL; } } @@ -481,7 +474,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { PREV_PGNO(np) = argp->left; np->lsn = argp->nlsn; if (memp_fput(mpf, np, DB_MPOOL_DIRTY)) - goto fatal; + goto out; np = NULL; } } @@ -490,9 +483,6 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { done: *lsnp = argp->prev_lsn; ret = 0; - if (0) { -fatal: (void)__db_panic(file_dbp); - } out: /* Free any pages that weren't dirtied. */ if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0) ret = t_ret; @@ -505,9 +495,9 @@ out: /* Free any pages that weren't dirtied. */ /* Free any allocated space. */ if (_lp != NULL) - __db_free(_lp); + __os_free(_lp, file_dbp->pgsize); if (_rp != NULL) - __db_free(_rp); + __os_free(_rp, file_dbp->pgsize); REC_CLOSE; } @@ -528,7 +518,8 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_rsplit_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; db_pgno_t pgno; @@ -558,16 +549,14 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info) P_INIT(pagep, file_dbp->pgsize, PGNO_ROOT, argp->nrec, PGNO_INVALID, pagep->level + 1, file_dbp->type == DB_BTREE ? 
P_IBTREE : P_IRECNO); - if ((ret = __db_pitem(file_dbp, pagep, 0, + if ((ret = __db_pitem(dbc, pagep, 0, argp->rootent.size, &argp->rootent, NULL)) != 0) goto out; pagep->lsn = argp->rootlsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } /* * Fix the page copied over the root page. It's possible that the @@ -592,10 +581,8 @@ __bam_rsplit_recover(logp, dbtp, lsnp, redo, info) memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } done: *lsnp = argp->prev_lsn; ret = 0; @@ -619,7 +606,8 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_adj_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int cmp_n, cmp_p, modified, ret; @@ -640,7 +628,7 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info) cmp_p = log_compare(&LSN(pagep), &argp->lsn); if (cmp_p == 0 && redo) { /* Need to redo update described. */ - if ((ret = __bam_adjindx(file_dbp, + if ((ret = __bam_adjindx(dbc, pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0) goto err; @@ -648,7 +636,7 @@ __bam_adj_recover(logp, dbtp, lsnp, redo, info) modified = 1; } else if (cmp_n == 0 && !redo) { /* Need to undo update described. 
*/ - if ((ret = __bam_adjindx(file_dbp, + if ((ret = __bam_adjindx(dbc, pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0) goto err; @@ -684,7 +672,8 @@ __bam_cadjust_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_cadjust_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int cmp_n, cmp_p, modified, ret; @@ -760,7 +749,8 @@ __bam_cdel_recover(logp, dbtp, lsnp, redo, info) void *info; { __bam_cdel_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int cmp_n, cmp_p, modified, ret; @@ -781,13 +771,19 @@ __bam_cdel_recover(logp, dbtp, lsnp, redo, info) cmp_p = log_compare(&LSN(pagep), &argp->lsn); if (cmp_p == 0 && redo) { /* Need to redo update described. */ - B_DSET(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); + if (pagep->type == P_DUPLICATE) + B_DSET(GET_BKEYDATA(pagep, argp->indx)->type); + else + B_DSET(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); LSN(pagep) = *lsnp; modified = 1; } else if (cmp_n == 0 && !redo) { /* Need to undo update described. 
*/ - B_DCLR(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); + if (pagep->type == P_DUPLICATE) + B_DCLR(GET_BKEYDATA(pagep, argp->indx)->type); + else + B_DCLR(GET_BKEYDATA(pagep, argp->indx + O_INDX)->type); LSN(pagep) = argp->lsn; modified = 1; @@ -818,7 +814,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) { __bam_repl_args *argp; BKEYDATA *bk; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DBT dbt; DB_MPOOLFILE *mpf; PAGE *pagep; @@ -848,10 +845,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) */ memset(&dbt, 0, sizeof(dbt)); dbt.size = argp->prefix + argp->suffix + argp->repl.size; - if ((dbt.data = __db_malloc(dbt.size)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(dbt.size, NULL, &dbt.data)) != 0) goto err; - } p = dbt.data; memcpy(p, bk->data, argp->prefix); p += argp->prefix; @@ -859,8 +854,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) p += argp->repl.size; memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); - ret = __bam_ritem(file_dbp, pagep, argp->indx, &dbt); - __db_free(dbt.data); + ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); + __os_free(dbt.data, dbt.size); if (ret != 0) goto err; @@ -874,10 +869,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) */ memset(&dbt, 0, sizeof(dbt)); dbt.size = argp->prefix + argp->suffix + argp->orig.size; - if ((dbt.data = __db_malloc(dbt.size)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(dbt.size, NULL, &dbt.data)) != 0) goto err; - } p = dbt.data; memcpy(p, bk->data, argp->prefix); p += argp->prefix; @@ -885,8 +878,8 @@ __bam_repl_recover(logp, dbtp, lsnp, redo, info) p += argp->orig.size; memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); - ret = __bam_ritem(file_dbp, pagep, argp->indx, &dbt); - __db_free(dbt.data); + ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); + __os_free(dbt.data, dbt.size); if (ret != 0) goto err; diff --git a/db2/btree/bt_recno.c b/db2/btree/bt_recno.c index 38dbbd1c55..c69877ff7f 100644 --- a/db2/btree/bt_recno.c +++ 
b/db2/btree/bt_recno.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_recno.c 10.37 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)bt_recno.c 10.53 (Sleepycat) 12/11/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -22,64 +22,89 @@ static const char sccsid[] = "@(#)bt_recno.c 10.37 (Sleepycat) 5/23/98"; #include "db_int.h" #include "db_page.h" #include "btree.h" - -static int __ram_add __P((DB *, db_recno_t *, DBT *, u_int32_t, u_int32_t)); -static int __ram_c_close __P((DBC *)); -static int __ram_c_del __P((DBC *, u_int32_t)); -static int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __ram_fmap __P((DB *, db_recno_t)); -static int __ram_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -static int __ram_iget __P((DB *, DBT *, DBT *)); +#include "db_ext.h" +#include "shqueue.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" + +static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t)); +static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); +static int __ram_fmap __P((DBC *, db_recno_t)); +static int __ram_i_delete __P((DBC *)); static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); static int __ram_source __P((DB *, RECNO *, const char *)); static int __ram_sync __P((DB *, u_int32_t)); -static int __ram_update __P((DB *, db_recno_t, int)); -static int __ram_vmap __P((DB *, db_recno_t)); -static int __ram_writeback __P((DB *)); +static int __ram_update __P((DBC *, db_recno_t, int)); +static int __ram_vmap __P((DBC *, db_recno_t)); +static int __ram_writeback __P((DBC *)); /* - * If we're renumbering records, then we have to detect in the cursor that a - * record was deleted, and adjust the cursor as necessary. If not renumbering - * records, then we can detect this by looking at the actual record, so we - * ignore the cursor delete flag. 
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're + * re-numbering records, it means the record was implicitly created. We skip + * over implicitly created records if doing a cursor "next" or "prev", and + * return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering + * records, it means that the record was implicitly created, or was deleted. + * We skip over implicitly created or deleted records if doing a cursor "next" + * or "prev", and return DB_KEYEMPTY if they're explicitly requested. + * + * If we're re-numbering records, then we have to detect in the cursor that + * a record was deleted, and adjust the cursor as necessary on the next get. + * If we're not re-numbering records, then we can detect that a record has + * been deleted by looking at the actual on-page record, so we completely + * ignore the cursor's delete flag. This is different from the B+tree code. + * It also maintains whether the cursor references a deleted record in the + * cursor, and it doesn't always check the on-page value. */ #define CD_SET(dbp, cp) { \ if (F_ISSET(dbp, DB_RE_RENUMBER)) \ - F_SET(cp, CR_DELETED); \ + F_SET(cp, C_DELETED); \ } #define CD_CLR(dbp, cp) { \ if (F_ISSET(dbp, DB_RE_RENUMBER)) \ - F_CLR(cp, CR_DELETED); \ + F_CLR(cp, C_DELETED); \ } #define CD_ISSET(dbp, cp) \ - (F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, CR_DELETED)) + (F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, C_DELETED)) /* * __ram_open -- * Recno open function. * - * PUBLIC: int __ram_open __P((DB *, DBTYPE, DB_INFO *)); + * PUBLIC: int __ram_open __P((DB *, DB_INFO *)); */ int -__ram_open(dbp, type, dbinfo) +__ram_open(dbp, dbinfo) DB *dbp; - DBTYPE type; DB_INFO *dbinfo; { BTREE *t; + DBC *dbc; RECNO *rp; - int ret; - - COMPQUIET(type, DB_RECNO); + int ret, t_ret; - ret = 0; + /* Allocate and initialize the private btree structure. 
*/ + if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0) + return (ret); + dbp->internal = t; + __bam_setovflsize(dbp); - /* Allocate and initialize the private RECNO structure. */ - if ((rp = (RECNO *)__db_calloc(1, sizeof(*rp))) == NULL) - return (ENOMEM); + /* Allocate and initialize the private recno structure. */ + if ((ret = __os_calloc(1, sizeof(*rp), &rp)) != 0) + return (ret); + /* Link in the private recno structure. */ + t->recno = rp; - if (dbinfo != NULL) { + /* + * Intention is to make sure all of the user's selections are okay + * here and then use them without checking. + */ + if (dbinfo == NULL) { + rp->re_delim = '\n'; + rp->re_pad = ' '; + rp->re_fd = -1; + F_SET(rp, RECNO_EOF); + } else { /* * If the user specified a source tree, open it and map it in. * @@ -111,31 +136,40 @@ __ram_open(dbp, type, dbinfo) } } else rp->re_len = 0; - } else { - rp->re_delim = '\n'; - rp->re_pad = ' '; - rp->re_fd = -1; - F_SET(rp, RECNO_EOF); } - /* Open the underlying btree. */ - if ((ret = __bam_open(dbp, DB_RECNO, dbinfo)) != 0) - goto err; - - /* Set the routines necessary to make it look like a recno tree. */ - dbp->cursor = __ram_cursor; + /* Initialize the remaining fields/methods of the DB. */ + dbp->am_close = __ram_close; dbp->del = __ram_delete; - dbp->get = __ram_get; dbp->put = __ram_put; + dbp->stat = __bam_stat; dbp->sync = __ram_sync; - /* Link in the private recno structure. */ - ((BTREE *)dbp->internal)->bt_recno = rp; + /* Start up the tree. */ + if ((ret = __bam_read_root(dbp)) != 0) + goto err; + + /* Set the overflow page size. */ + __bam_setovflsize(dbp); /* If we're snapshotting an underlying source file, do it now. */ - if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) - if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) { + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + goto err; + + /* Do the snapshot. 
*/ + if ((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND) + ret = 0; + + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + if (ret != 0) goto err; + } return (0); @@ -145,143 +179,169 @@ err: /* If we mmap'd a source file, discard it. */ /* If we opened a source file, discard it. */ if (rp->re_fd != -1) - (void)__db_close(rp->re_fd); + (void)__os_close(rp->re_fd); if (rp->re_source != NULL) - FREES(rp->re_source); - - /* If we allocated room for key/data return, discard it. */ - t = dbp->internal; - if (t != NULL && t->bt_rkey.data != NULL) - __db_free(t->bt_rkey.data); + __os_freestr(rp->re_source); - FREE(rp, sizeof(*rp)); + __os_free(rp, sizeof(*rp)); return (ret); } /* - * __ram_cursor -- - * Recno db->cursor function. - * - * PUBLIC: int __ram_cursor __P((DB *, DB_TXN *, DBC **)); + * __ram_delete -- + * Recno db->del function. */ -int -__ram_cursor(dbp, txn, dbcp) +static int +__ram_delete(dbp, txn, key, flags) DB *dbp; DB_TXN *txn; - DBC **dbcp; + DBT *key; + u_int32_t flags; { - RCURSOR *cp; + CURSOR *cp; DBC *dbc; + db_recno_t recno; + int ret, t_ret; - DEBUG_LWRITE(dbp, txn, "ram_cursor", NULL, NULL, 0); - - if ((dbc = (DBC *)__db_calloc(1, sizeof(DBC))) == NULL) - return (ENOMEM); - if ((cp = (RCURSOR *)__db_calloc(1, sizeof(RCURSOR))) == NULL) { - __db_free(dbc); - return (ENOMEM); - } - - cp->dbc = dbc; - cp->recno = RECNO_OOB; - - dbc->dbp = dbp; - dbc->txn = txn; - dbc->internal = cp; - dbc->c_close = __ram_c_close; - dbc->c_del = __ram_c_del; - dbc->c_get = __ram_c_get; - dbc->c_put = __ram_c_put; - - /* - * All cursors are queued from the master DB structure. Add the - * cursor to that queue. - */ - CURSOR_SETUP(dbp); - TAILQ_INSERT_HEAD(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); + DB_PANIC_CHECK(dbp); - *dbcp = dbc; - return (0); -} + /* Check for invalid flags. 
*/ + if ((ret = __db_delchk(dbp, + key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + return (ret); -/* - * __ram_get -- - * Recno db->get function. - */ -static int -__ram_get(argdbp, txn, key, data, flags) - DB *argdbp; - DB_TXN *txn; - DBT *key, *data; - u_int32_t flags; -{ - DB *dbp; - int ret; + /* Acquire a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); - DEBUG_LWRITE(argdbp, txn, "ram_get", key, NULL, flags); + DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags); - /* Check for invalid flags. */ - if ((ret = __db_getchk(argdbp, key, data, flags)) != 0) - return (ret); + /* Check the user's record number and fill in as necessary. */ + if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) + goto err; - GETHANDLE(argdbp, txn, &dbp, ret); + /* Do the delete. */ + cp = dbc->internal; + cp->recno = recno; + ret = __ram_i_delete(dbc); - ret = __ram_iget(dbp, key, data); + /* Release the cursor. */ +err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; - PUTHANDLE(dbp); return (ret); } /* - * __ram_iget -- - * Internal ram get function, called for both standard and cursor - * get after the flags have been checked. + * __ram_i_delete -- + * Internal version of recno delete, called by __ram_delete and + * __ram_c_del. */ static int -__ram_iget(dbp, key, data) - DB *dbp; - DBT *key, *data; +__ram_i_delete(dbc) + DBC *dbc; { + BKEYDATA bk; BTREE *t; + CURSOR *cp; + DB *dbp; + DBT hdr, data; PAGE *h; db_indx_t indx; - db_recno_t recno; int exact, ret, stack; - stack = 0; + dbp = dbc->dbp; + cp = dbc->internal; t = dbp->internal; + stack = 0; - /* Check the user's record number and fill in as necessary. */ - if ((ret = __ram_getno(dbp, key, &recno, 0)) != 0) - goto done; + /* + * If this is CDB and this isn't a write cursor, then it's an error. + * If it is a write cursor, but we don't yet hold the write lock, then + * we need to upgrade to the write lock. 
+ */ + if (F_ISSET(dbp, DB_AM_CDB)) { + /* Make sure it's a valid update cursor. */ + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } - /* Search the tree for the record. */ - if ((ret = __bam_rsearch(dbp, &recno, S_FIND, 1, &exact)) != 0) - goto done; - if (!exact) - return (DB_NOTFOUND); + /* Search the tree for the key; delete only deletes exact matches. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } stack = 1; - h = t->bt_csp->page; - indx = t->bt_csp->indx; + h = cp->csp->page; + indx = cp->csp->indx; - /* If the record has already been deleted, we couldn't have found it. */ + /* + * If re-numbering records, the on-page deleted flag can only mean + * that this record was implicitly created. Applications aren't + * permitted to delete records they never created, return an error. + * + * If not re-numbering records, the on-page deleted flag means that + * this record was implicitly created, or, was deleted at some time. + * The former is an error because applications aren't permitted to + * delete records they never created, the latter is an error because + * if the record was "deleted", we could never have found it. + */ if (B_DISSET(GET_BKEYDATA(h, indx)->type)) { ret = DB_KEYEMPTY; - goto done; + goto err; } - /* Return the data item. */ - ret = __db_ret(dbp, - h, indx, data, &t->bt_rdata.data, &t->bt_rdata.ulen); - ++t->lstat.bt_get; + if (F_ISSET(dbp, DB_RE_RENUMBER)) { + /* Delete the item, adjust the counts, adjust the cursors. */ + if ((ret = __bam_ditem(dbc, h, indx)) != 0) + goto err; + __bam_adjust(dbc, -1); + __ram_ca(dbp, cp->recno, CA_DELETE); + + /* + * If the page is empty, delete it. The whole tree is locked + * so there are no preparations to make. 
+ */ + if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) { + stack = 0; + ret = __bam_dpages(dbc); + } + } else { + /* Use a delete/put pair to replace the record with a marker. */ + if ((ret = __bam_ditem(dbc, h, indx)) != 0) + goto err; + + B_TSET(bk.type, B_KEYDATA, 1); + bk.len = 0; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bk; + hdr.size = SSZA(BKEYDATA, data); + memset(&data, 0, sizeof(data)); + data.data = (char *)""; + data.size = 0; + if ((ret = __db_pitem(dbc, + h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) + goto err; + } + F_SET(t->recno, RECNO_MODIFIED); -done: /* Discard the stack. */ - if (stack) - __bam_stkrel(dbp); +err: if (stack) + __bam_stkrel(dbc, 0); + /* If we upgraded the CDB lock upon entry; downgrade it now. */ + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); return (ret); } @@ -290,46 +350,50 @@ done: /* Discard the stack. */ * Recno db->put function. */ static int -__ram_put(argdbp, txn, key, data, flags) - DB *argdbp; +__ram_put(dbp, txn, key, data, flags) + DB *dbp; DB_TXN *txn; DBT *key, *data; u_int32_t flags; { - BTREE *t; - DB *dbp; + DBC *dbc; db_recno_t recno; - int ret; + int ret, t_ret; - DEBUG_LWRITE(argdbp, txn, "ram_put", key, data, flags); + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ - if ((ret = __db_putchk(argdbp, - key, data, flags, F_ISSET(argdbp, DB_AM_RDONLY), 0)) != 0) + if ((ret = __db_putchk(dbp, + key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0) + return (ret); + + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) return (ret); - GETHANDLE(argdbp, txn, &dbp, ret); + DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags); /* * If we're appending to the tree, make sure we've read in all of * the backing source file. Otherwise, check the user's record * number and fill in as necessary. */ - ret = LF_ISSET(DB_APPEND) ? 
- __ram_snapshot(dbp) : __ram_getno(dbp, key, &recno, 1); + ret = flags == DB_APPEND ? + __ram_update(dbc, DB_MAX_RECORDS, 0) : + __ram_getno(dbc, key, &recno, 1); /* Add the record. */ if (ret == 0) - ret = __ram_add(dbp, &recno, data, flags, 0); + ret = __ram_add(dbc, &recno, data, flags, 0); - /* If we're appending to the tree, we have to return the record. */ - if (ret == 0 && LF_ISSET(DB_APPEND)) { - t = dbp->internal; - ret = __db_retcopy(key, &recno, sizeof(recno), - &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc); - } + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Return the record number if we're appending to the tree. */ + if (ret == 0 && flags == DB_APPEND) + *(db_recno_t *)key->data = recno; - PUTHANDLE(dbp); return (ret); } @@ -338,23 +402,35 @@ __ram_put(argdbp, txn, key, data, flags) * Recno db->sync function. */ static int -__ram_sync(argdbp, flags) - DB *argdbp; +__ram_sync(dbp, flags) + DB *dbp; u_int32_t flags; { - DB *dbp; - int ret; + DBC *dbc; + int ret, t_ret; - DEBUG_LWRITE(argdbp, NULL, "ram_sync", NULL, NULL, flags); + /* + * Sync the underlying btree. + * + * !!! + * We don't need to do a panic check or flags check, the "real" + * sync function does all that for us. + */ + if ((ret = __db_sync(dbp, flags)) != 0) + return (ret); - /* Sync the underlying btree. */ - if ((ret = __bam_sync(argdbp, flags)) != 0) + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) return (ret); + DEBUG_LWRITE(dbc, NULL, "ram_sync", NULL, NULL, flags); + /* Copy back the backing source file. */ - GETHANDLE(argdbp, NULL, &dbp, ret); - ret = __ram_writeback(dbp); - PUTHANDLE(dbp); + ret = __ram_writeback(dbc); + + /* Discard the cursor. 
*/ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } @@ -366,14 +442,12 @@ __ram_sync(argdbp, flags) * PUBLIC: int __ram_close __P((DB *)); */ int -__ram_close(argdbp) - DB *argdbp; +__ram_close(dbp) + DB *dbp; { RECNO *rp; - DEBUG_LWRITE(argdbp, NULL, "ram_close", NULL, NULL, 0); - - rp = ((BTREE *)argdbp->internal)->bt_recno; + rp = ((BTREE *)dbp->internal)->recno; /* Close any underlying mmap region. */ if (rp->re_smap != NULL) @@ -381,136 +455,133 @@ __ram_close(argdbp) /* Close any backing source file descriptor. */ if (rp->re_fd != -1) - (void)__db_close(rp->re_fd); + (void)__os_close(rp->re_fd); /* Free any backing source file name. */ if (rp->re_source != NULL) - FREES(rp->re_source); + __os_freestr(rp->re_source); /* Free allocated memory. */ - FREE(rp, sizeof(RECNO)); - ((BTREE *)argdbp->internal)->bt_recno = NULL; + __os_free(rp, sizeof(RECNO)); + ((BTREE *)dbp->internal)->recno = NULL; /* Close the underlying btree. */ - return (__bam_close(argdbp)); -} - -/* - * __ram_c_close -- - * Recno cursor->close function. - */ -static int -__ram_c_close(dbc) - DBC *dbc; -{ - DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_close", NULL, NULL, 0); - - return (__ram_c_iclose(dbc->dbp, dbc)); -} - -/* - * __ram_c_iclose -- - * Close a single cursor -- internal version. - * - * PUBLIC: int __ram_c_iclose __P((DB *, DBC *)); - */ -int -__ram_c_iclose(dbp, dbc) - DB *dbp; - DBC *dbc; -{ - /* Remove the cursor from the queue. */ - CURSOR_SETUP(dbp); - TAILQ_REMOVE(&dbp->curs_queue, dbc, links); - CURSOR_TEARDOWN(dbp); - - /* Discard the structures. */ - FREE(dbc->internal, sizeof(RCURSOR)); - FREE(dbc, sizeof(DBC)); - - return (0); + return (__bam_close(dbp)); } /* * __ram_c_del -- * Recno cursor->c_del function. 
+ * + * PUBLIC: int __ram_c_del __P((DBC *, u_int32_t)); */ -static int +int __ram_c_del(dbc, flags) DBC *dbc; u_int32_t flags; { - DBT key; - RCURSOR *cp; + CURSOR *cp; + DB *dbp; int ret; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_del", NULL, NULL, flags); - + dbp = dbc->dbp; cp = dbc->internal; + DB_PANIC_CHECK(dbp); + /* Check for invalid flags. */ - if ((ret = __db_cdelchk(dbc->dbp, flags, - F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) + if ((ret = __db_cdelchk(dbp, flags, + F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) return (ret); - /* If already deleted, return failure. */ - if (CD_ISSET(dbc->dbp, cp)) - return (DB_KEYEMPTY); + DEBUG_LWRITE(dbc, dbc->txn, "ram_c_del", NULL, NULL, flags); - /* Build a normal delete request. */ - memset(&key, 0, sizeof(key)); - key.data = &cp->recno; - key.size = sizeof(db_recno_t); - if ((ret = __ram_delete(dbc->dbp, dbc->txn, &key, 0)) == 0) - CD_SET(dbc->dbp, cp); + /* + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. + */ + if (F_ISSET(dbp, DB_AM_CDB)) + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); - return (ret); + /* + * The semantics of cursors during delete are as follows: if record + * numbers are mutable (DB_RE_RENUMBER is set), deleting a record + * causes the cursor to automatically point to the record immediately + * following. In this case it is possible to use a single cursor for + * repeated delete operations, without intervening operations. + * + * If record numbers are not mutable, then records are replaced with + * a marker containing a delete flag. If the record referenced by + * this cursor has already been deleted, we will detect that as part + * of the delete operation, and fail. + */ + return (__ram_i_delete(dbc)); } /* * __ram_c_get -- * Recno cursor->c_get function. 
+ * + * PUBLIC: int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); */ -static int +int __ram_c_get(dbc, key, data, flags) DBC *dbc; DBT *key, *data; u_int32_t flags; { - BTREE *t; + CURSOR *cp, copy; DB *dbp; - RCURSOR *cp, copy; - int ret; - - DEBUG_LREAD(dbc->dbp, dbc->txn, "ram_c_get", - flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, - NULL, flags); + PAGE *h; + db_indx_t indx; + int exact, ret, stack, tmp_rmw; - cp = dbc->internal; dbp = dbc->dbp; + cp = dbc->internal; + + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ if ((ret = __db_cgetchk(dbc->dbp, key, data, flags, cp->recno != RECNO_OOB)) != 0) return (ret); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + /* Clear OR'd in additional bits so we can check for flag equality. */ + tmp_rmw = 0; + if (LF_ISSET(DB_RMW)) { + if (!F_ISSET(dbp, DB_AM_CDB)) { + tmp_rmw = 1; + F_SET(dbc, DBC_RMW); + } + LF_CLR(DB_RMW); + } + + DEBUG_LREAD(dbc, dbc->txn, "ram_c_get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); /* Initialize the cursor for a new retrieval. */ copy = *cp; retry: /* Update the record number. */ + stack = 0; switch (flags) { case DB_CURRENT: - if (CD_ISSET(dbp, cp)) { - PUTHANDLE(dbp); - return (DB_KEYEMPTY); - } + /* + * If record numbers are mutable: if we just deleted a record, + * there is no action necessary, we return the record following + * the deleted item by virtue of renumbering the tree. + */ break; case DB_NEXT: + /* + * If record numbers are mutable: if we just deleted a record, + * we have to avoid incrementing the record number so that we + * return the right record by virtue of renumbering the tree. + */ if (CD_ISSET(dbp, cp)) break; + if (cp->recno != RECNO_OOB) { ++cp->recno; break; @@ -522,86 +593,133 @@ retry: /* Update the record number. 
*/ break; case DB_PREV: if (cp->recno != RECNO_OOB) { - if (cp->recno == 1) - return (DB_NOTFOUND); + if (cp->recno == 1) { + ret = DB_NOTFOUND; + goto err; + } --cp->recno; break; } /* FALLTHROUGH */ case DB_LAST: flags = DB_PREV; - if (((ret = __ram_snapshot(dbp)) != 0) && ret != DB_NOTFOUND) + if (((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND) goto err; - if ((ret = __bam_nrecs(dbp, &cp->recno)) != 0) + if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0) goto err; - if (cp->recno == 0) - return (DB_NOTFOUND); + if (cp->recno == 0) { + ret = DB_NOTFOUND; + goto err; + } break; case DB_SET: case DB_SET_RANGE: - if ((ret = __ram_getno(dbp, key, &cp->recno, 0)) != 0) + if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0) goto err; break; } - /* - * Return the key if the user didn't give us one, and then pass it - * into __ram_iget(). - */ + /* Return the key if the user didn't give us one. */ if (flags != DB_SET && flags != DB_SET_RANGE && (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno), - &t->bt_rkey.data, &t->bt_rkey.ulen, dbp->db_malloc)) != 0) - return (ret); + &dbc->rkey.data, &dbc->rkey.ulen, dbp->db_malloc)) != 0) + goto err; - /* - * The cursor was reset, so the delete adjustment is no - * longer necessary. - */ - CD_CLR(dbp, cp); + /* Search the tree for the record. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, + F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &exact)) != 0) + goto err; + stack = 1; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + h = cp->csp->page; + indx = cp->csp->indx; /* - * Retrieve the record. - * - * Skip any keys that don't really exist. + * If re-numbering records, the on-page deleted flag means this record + * was implicitly created. If not re-numbering records, the on-page + * deleted flag means this record was implicitly created, or, it was + * deleted at some time. 
Regardless, we skip such records if doing + * cursor next/prev operations, and fail if the application requested + * them explicitly. */ - if ((ret = __ram_iget(dbp, key, data)) != 0) - if (ret == DB_KEYEMPTY && - (flags == DB_NEXT || flags == DB_PREV)) + if (B_DISSET(GET_BKEYDATA(h, indx)->type)) { + if (flags == DB_NEXT || flags == DB_PREV) { + (void)__bam_stkrel(dbc, 0); goto retry; + } + ret = DB_KEYEMPTY; + goto err; + } + + /* Return the data item. */ + if ((ret = __db_ret(dbp, + h, indx, data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) + goto err; + + /* The cursor was reset, no further delete adjustment is necessary. */ + CD_CLR(dbp, cp); + +err: if (stack) + (void)__bam_stkrel(dbc, 0); + + /* Release temporary lock upgrade. */ + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); -err: if (ret != 0) + if (ret != 0) *cp = copy; - PUTHANDLE(dbp); return (ret); } /* * __ram_c_put -- * Recno cursor->c_put function. + * + * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); */ -static int +int __ram_c_put(dbc, key, data, flags) DBC *dbc; DBT *key, *data; u_int32_t flags; { - BTREE *t; - RCURSOR *cp, copy; + CURSOR *cp, copy; DB *dbp; int exact, ret; void *arg; - DEBUG_LWRITE(dbc->dbp, dbc->txn, "ram_c_put", NULL, data, flags); - + dbp = dbc->dbp; cp = dbc->internal; + DB_PANIC_CHECK(dbp); + if ((ret = __db_cputchk(dbc->dbp, key, data, flags, F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0) return (ret); - GETHANDLE(dbc->dbp, dbc->txn, &dbp, ret); - t = dbp->internal; + DEBUG_LWRITE(dbc, dbc->txn, "ram_c_put", NULL, data, flags); + + /* + * If we are running CDB, this had better be either a write + * cursor or an immediate writer. If it's a regular writer, + * that means we have an IWRITE lock and we need to upgrade + * it to a write lock. 
+ */ + if (F_ISSET(dbp, DB_AM_CDB)) { + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } /* Initialize the cursor for a new retrieval. */ copy = *cp; @@ -614,23 +732,23 @@ __ram_c_put(dbc, key, data, flags) */ if (0) { split: arg = &cp->recno; - if ((ret = __bam_split(dbp, arg)) != 0) + if ((ret = __bam_split(dbc, arg)) != 0) goto err; } - if ((ret = __bam_rsearch(dbp, &cp->recno, S_INSERT, 1, &exact)) != 0) + if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) goto err; if (!exact) { ret = DB_NOTFOUND; goto err; } - if ((ret = __bam_iitem(dbp, &t->bt_csp->page, - &t->bt_csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) { - if ((ret = __bam_stkrel(dbp)) != 0) + if ((ret = __bam_iitem(dbc, &cp->csp->page, + &cp->csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) { + if ((ret = __bam_stkrel(dbc, 0)) != 0) goto err; goto split; } - if ((ret = __bam_stkrel(dbp)) != 0) + if ((ret = __bam_stkrel(dbc, 0)) != 0) goto err; switch (flags) { @@ -650,16 +768,16 @@ split: arg = &cp->recno; break; } - /* - * The cursor was reset, so the delete adjustment is no - * longer necessary. - */ + /* The cursor was reset, no further delete adjustment is necessary. */ CD_CLR(dbp, cp); -err: if (ret != 0) +err: if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); + + if (ret != 0) *cp = copy; - PUTHANDLE(dbp); return (ret); } @@ -675,20 +793,22 @@ __ram_ca(dbp, recno, op) db_recno_t recno; ca_recno_arg op; { + CURSOR *cp; DBC *dbc; - RCURSOR *cp; /* * Adjust the cursors. See the comment in __bam_ca_delete(). 
*/ - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (RCURSOR *)dbc->internal; + cp = dbc->internal; switch (op) { case CA_DELETE: if (recno > cp->recno) --cp->recno; + if (recno == cp->recno) + CD_SET(dbp, cp); break; case CA_IAFTER: if (recno > cp->recno) @@ -700,51 +820,27 @@ __ram_ca(dbp, recno, op) break; } } - CURSOR_TEARDOWN(dbp); + DB_THREAD_UNLOCK(dbp); } -#ifdef DEBUG -/* - * __ram_cprint -- - * Display the current recno cursor list. - * - * PUBLIC: int __ram_cprint __P((DB *)); - */ -int -__ram_cprint(dbp) - DB *dbp; -{ - DBC *dbc; - RCURSOR *cp; - - CURSOR_SETUP(dbp); - for (dbc = TAILQ_FIRST(&dbp->curs_queue); - dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { - cp = (RCURSOR *)dbc->internal; - fprintf(stderr, - "%#0x: recno: %lu\n", (u_int)cp, (u_long)cp->recno); - } - CURSOR_TEARDOWN(dbp); - - return (0); -} -#endif /* DEBUG */ - /* * __ram_getno -- * Check the user's record number, and make sure we've seen it. * - * PUBLIC: int __ram_getno __P((DB *, const DBT *, db_recno_t *, int)); + * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int)); */ int -__ram_getno(dbp, key, rep, can_create) - DB *dbp; +__ram_getno(dbc, key, rep, can_create) + DBC *dbc; const DBT *key; db_recno_t *rep; int can_create; { + DB *dbp; db_recno_t recno; + dbp = dbc->dbp; + /* Check the user's record number. */ if ((recno = *(db_recno_t *)key->data) == 0) { __db_err(dbp->dbenv, "illegal record number of 0"); @@ -754,24 +850,11 @@ __ram_getno(dbp, key, rep, can_create) *rep = recno; /* - * Btree can neither create records or read them in. Recno can + * Btree can neither create records nor read them in. Recno can * do both, see if we can find the record. */ return (dbp->type == DB_RECNO ? - __ram_update(dbp, recno, can_create) : 0); -} - -/* - * __ram_snapshot -- - * Read in any remaining records from the backing input file. 
- * - * PUBLIC: int __ram_snapshot __P((DB *)); - */ -int -__ram_snapshot(dbp) - DB *dbp; -{ - return (__ram_update(dbp, DB_MAX_RECORDS, 0)); + __ram_update(dbc, recno, can_create) : 0); } /* @@ -779,18 +862,20 @@ __ram_snapshot(dbp) * Ensure the tree has records up to and including the specified one. */ static int -__ram_update(dbp, recno, can_create) - DB *dbp; +__ram_update(dbc, recno, can_create) + DBC *dbc; db_recno_t recno; int can_create; { BTREE *t; + DB *dbp; RECNO *rp; db_recno_t nrecs; int ret; + dbp = dbc->dbp; t = dbp->internal; - rp = t->bt_recno; + rp = t->recno; /* * If we can't create records and we've read the entire backing input @@ -803,12 +888,12 @@ __ram_update(dbp, recno, can_create) * If we haven't seen this record yet, try to get it from the original * file. */ - if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) return (ret); if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) { - if ((ret = rp->re_irec(dbp, recno)) != 0) + if ((ret = rp->re_irec(dbc, recno)) != 0) return (ret); - if ((ret = __bam_nrecs(dbp, &nrecs)) != 0) + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) return (ret); } @@ -819,28 +904,27 @@ __ram_update(dbp, recno, can_create) if (!can_create || recno <= nrecs + 1) return (0); - t->bt_rdata.dlen = 0; - t->bt_rdata.doff = 0; - t->bt_rdata.flags = 0; + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if (t->bt_rdata.ulen < rp->re_len) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? 
- (void *)__db_malloc(rp->re_len) : - (void *)__db_realloc(t->bt_rdata.data, rp->re_len); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + if (dbc->rdata.ulen < rp->re_len) { + if ((ret = + __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = rp->re_len; + dbc->rdata.ulen = rp->re_len; } - t->bt_rdata.size = rp->re_len; - memset(t->bt_rdata.data, rp->re_pad, rp->re_len); + dbc->rdata.size = rp->re_len; + memset(dbc->rdata.data, rp->re_pad, rp->re_len); } else - t->bt_rdata.size = 0; + dbc->rdata.size = 0; while (recno > ++nrecs) - if ((ret = __ram_add(dbp, - &nrecs, &t->bt_rdata, 0, BI_DELETED)) != 0) + if ((ret = __ram_add(dbc, + &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0) return (ret); return (0); } @@ -859,6 +943,11 @@ __ram_source(dbp, rp, fname) u_int32_t bytes, mbytes, oflags; int ret; + /* + * !!! + * The caller has full responsibility for cleaning up on error -- + * (it has to anyway, in case it fails after this routine succeeds). + */ if ((ret = __db_appname(dbp->dbenv, DB_APP_DATA, NULL, fname, 0, NULL, &rp->re_source)) != 0) return (ret); @@ -867,7 +956,7 @@ __ram_source(dbp, rp, fname) if ((ret = __db_open(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) { __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); - goto err; + return (ret); } /* @@ -878,10 +967,10 @@ __ram_source(dbp, rp, fname) * compiler will perpetrate, doing the comparison in a portable way is * flatly impossible. Hope that mmap fails if the file is too large. 
*/ - if ((ret = __db_ioinfo(rp->re_source, + if ((ret = __os_ioinfo(rp->re_source, rp->re_fd, &mbytes, &bytes, NULL)) != 0) { __db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret)); - goto err; + return (ret); } if (mbytes == 0 && bytes == 0) { F_SET(rp, RECNO_EOF); @@ -891,14 +980,11 @@ __ram_source(dbp, rp, fname) size = mbytes * MEGABYTE + bytes; if ((ret = __db_mapfile(rp->re_source, rp->re_fd, (size_t)size, 1, &rp->re_smap)) != 0) - goto err; + return (ret); rp->re_cmap = rp->re_smap; rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size); rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ? __ram_fmap : __ram_vmap; return (0); - -err: FREES(rp->re_source) - return (ret); } /* @@ -906,17 +992,19 @@ err: FREES(rp->re_source) * Rewrite the backing file. */ static int -__ram_writeback(dbp) - DB *dbp; +__ram_writeback(dbc) + DBC *dbc; { - RECNO *rp; + DB *dbp; DBT key, data; + RECNO *rp; db_recno_t keyno; ssize_t nw; int fd, ret, t_ret; u_int8_t delim, *pad; - rp = ((BTREE *)dbp->internal)->bt_recno; + dbp = dbc->dbp; + rp = ((BTREE *)dbp->internal)->recno; /* If the file wasn't modified, we're done. */ if (!F_ISSET(rp, RECNO_MODIFIED)) @@ -931,7 +1019,7 @@ __ram_writeback(dbp) /* * Read any remaining records into the tree. * - * XXX + * !!! * This is why we can't support transactions when applications specify * backing (re_source) files. At this point we have to read in the * rest of the records from the file so that we can write all of the @@ -946,7 +1034,8 @@ __ram_writeback(dbp) * protecting the backing source file, i.e. mpool would have to know * about it, and we don't want to go there. */ - if ((ret = __ram_snapshot(dbp)) != 0 && ret != DB_NOTFOUND) + if ((ret = + __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND) return (ret); /* @@ -962,7 +1051,7 @@ __ram_writeback(dbp) /* Get rid of any backing file descriptor, just on GP's. 
*/ if (rp->re_fd != -1) { - (void)__db_close(rp->re_fd); + (void)__os_close(rp->re_fd); rp->re_fd = -1; } @@ -990,10 +1079,8 @@ __ram_writeback(dbp) */ delim = rp->re_delim; if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if ((pad = (u_int8_t *)__db_malloc(rp->re_len)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(rp->re_len, NULL, &pad)) != 0) goto err; - } memset(pad, rp->re_pad, rp->re_len); } else COMPQUIET(pad, NULL); @@ -1001,7 +1088,7 @@ __ram_writeback(dbp) switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) { case 0: if ((ret = - __db_write(fd, data.data, data.size, &nw)) != 0) + __os_write(fd, data.data, data.size, &nw)) != 0) goto err; if (nw != (ssize_t)data.size) { ret = EIO; @@ -1011,7 +1098,7 @@ __ram_writeback(dbp) case DB_KEYEMPTY: if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { if ((ret = - __db_write(fd, pad, rp->re_len, &nw)) != 0) + __os_write(fd, pad, rp->re_len, &nw)) != 0) goto err; if (nw != (ssize_t)rp->re_len) { ret = EIO; @@ -1024,7 +1111,7 @@ __ram_writeback(dbp) goto done; } if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if ((ret = __db_write(fd, &delim, 1, &nw)) != 0) + if ((ret = __os_write(fd, &delim, 1, &nw)) != 0) goto err; if (nw != 1) { ret = EIO; @@ -1035,7 +1122,7 @@ __ram_writeback(dbp) err: done: /* Close the file descriptor. */ - if ((t_ret = __db_close(fd)) != 0 || ret == 0) + if ((t_ret = __os_close(fd)) != 0 || ret == 0) ret = t_ret; if (ret == 0) @@ -1048,11 +1135,11 @@ done: /* Close the file descriptor. */ * Get fixed length records from a file. */ static int -__ram_fmap(dbp, top) - DB *dbp; +__ram_fmap(dbc, top) + DBC *dbc; db_recno_t top; { - BTREE *t; + DB *dbp; DBT data; RECNO *rp; db_recno_t recno; @@ -1060,24 +1147,23 @@ __ram_fmap(dbp, top) u_int8_t *sp, *ep, *p; int ret; - if ((ret = __bam_nrecs(dbp, &recno)) != 0) + if ((ret = __bam_nrecs(dbc, &recno)) != 0) return (ret); - t = dbp->internal; - rp = t->bt_recno; - if (t->bt_rdata.ulen < rp->re_len) { - t->bt_rdata.data = t->bt_rdata.data == NULL ? 
- (void *)__db_malloc(rp->re_len) : - (void *)__db_realloc(t->bt_rdata.data, rp->re_len); - if (t->bt_rdata.data == NULL) { - t->bt_rdata.ulen = 0; - return (ENOMEM); + dbp = dbc->dbp; + rp = ((BTREE *)(dbp->internal))->recno; + + if (dbc->rdata.ulen < rp->re_len) { + if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); } - t->bt_rdata.ulen = rp->re_len; + dbc->rdata.ulen = rp->re_len; } memset(&data, 0, sizeof(data)); - data.data = t->bt_rdata.data; + data.data = dbc->rdata.data; data.size = rp->re_len; sp = (u_int8_t *)rp->re_cmap; @@ -1088,7 +1174,7 @@ __ram_fmap(dbp, top) return (DB_NOTFOUND); } len = rp->re_len; - for (p = t->bt_rdata.data; + for (p = dbc->rdata.data; sp < ep && len > 0; *p++ = *sp++, --len) ; @@ -1108,7 +1194,7 @@ __ram_fmap(dbp, top) memset(p, rp->re_pad, len); ++recno; - if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0) return (ret); } ++rp->re_last; @@ -1122,21 +1208,19 @@ __ram_fmap(dbp, top) * Get variable length records from a file. */ static int -__ram_vmap(dbp, top) - DB *dbp; +__ram_vmap(dbc, top) + DBC *dbc; db_recno_t top; { - BTREE *t; DBT data; RECNO *rp; db_recno_t recno; u_int8_t *sp, *ep; int delim, ret; - t = dbp->internal; - rp = t->bt_recno; + rp = ((BTREE *)(dbc->dbp->internal))->recno; - if ((ret = __bam_nrecs(dbp, &recno)) != 0) + if ((ret = __bam_nrecs(dbc, &recno)) != 0) return (ret); memset(&data, 0, sizeof(data)); @@ -1163,7 +1247,7 @@ __ram_vmap(dbp, top) if (rp->re_last >= recno) { data.size = sp - (u_int8_t *)data.data; ++recno; - if ((ret = __ram_add(dbp, &recno, &data, 0, 0)) != 0) + if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0) return (ret); } ++rp->re_last; @@ -1178,40 +1262,47 @@ __ram_vmap(dbp, top) * Add records into the tree. 
*/ static int -__ram_add(dbp, recnop, data, flags, bi_flags) - DB *dbp; +__ram_add(dbc, recnop, data, flags, bi_flags) + DBC *dbc; db_recno_t *recnop; DBT *data; u_int32_t flags, bi_flags; { BKEYDATA *bk; - BTREE *t; + CURSOR *cp; + DB *dbp; PAGE *h; db_indx_t indx; int exact, isdeleted, ret, stack; - t = dbp->internal; + dbp = dbc->dbp; + cp = dbc->internal; retry: /* Find the slot for insertion. */ - if ((ret = __bam_rsearch(dbp, recnop, - S_INSERT | (LF_ISSET(DB_APPEND) ? S_APPEND : 0), 1, &exact)) != 0) + if ((ret = __bam_rsearch(dbc, recnop, + S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0) return (ret); - h = t->bt_csp->page; - indx = t->bt_csp->indx; + h = cp->csp->page; + indx = cp->csp->indx; stack = 1; /* + * If re-numbering records, the on-page deleted flag means this record + * was implicitly created. If not re-numbering records, the on-page + * deleted flag means this record was implicitly created, or, it was + * deleted at some time. + * * If DB_NOOVERWRITE is set and the item already exists in the tree, - * return an error unless the item has been marked for deletion. + * return an error unless the item was either marked for deletion or + * only implicitly created. */ isdeleted = 0; if (exact) { bk = GET_BKEYDATA(h, indx); - if (B_DISSET(bk->type)) { + if (B_DISSET(bk->type)) isdeleted = 1; - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SETUP); - } else - if (LF_ISSET(DB_NOOVERWRITE)) { + else + if (flags == DB_NOOVERWRITE) { ret = DB_KEYEXIST; goto err; } @@ -1224,40 +1315,42 @@ retry: /* Find the slot for insertion. */ * match, we're inserting a new key/data pair, before the search * location. */ - switch (ret = __bam_iitem(dbp, + switch (ret = __bam_iitem(dbc, &h, &indx, NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) { case 0: /* - * Done. Clean up the cursor and adjust the internal page - * counts. + * Don't adjust anything. 
+ * + * If we inserted a record, no cursors need adjusting because + * the only new record it's possible to insert is at the very + * end of the tree. The necessary adjustments to the internal + * page counts were made by __bam_iitem(). + * + * If we overwrote a record, no cursors need adjusting because + * future DBcursor->get calls will simply return the underlying + * record (there's no adjustment made for the DB_CURRENT flag + * when a cursor get operation immediately follows a cursor + * delete operation, and the normal adjustment for the DB_NEXT + * flag is still correct). */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_SUCCESS); break; case DB_NEEDSPLIT: - /* - * We have to split the page. Back out the cursor setup, - * discard the stack of pages, and do the split. - */ - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - - (void)__bam_stkrel(dbp); + /* Discard the stack of pages and split the page. */ + (void)__bam_stkrel(dbc, 0); stack = 0; - if ((ret = __bam_split(dbp, recnop)) != 0) - break; + if ((ret = __bam_split(dbc, recnop)) != 0) + goto err; goto retry; /* NOTREACHED */ default: - if (isdeleted) - __bam_ca_replace(dbp, h->pgno, indx, REPLACE_FAILED); - break; + goto err; } + err: if (stack) - __bam_stkrel(dbp); + __bam_stkrel(dbc, 0); return (ret); } diff --git a/db2/btree/bt_rsearch.c b/db2/btree/bt_rsearch.c index caa6b3515e..8efe4059a8 100644 --- a/db2/btree/bt_rsearch.c +++ b/db2/btree/bt_rsearch.c @@ -44,7 +44,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_rsearch.c 10.15 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_rsearch.c 10.21 (Sleepycat) 12/2/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -59,39 +59,37 @@ static const char sccsid[] = "@(#)bt_rsearch.c 10.15 (Sleepycat) 5/6/98"; * __bam_rsearch -- * Search a btree for a record number. 
* - * PUBLIC: int __bam_rsearch __P((DB *, db_recno_t *, u_int32_t, int, int *)); + * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); */ int -__bam_rsearch(dbp, recnop, flags, stop, exactp) - DB *dbp; +__bam_rsearch(dbc, recnop, flags, stop, exactp) + DBC *dbc; db_recno_t *recnop; u_int32_t flags; int stop, *exactp; { BINTERNAL *bi; - BTREE *t; + CURSOR *cp; + DB *dbp; DB_LOCK lock; PAGE *h; RINTERNAL *ri; db_indx_t indx, top; db_pgno_t pg; db_recno_t i, recno, total; - int isappend, ret, stack; + int ret, stack; - t = dbp->internal; + dbp = dbc->dbp; + cp = dbc->internal; - /* - * We test for groups of flags, S_APPEND is the only one that can be - * OR'd into the set. Clear it now so that the tests for equality - * will work. - */ - if ((isappend = LF_ISSET(S_APPEND)) != 0) - LF_CLR(S_APPEND); + BT_STK_CLR(cp); /* * There are several ways we search a btree tree. The flags argument * specifies if we're acquiring read or write locks and if we are - * locking pairs of pages. See btree.h for more details. + * locking pairs of pages. In addition, if we're adding or deleting + * an item, we have to lock the entire tree, regardless. See btree.h + * for more details. * * If write-locking pages, we need to know whether or not to acquire a * write lock on a page before getting it. This depends on how deep it @@ -102,15 +100,36 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) * Retrieve the root page. */ pg = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, - flags == S_INSERT || flags == S_DELETE ? - DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) + stack = LF_ISSET(S_STACK); + if ((ret = __bam_lget(dbc, + 0, pg, stack ? 
DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); return (ret); } - total = RE_NREC(h); + + /* + * Decide if we need to save this page; if we do, write lock it. + * We deliberately don't lock-couple on this call. If the tree + * is tiny, i.e., one page, and two threads are busily updating + * the root page, we're almost guaranteed deadlocks galore, as + * each one gets a read lock and then blocks the other's attempt + * for a write lock. + */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__BT_LPUT(dbc, lock); + if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); + return (ret); + } + stack = 1; + } /* * If appending to the tree, set the record number now -- we have the @@ -124,7 +143,8 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) * for the record immediately after the last record in the tree, so do * a fast check now. */ - if (isappend) { + total = RE_NREC(h); + if (LF_ISSET(S_APPEND)) { *exactp = 0; *recnop = recno = total + 1; } else { @@ -133,33 +153,14 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) *exactp = 1; else { *exactp = 0; - if (!PAST_END_OK(flags) || recno > total + 1) { + if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) { (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); return (DB_NOTFOUND); } } } - /* Decide if we're building a stack based on the operation. */ - BT_STK_CLR(t); - stack = flags == S_DELETE || flags == S_INSERT; - - /* - * Decide if we need to save this page; if we do, write lock it, and - * start to build a stack. 
- */ - if (LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) { - (void)memp_fput(dbp->mpf, h, 0); - if ((ret = __bam_lget(dbp, 1, pg, DB_LOCK_WRITE, &lock)) != 0) - return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); - return (ret); - } - stack = 1; - } - /* * !!! * Record numbers in the tree are 0-based, but the recno is @@ -177,7 +178,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) * not exist if there are enough deleted records in the * page. */ - if (recno <= NUM_ENT(h)) + if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX) for (i = recno - 1;; --i) { if (B_DISSET(GET_BKEYDATA(h, i * P_INDX + O_INDX)->type)) @@ -185,10 +186,10 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) if (i == 0) break; } - if (recno > NUM_ENT(h)) { + if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) { *exactp = 0; - if (!PAST_END_OK(flags) || - recno > (db_recno_t)(NUM_ENT(h) + 1)) { + if (!LF_ISSET(S_PAST_EOF) || recno > + (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) { ret = DB_NOTFOUND; goto err; } @@ -197,7 +198,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) /* Correct from 1-based to 0-based for a page offset. */ --recno; - BT_STK_ENTER(t, h, recno * P_INDX, lock, ret); + BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret); return (ret); case P_IBTREE: for (indx = 0, top = NUM_ENT(h);;) { @@ -213,7 +214,7 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) /* Correct from 1-based to 0-based for a page offset. */ --recno; - BT_STK_ENTER(t, h, recno, lock, ret); + BT_STK_ENTER(cp, h, recno, lock, ret); return (ret); case P_IRECNO: for (indx = 0, top = NUM_ENT(h);;) { @@ -232,42 +233,42 @@ __bam_rsearch(dbp, recnop, flags, stop, exactp) if (stack) { /* Return if this is the lowest page wanted. 
*/ if (LF_ISSET(S_PARENT) && stop == h->level) { - BT_STK_ENTER(t, h, indx, lock, ret); + BT_STK_ENTER(cp, h, indx, lock, ret); return (ret); } - BT_STK_PUSH(t, h, indx, lock, ret); - if (ret) + BT_STK_PUSH(cp, h, indx, lock, ret); + if (ret != 0) goto err; - if ((ret = __bam_lget(dbp, 0, pg, - LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, - &lock)) != 0) + if ((ret = + __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) goto err; } else { - (void)memp_fput(dbp->mpf, h, 0); - /* * Decide if we want to return a pointer to the next * page in the stack. If we do, write lock it and * never unlock it. */ - if (LF_ISSET(S_PARENT) && - (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) stack = 1; - if ((ret = __bam_lget(dbp, 1, pg, - LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, - &lock)) != 0) + (void)memp_fput(dbp->mpf, h, 0); + + if ((ret = + __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ? + DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) goto err; } - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) goto err; } /* NOTREACHED */ -err: BT_STK_POP(t); - __bam_stkrel(dbp); +err: BT_STK_POP(cp); + __bam_stkrel(dbc, 0); return (ret); } @@ -275,25 +276,29 @@ err: BT_STK_POP(t); * __bam_adjust -- * Adjust the tree after adding or deleting a record. * - * PUBLIC: int __bam_adjust __P((DB *, BTREE *, int32_t)); + * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); */ int -__bam_adjust(dbp, t, adjust) - DB *dbp; - BTREE *t; +__bam_adjust(dbc, adjust) + DBC *dbc; int32_t adjust; { + CURSOR *cp; + DB *dbp; EPG *epg; PAGE *h; int ret; + dbp = dbc->dbp; + cp = dbc->internal; + /* Update the record counts for the tree. 
*/ - for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { + for (epg = cp->sp; epg <= cp->csp; ++epg) { h = epg->page; if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { - if (DB_LOGGING(dbp) && + if (DB_LOGGING(dbc) && (ret = __bam_cadjust_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(h), 0, dbp->log_fileid, + dbc->txn, &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust, 1)) != 0) return (ret); @@ -317,28 +322,31 @@ __bam_adjust(dbp, t, adjust) * __bam_nrecs -- * Return the number of records in the tree. * - * PUBLIC: int __bam_nrecs __P((DB *, db_recno_t *)); + * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); */ int -__bam_nrecs(dbp, rep) - DB *dbp; +__bam_nrecs(dbc, rep) + DBC *dbc; db_recno_t *rep; { + DB *dbp; DB_LOCK lock; PAGE *h; db_pgno_t pgno; int ret; + dbp = dbc->dbp; + pgno = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) return (ret); *rep = RE_NREC(h); (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); return (0); } diff --git a/db2/btree/bt_search.c b/db2/btree/bt_search.c index 09ce46d90a..1f439a4261 100644 --- a/db2/btree/bt_search.c +++ b/db2/btree/bt_search.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_search.c 10.15 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)bt_search.c 10.25 (Sleepycat) 12/16/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -65,38 +65,41 @@ static const char sccsid[] = "@(#)bt_search.c 10.15 (Sleepycat) 5/6/98"; * __bam_search -- * Search a btree for a key. 
* - * PUBLIC: int __bam_search __P((DB *, + * PUBLIC: int __bam_search __P((DBC *, * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); */ int -__bam_search(dbp, key, flags, stop, recnop, exactp) - DB *dbp; +__bam_search(dbc, key, flags, stop, recnop, exactp) + DBC *dbc; const DBT *key; u_int32_t flags; int stop, *exactp; db_recno_t *recnop; { BTREE *t; + CURSOR *cp; + DB *dbp; DB_LOCK lock; - EPG cur; PAGE *h; db_indx_t base, i, indx, lim; db_pgno_t pg; db_recno_t recno; int cmp, jump, ret, stack; + dbp = dbc->dbp; + cp = dbc->internal; t = dbp->internal; recno = 0; - BT_STK_CLR(t); + BT_STK_CLR(cp); /* * There are several ways we search a btree tree. The flags argument * specifies if we're acquiring read or write locks, if we position * to the first or last item in a set of duplicates, if we return - * deleted items, and if we are locking pairs of pages. See btree.h - * for more details. In addition, if we're doing record numbers, we - * have to lock the entire tree regardless. + * deleted items, and if we are locking pairs of pages. In addition, + * if we're modifying record numbers, we have to lock the entire tree + * regardless. See btree.h for more details. * * If write-locking pages, we need to know whether or not to acquire a * write lock on a page before getting it. This depends on how deep it @@ -108,11 +111,11 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) */ pg = PGNO_ROOT; stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK); - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, pg, stack ? 
DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); return (ret); } @@ -128,14 +131,13 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); - if ((ret = __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + (void)__BT_LPUT(dbc, lock); + if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) return (ret); - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); return (ret); } - stack = 1; } @@ -147,12 +149,12 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) * per page item. If we find an exact match on a leaf page, * we're done. */ - cur.page = h; jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX; for (base = 0, lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) { - cur.indx = indx = base + ((lim >> 1) * jump); - if ((cmp = __bam_cmp(dbp, key, &cur)) == 0) { + indx = base + ((lim >> 1) * jump); + if ((cmp = + __bam_cmp(dbp, key, h, indx, t->bt_compare)) == 0) { if (TYPE(h) == P_LBTREE) goto match; goto next; @@ -184,7 +186,7 @@ __bam_search(dbp, key, flags, stop, recnop, exactp) * to find an undeleted record. This is handled in the * __bam_c_search() routine. */ - BT_STK_ENTER(t, h, base, lock, ret); + BT_STK_ENTER(cp, h, base, lock, ret); return (ret); } @@ -208,39 +210,39 @@ next: pg = GET_BINTERNAL(h, indx)->pgno; if (stack) { /* Return if this is the lowest page wanted. 
*/ if (LF_ISSET(S_PARENT) && stop == h->level) { - BT_STK_ENTER(t, h, indx, lock, ret); + BT_STK_ENTER(cp, h, indx, lock, ret); return (ret); } - BT_STK_PUSH(t, h, indx, lock, ret); + BT_STK_PUSH(cp, h, indx, lock, ret); if (ret != 0) goto err; if ((ret = - __bam_lget(dbp, 0, pg, DB_LOCK_WRITE, &lock)) != 0) + __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) goto err; } else { - (void)memp_fput(dbp->mpf, h, 0); - /* - * Decide if we want to return a pointer to the next - * page in the stack. If we do, write lock it and - * never unlock it. + * Decide if we want to return a reference to the next + * page in the return stack. If so, lock it and never + * unlock it. */ if ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || (h->level - 1) == LEAFLEVEL) stack = 1; + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = - __bam_lget(dbp, 1, pg, stack && LF_ISSET(S_WRITE) ? + __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) goto err; } - if ((ret = __bam_pget(dbp, &h, &pg, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) goto err; } - /* NOTREACHED */ + match: *exactp = 1; /* @@ -288,17 +290,17 @@ match: *exactp = 1; goto notfound; } - BT_STK_ENTER(t, h, indx, lock, ret); + BT_STK_ENTER(cp, h, indx, lock, ret); return (ret); notfound: (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); ret = DB_NOTFOUND; -err: if (t->bt_csp > t->bt_sp) { - BT_STK_POP(t); - __bam_stkrel(dbp); +err: if (cp->csp > cp->sp) { + BT_STK_POP(cp); + __bam_stkrel(dbc, 0); } return (ret); } @@ -307,20 +309,35 @@ err: if (t->bt_csp > t->bt_sp) { * __bam_stkrel -- * Release all pages currently held in the stack. 
* - * PUBLIC: int __bam_stkrel __P((DB *)); + * PUBLIC: int __bam_stkrel __P((DBC *, int)); */ int -__bam_stkrel(dbp) - DB *dbp; +__bam_stkrel(dbc, nolocks) + DBC *dbc; + int nolocks; { - BTREE *t; + CURSOR *cp; + DB *dbp; EPG *epg; - t = dbp->internal; - for (epg = t->bt_sp; epg <= t->bt_csp; ++epg) { - (void)memp_fput(dbp->mpf, epg->page, 0); - (void)__BT_TLPUT(dbp, epg->lock); + dbp = dbc->dbp; + cp = dbc->internal; + + /* Release inner pages first. */ + for (epg = cp->sp; epg <= cp->csp; ++epg) { + if (epg->page != NULL) + (void)memp_fput(dbp->mpf, epg->page, 0); + if (epg->lock != LOCK_INVALID) { + if (nolocks) + (void)__BT_LPUT(dbc, epg->lock); + else + (void)__BT_TLPUT(dbc, epg->lock); + } } + + /* Clear the stack, all pages have been released. */ + BT_STK_CLR(cp); + return (0); } @@ -328,24 +345,25 @@ __bam_stkrel(dbp) * __bam_stkgrow -- * Grow the stack. * - * PUBLIC: int __bam_stkgrow __P((BTREE *)); + * PUBLIC: int __bam_stkgrow __P((CURSOR *)); */ int -__bam_stkgrow(t) - BTREE *t; +__bam_stkgrow(cp) + CURSOR *cp; { EPG *p; size_t entries; + int ret; - entries = t->bt_esp - t->bt_sp; + entries = cp->esp - cp->sp; - if ((p = (EPG *)__db_calloc(entries * 2, sizeof(EPG))) == NULL) - return (ENOMEM); - memcpy(p, t->bt_sp, entries * sizeof(EPG)); - if (t->bt_sp != t->bt_stack) - FREE(t->bt_sp, entries * sizeof(EPG)); - t->bt_sp = p; - t->bt_csp = p + entries; - t->bt_esp = p + entries * 2; + if ((ret = __os_calloc(entries * 2, sizeof(EPG), &p)) != 0) + return (ret); + memcpy(p, cp->sp, entries * sizeof(EPG)); + if (cp->sp != cp->stack) + __os_free(cp->sp, entries * sizeof(EPG)); + cp->sp = p; + cp->csp = p + entries; + cp->esp = p + entries * 2; return (0); } diff --git a/db2/btree/bt_split.c b/db2/btree/bt_split.c index da9417c781..1d8e926d85 100644 --- a/db2/btree/bt_split.c +++ b/db2/btree/bt_split.c @@ -44,7 +44,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_split.c 10.23 (Sleepycat) 5/23/98"; +static const char sccsid[] = 
"@(#)bt_split.c 10.33 (Sleepycat) 10/13/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -59,27 +59,31 @@ static const char sccsid[] = "@(#)bt_split.c 10.23 (Sleepycat) 5/23/98"; #include "db_page.h" #include "btree.h" -static int __bam_page __P((DB *, EPG *, EPG *)); -static int __bam_pinsert __P((DB *, EPG *, PAGE *, PAGE *)); -static int __bam_psplit __P((DB *, EPG *, PAGE *, PAGE *, int)); -static int __bam_root __P((DB *, EPG *)); +static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *)); +static int __bam_page __P((DBC *, EPG *, EPG *)); +static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *)); +static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *)); +static int __bam_root __P((DBC *, EPG *)); +static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *)); /* * __bam_split -- * Split a page. * - * PUBLIC: int __bam_split __P((DB *, void *)); + * PUBLIC: int __bam_split __P((DBC *, void *)); */ int -__bam_split(dbp, arg) - DB *dbp; +__bam_split(dbc, arg) + DBC *dbc; void *arg; { - BTREE *t; + CURSOR *cp; + DB *dbp; enum { UP, DOWN } dir; int exact, level, ret; - t = dbp->internal; + dbp = dbc->dbp; + cp = dbc->internal; /* * The locking protocol we use to avoid deadlock to acquire locks by @@ -113,15 +117,16 @@ __bam_split(dbp, arg) * Acquire a page and its parent, locked. */ if ((ret = (dbp->type == DB_BTREE ? - __bam_search(dbp, arg, S_WRPAIR, level, NULL, &exact) : - __bam_rsearch(dbp, + __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) : + __bam_rsearch(dbc, (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) return (ret); /* Split the page. */ - ret = t->bt_csp[0].page->pgno == PGNO_ROOT ? - __bam_root(dbp, &t->bt_csp[0]) : - __bam_page(dbp, &t->bt_csp[-1], &t->bt_csp[0]); + ret = cp->csp[0].page->pgno == PGNO_ROOT ? + __bam_root(dbc, &cp->csp[0]) : + __bam_page(dbc, &cp->csp[-1], &cp->csp[0]); + BT_STK_CLR(cp); switch (ret) { case 0: @@ -155,15 +160,16 @@ __bam_split(dbp, arg) * Split the root page of a btree. 
*/ static int -__bam_root(dbp, cp) - DB *dbp; +__bam_root(dbc, cp) + DBC *dbc; EPG *cp; { - BTREE *t; + DB *dbp; PAGE *lp, *rp; + db_indx_t split; int ret; - t = dbp->internal; + dbp = dbc->dbp; /* Yeah, right. */ if (cp->page->level >= MAXBTREELEVEL) { @@ -173,8 +179,8 @@ __bam_root(dbp, cp) /* Create new left and right pages for the split. */ lp = rp = NULL; - if ((ret = __bam_new(dbp, TYPE(cp->page), &lp)) != 0 || - (ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 || + (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0) goto err; P_INIT(lp, dbp->pgsize, lp->pgno, PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, @@ -184,18 +190,18 @@ __bam_root(dbp, cp) cp->page->level, TYPE(cp->page)); /* Split the page. */ - if ((ret = __bam_psplit(dbp, cp, lp, rp, 1)) != 0) + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) goto err; /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { DBT __a; DB_LSN __lsn; memset(&__a, 0, sizeof(__a)); __a.data = cp->page; __a.size = dbp->pgsize; ZERO_LSN(__lsn); - if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn, &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn, &__a)) != 0) @@ -205,26 +211,27 @@ __bam_root(dbp, cp) /* Clean up the new root page. */ if ((ret = (dbp->type == DB_RECNO ? - __ram_root(dbp, cp->page, lp, rp) : - __bam_broot(dbp, cp->page, lp, rp))) != 0) + __ram_root(dbc, cp->page, lp, rp) : + __bam_broot(dbc, cp->page, lp, rp))) != 0) goto err; + /* Adjust any cursors. Do it last so we don't have to undo it. */ + __bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1); + /* Success -- write the real pages back to the store. 
*/ (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); - ++t->lstat.bt_split; - ++t->lstat.bt_rootsplit; return (0); err: if (lp != NULL) - (void)__bam_free(dbp, lp); + (void)__bam_free(dbc, lp); if (rp != NULL) - (void)__bam_free(dbp, rp); + (void)__bam_free(dbc, rp); (void)memp_fput(dbp->mpf, cp->page, 0); - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); return (ret); } @@ -233,19 +240,22 @@ err: if (lp != NULL) * Split the non-root page of a btree. */ static int -__bam_page(dbp, pp, cp) - DB *dbp; +__bam_page(dbc, pp, cp) + DBC *dbc; EPG *pp, *cp; { + DB *dbp; DB_LOCK tplock; PAGE *lp, *rp, *tp; + db_indx_t split; int ret; + dbp = dbc->dbp; lp = rp = tp = NULL; ret = -1; /* Create new right page for the split. */ - if ((ret = __bam_new(dbp, TYPE(cp->page), &rp)) != 0) + if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0) goto err; P_INIT(rp, dbp->pgsize, rp->pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno, @@ -253,13 +263,8 @@ __bam_page(dbp, pp, cp) cp->page->level, TYPE(cp->page)); /* Create new left page for the split. */ - if ((lp = (PAGE *)__db_malloc(dbp->pgsize)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0) goto err; - } -#ifdef DIAGNOSTIC - memset(lp, 0xff, dbp->pgsize); -#endif P_INIT(lp, dbp->pgsize, cp->page->pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, @@ -276,7 +281,7 @@ __bam_page(dbp, pp, cp) * change, we swap the original and the allocated left page after the * split. */ - if ((ret = __bam_psplit(dbp, cp, lp, rp, 0)) != 0) + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) goto err; /* @@ -293,19 +298,19 @@ __bam_page(dbp, pp, cp) * the page we're splitting. 
*/ if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) { - if ((ret = __bam_lget(dbp, + if ((ret = __bam_lget(dbc, 0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0) goto err; - if ((ret = __bam_pget(dbp, &tp, &rp->next_pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0) goto err; } /* Insert the new pages into the parent page. */ - if ((ret = __bam_pinsert(dbp, pp, lp, rp)) != 0) + if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0) goto err; /* Log the change. */ - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { DBT __a; DB_LSN __lsn; memset(&__a, 0, sizeof(__a)); @@ -313,7 +318,7 @@ __bam_page(dbp, pp, cp) __a.size = dbp->pgsize; if (tp == NULL) ZERO_LSN(__lsn); - if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn, &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page), &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), tp == NULL ? 0 : PGNO(tp), @@ -329,56 +334,69 @@ __bam_page(dbp, pp, cp) memcpy(cp->page, lp, LOFFSET(lp)); memcpy((u_int8_t *)cp->page + HOFFSET(lp), (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); - FREE(lp, dbp->pgsize); + __os_free(lp, dbp->pgsize); lp = NULL; /* Finish the next-page link. */ if (tp != NULL) tp->prev_pgno = rp->pgno; + /* Adjust any cursors. Do so last so we don't have to undo it. */ + __bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0); + /* Success -- write the real pages back to the store. 
*/ (void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, pp->lock); + (void)__BT_TLPUT(dbc, pp->lock); (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, cp->lock); + (void)__BT_TLPUT(dbc, cp->lock); (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); if (tp != NULL) { (void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY); - (void)__BT_TLPUT(dbp, tplock); + (void)__BT_TLPUT(dbc, tplock); } return (0); err: if (lp != NULL) - FREE(lp, dbp->pgsize); + __os_free(lp, dbp->pgsize); if (rp != NULL) - (void)__bam_free(dbp, rp); + (void)__bam_free(dbc, rp); if (tp != NULL) { (void)memp_fput(dbp->mpf, tp, 0); - (void)__BT_TLPUT(dbp, tplock); + if (ret == DB_NEEDSPLIT) + (void)__BT_LPUT(dbc, tplock); + else + (void)__BT_TLPUT(dbc, tplock); } (void)memp_fput(dbp->mpf, pp->page, 0); - (void)__BT_TLPUT(dbp, pp->lock); + if (ret == DB_NEEDSPLIT) + (void)__BT_LPUT(dbc, pp->lock); + else + (void)__BT_TLPUT(dbc, pp->lock); (void)memp_fput(dbp->mpf, cp->page, 0); - (void)__BT_TLPUT(dbp, cp->lock); + if (ret == DB_NEEDSPLIT) + (void)__BT_LPUT(dbc, cp->lock); + else + (void)__BT_TLPUT(dbc, cp->lock); return (ret); } /* * __bam_broot -- * Fix up the btree root page after it has been split. - * - * PUBLIC: int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *)); */ -int -__bam_broot(dbp, rootp, lp, rp) - DB *dbp; +static int +__bam_broot(dbc, rootp, lp, rp) + DBC *dbc; PAGE *rootp, *lp, *rp; { BINTERNAL bi, *child_bi; BKEYDATA *child_bk; + DB *dbp; DBT hdr, data; int ret; + dbp = dbc->dbp; + /* * If the root page was a leaf page, change it into an internal page. 
* We copy the key we split on (but not the key's data, in the case of @@ -405,7 +423,7 @@ __bam_broot(dbp, rootp, lp, rp) hdr.data = &bi; hdr.size = SSZA(BINTERNAL, data); if ((ret = - __db_pitem(dbp, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0) + __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0) return (ret); switch (TYPE(rp)) { @@ -424,13 +442,13 @@ __bam_broot(dbp, rootp, lp, rp) hdr.size = SSZA(BINTERNAL, data); data.data = child_bi->data; data.size = child_bi->len; - if ((ret = __db_pitem(dbp, rootp, 1, + if ((ret = __db_pitem(dbc, rootp, 1, BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bi->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) return (ret); break; @@ -450,7 +468,7 @@ __bam_broot(dbp, rootp, lp, rp) hdr.size = SSZA(BINTERNAL, data); data.data = child_bk->data; data.size = child_bk->len; - if ((ret = __db_pitem(dbp, rootp, 1, + if ((ret = __db_pitem(dbc, rootp, 1, BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0) return (ret); break; @@ -467,13 +485,13 @@ __bam_broot(dbp, rootp, lp, rp) hdr.size = SSZA(BINTERNAL, data); data.data = child_bk; data.size = BOVERFLOW_SIZE; - if ((ret = __db_pitem(dbp, rootp, 1, + if ((ret = __db_pitem(dbc, rootp, 1, BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bk->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) return (ret); break; @@ -490,18 +508,19 @@ __bam_broot(dbp, rootp, lp, rp) /* * __ram_root -- * Fix up the recno root page after it has been split. 
- * - * PUBLIC: int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *)); */ -int -__ram_root(dbp, rootp, lp, rp) - DB *dbp; +static int +__ram_root(dbc, rootp, lp, rp) + DBC *dbc; PAGE *rootp, *lp, *rp; { + DB *dbp; DBT hdr; RINTERNAL ri; int ret; + dbp = dbc->dbp; + /* Initialize the page. */ P_INIT(rootp, dbp->pgsize, PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); @@ -514,12 +533,12 @@ __ram_root(dbp, rootp, lp, rp) /* Insert the left and right keys, set the header information. */ ri.pgno = lp->pgno; ri.nrecs = __bam_total(lp); - if ((ret = __db_pitem(dbp, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) + if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); RE_NREC_SET(rootp, ri.nrecs); ri.pgno = rp->pgno; ri.nrecs = __bam_total(rp); - if ((ret = __db_pitem(dbp, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) + if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); RE_NREC_ADJ(rootp, ri.nrecs); return (0); @@ -530,14 +549,15 @@ __ram_root(dbp, rootp, lp, rp) * Insert a new key into a parent page, completing the split. */ static int -__bam_pinsert(dbp, parent, lchild, rchild) - DB *dbp; +__bam_pinsert(dbc, parent, lchild, rchild) + DBC *dbc; EPG *parent; PAGE *lchild, *rchild; { BINTERNAL bi, *child_bi; BKEYDATA *child_bk, *tmp_bk; BTREE *t; + DB *dbp; DBT a, b, hdr, data; PAGE *ppage; RINTERNAL ri; @@ -546,6 +566,7 @@ __bam_pinsert(dbp, parent, lchild, rchild) u_int32_t n, nbytes, nksize; int ret; + dbp = dbc->dbp; t = dbp->internal; ppage = parent->page; @@ -600,13 +621,13 @@ __bam_pinsert(dbp, parent, lchild, rchild) memset(&data, 0, sizeof(data)); data.data = child_bi->data; data.size = child_bi->len; - if ((ret = __db_pitem(dbp, ppage, off, + if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. 
*/ if (B_TYPE(child_bi->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) return (ret); break; @@ -630,10 +651,9 @@ __bam_pinsert(dbp, parent, lchild, rchild) b.size = child_bk->len; b.data = child_bk->data; nksize = t->bt_prefix(&a, &b); - if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) { - t->lstat.bt_pfxsaved += nbytes - n; + if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) nbytes = n; - } else + else noprefix: nksize = child_bk->len; if (P_FREESPACE(ppage) < nbytes) @@ -650,7 +670,7 @@ noprefix: nksize = child_bk->len; memset(&data, 0, sizeof(data)); data.data = child_bk->data; data.size = nksize; - if ((ret = __db_pitem(dbp, ppage, off, + if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) return (ret); break; @@ -672,13 +692,13 @@ noprefix: nksize = child_bk->len; memset(&data, 0, sizeof(data)); data.data = child_bk; data.size = BOVERFLOW_SIZE; - if ((ret = __db_pitem(dbp, ppage, off, + if ((ret = __db_pitem(dbc, ppage, off, BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) return (ret); /* Increment the overflow ref count. */ if (B_TYPE(child_bk->type) == B_OVERFLOW) - if ((ret = __db_ovref(dbp, + if ((ret = __db_ovref(dbc, ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) return (ret); break; @@ -699,7 +719,7 @@ noprefix: nksize = child_bk->len; hdr.size = RINTERNAL_SIZE; ri.pgno = rchild->pgno; ri.nrecs = nrecs; - if ((ret = __db_pitem(dbp, + if ((ret = __db_pitem(dbc, ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); break; @@ -710,9 +730,9 @@ noprefix: nksize = child_bk->len; /* Adjust the parent page's left page record count. */ if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { /* Log the change. 
*/ - if (DB_LOGGING(dbp) && + if (DB_LOGGING(dbc) && (ret = __bam_cadjust_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(ppage), 0, dbp->log_fileid, + dbc->txn, &LSN(ppage), 0, dbp->log_fileid, PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx, -(int32_t)nrecs, (int32_t)0)) != 0) return (ret); @@ -732,18 +752,18 @@ noprefix: nksize = child_bk->len; * Do the real work of splitting the page. */ static int -__bam_psplit(dbp, cp, lp, rp, cleft) - DB *dbp; +__bam_psplit(dbc, cp, lp, rp, splitret) + DBC *dbc; EPG *cp; PAGE *lp, *rp; - int cleft; + db_indx_t *splitret; { - BTREE *t; + DB *dbp; PAGE *pp; db_indx_t half, nbytes, off, splitp, top; int adjust, cnt, isbigkey, ret; - t = dbp->internal; + dbp = dbc->dbp; pp = cp->page; adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; @@ -762,11 +782,8 @@ __bam_psplit(dbp, cp, lp, rp, cleft) else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) off = adjust; - ++t->lstat.bt_split; - if (off != 0) { - ++t->lstat.bt_fastsplit; + if (off != 0) goto sort; - } /* * Split the data to the left and right pages. Try not to split on @@ -887,8 +904,7 @@ sort: splitp = off; if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) return (ret); - /* Adjust the cursors. 
*/ - __bam_ca_split(dbp, pp->pgno, lp->pgno, rp->pgno, splitp, cleft); + *splitret = splitp; return (0); } diff --git a/db2/btree/bt_stat.c b/db2/btree/bt_stat.c index 2236434b38..855ef40bbd 100644 --- a/db2/btree/bt_stat.c +++ b/db2/btree/bt_stat.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)bt_stat.c 10.17 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)bt_stat.c 10.27 (Sleepycat) 11/25/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -22,8 +22,6 @@ static const char sccsid[] = "@(#)bt_stat.c 10.17 (Sleepycat) 4/26/98"; #include "db_page.h" #include "btree.h" -static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *)); - /* * __bam_stat -- * Gather/print the btree statistics @@ -31,62 +29,62 @@ static void __bam_add_rstat __P((DB_BTREE_LSTAT *, DB_BTREE_STAT *)); * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); */ int -__bam_stat(argdbp, spp, db_malloc, flags) - DB *argdbp; +__bam_stat(dbp, spp, db_malloc, flags) + DB *dbp; void *spp; void *(*db_malloc) __P((size_t)); u_int32_t flags; { BTMETA *meta; BTREE *t; - DB *dbp; + DBC *dbc; DB_BTREE_STAT *sp; DB_LOCK lock; PAGE *h; db_pgno_t lastpgno, pgno; - int ret; + int ret, t_ret; - DEBUG_LWRITE(argdbp, NULL, "bam_stat", NULL, NULL, flags); + DB_PANIC_CHECK(dbp); /* Check for invalid flags. */ - if ((ret = __db_statchk(argdbp, flags)) != 0) + if ((ret = __db_statchk(dbp, flags)) != 0) return (ret); - if (spp == NULL) - return (0); + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, NULL, "bam_stat", NULL, NULL, flags); - GETHANDLE(argdbp, NULL, &dbp, ret); t = dbp->internal; + if (spp == NULL) + return (0); + /* Allocate and clear the structure. */ - if ((sp = db_malloc == NULL ? 
- (DB_BTREE_STAT *)__db_malloc(sizeof(*sp)) : - (DB_BTREE_STAT *)db_malloc(sizeof(*sp))) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(sizeof(*sp), db_malloc, &sp)) != 0) goto err; - } memset(sp, 0, sizeof(*sp)); /* If the app just wants the record count, make it fast. */ - if (LF_ISSET(DB_RECORDCOUNT)) { + if (flags == DB_RECORDCOUNT) { pgno = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&h, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&h)) != 0) goto err; sp->bt_nrecs = RE_NREC(h); (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); goto done; } /* Get the meta-data page. */ pgno = PGNO_METADATA; - if ((ret = __bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, (PAGE **)&meta, &pgno, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) goto err; /* Translate the metadata flags. */ @@ -110,24 +108,13 @@ __bam_stat(argdbp, spp, db_malloc, flags) /* Get the page size from the DB. */ sp->bt_pagesize = dbp->pgsize; - /* Initialize counters with the meta-data page information. */ - __bam_add_rstat(&meta->stat, sp); - - /* - * Add in the local information from this handle. - * - * !!! - * This is a bit odd, but it gets us closer to the truth. - */ - __bam_add_rstat(&t->lstat, sp); - /* Walk the free list, counting pages. */ for (sp->bt_free = 0, pgno = meta->free; pgno != PGNO_INVALID;) { ++sp->bt_free; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { (void)memp_fput(dbp->mpf, meta, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); goto err; } pgno = h->next_pgno; @@ -136,7 +123,7 @@ __bam_stat(argdbp, spp, db_malloc, flags) /* Discard the meta-data page. 
*/ (void)memp_fput(dbp->mpf, meta, 0); - (void)__BT_TLPUT(dbp, lock); + (void)__BT_TLPUT(dbc, lock); /* Determine the last page of the database. */ if ((ret = memp_fget(dbp->mpf, &lastpgno, DB_MPOOL_LAST, &h)) != 0) @@ -145,10 +132,10 @@ __bam_stat(argdbp, spp, db_malloc, flags) /* Get the root page. */ pgno = PGNO_ROOT; - if ((ret = __bam_lget(dbp, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0) + if ((ret = __bam_lget(dbc, 0, PGNO_ROOT, DB_LOCK_READ, &lock)) != 0) goto err; - if ((ret = __bam_pget(dbp, &h, &pgno, 0)) != 0) { - (void)__BT_LPUT(dbp, lock); + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + (void)__BT_LPUT(dbc, lock); goto err; } @@ -185,19 +172,19 @@ __bam_stat(argdbp, spp, db_malloc, flags) break; default: (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); return (__db_pgfmt(dbp, pgno)); } (void)memp_fput(dbp->mpf, h, 0); - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); if (++pgno > lastpgno) break; - if (__bam_lget(dbp, 0, pgno, DB_LOCK_READ, &lock)) + if (__bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) break; if (memp_fget(dbp->mpf, &pgno, 0, &h) != 0) { - (void)__BT_LPUT(dbp, lock); + (void)__BT_LPUT(dbc, lock); break; } } @@ -205,50 +192,7 @@ __bam_stat(argdbp, spp, db_malloc, flags) done: *(DB_BTREE_STAT **)spp = sp; ret = 0; -err: PUTHANDLE(dbp); +err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; return (ret); } - -/* - * __bam_add_mstat -- - * Add the local statistics to the meta-data page statistics. 
- * - * PUBLIC: void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *)); - */ -void -__bam_add_mstat(from, to) - DB_BTREE_LSTAT *from; - DB_BTREE_LSTAT *to; -{ - to->bt_freed += from->bt_freed; - to->bt_pfxsaved += from->bt_pfxsaved; - to->bt_split += from->bt_split; - to->bt_rootsplit += from->bt_rootsplit; - to->bt_fastsplit += from->bt_fastsplit; - to->bt_added += from->bt_added; - to->bt_deleted += from->bt_deleted; - to->bt_get += from->bt_get; - to->bt_cache_hit += from->bt_cache_hit; - to->bt_cache_miss += from->bt_cache_miss; -} - -/* - * __bam_add_rstat -- - * Add the local statistics to the returned statistics. - */ -static void -__bam_add_rstat(from, to) - DB_BTREE_LSTAT *from; - DB_BTREE_STAT *to; -{ - to->bt_freed += from->bt_freed; - to->bt_pfxsaved += from->bt_pfxsaved; - to->bt_split += from->bt_split; - to->bt_rootsplit += from->bt_rootsplit; - to->bt_fastsplit += from->bt_fastsplit; - to->bt_added += from->bt_added; - to->bt_deleted += from->bt_deleted; - to->bt_get += from->bt_get; - to->bt_cache_hit += from->bt_cache_hit; - to->bt_cache_miss += from->bt_cache_miss; -} diff --git a/db2/btree/btree_auto.c b/db2/btree/btree_auto.c index 75eadb1d62..95ea76e2cd 100644 --- a/db2/btree/btree_auto.c +++ b/db2/btree/btree_auto.c @@ -10,7 +10,6 @@ #endif #include "db_int.h" -#include "shqueue.h" #include "db_page.h" #include "db_dispatch.h" #include "btree.h" @@ -43,8 +42,7 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_pg_alloc; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -55,8 +53,8 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(ptype) + sizeof(next); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -90,7 +88,7 @@ int __bam_pg_alloc_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -135,7 +133,7 @@ __bam_pg_alloc_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tptype: %lu\n", (u_long)argp->ptype); printf("\tnext: %lu\n", (u_long)argp->next); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -149,11 +147,12 @@ __bam_pg_alloc_read(recbuf, argpp) { __bam_pg_alloc_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_pg_alloc_args *)__db_malloc(sizeof(__bam_pg_alloc_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_pg_alloc_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -206,8 +205,7 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_pg_free; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -217,8 +215,8 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, + sizeof(*meta_lsn) + sizeof(u_int32_t) + (header == NULL ? 
0 : header->size) + sizeof(next); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -255,7 +253,7 @@ int __bam_pg_free_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -306,7 +304,7 @@ __bam_pg_free_print(notused1, dbtp, lsnp, notused2, notused3) printf("\n"); printf("\tnext: %lu\n", (u_long)argp->next); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -320,11 +318,12 @@ __bam_pg_free_read(recbuf, argpp) { __bam_pg_free_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_pg_free_args *)__db_malloc(sizeof(__bam_pg_free_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_pg_free_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -383,8 +382,7 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_split; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -398,8 +396,8 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, + sizeof(npgno) + sizeof(*nlsn) + sizeof(u_int32_t) + (pg == NULL ? 
0 : pg->size); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -450,7 +448,7 @@ int __bam_split_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -507,7 +505,7 @@ __bam_split_print(notused1, dbtp, lsnp, notused2, notused3) } printf("\n"); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -521,11 +519,12 @@ __bam_split_read(recbuf, argpp) { __bam_split_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_split_args *)__db_malloc(sizeof(__bam_split_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_split_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -587,8 +586,7 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_rsplit; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -599,8 +597,8 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, + sizeof(nrec) + sizeof(u_int32_t) + (rootent == NULL ? 
0 : rootent->size) + sizeof(*rootlsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -647,7 +645,7 @@ int __bam_rsplit_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -707,7 +705,7 @@ __bam_rsplit_print(notused1, dbtp, lsnp, notused2, notused3) printf("\trootlsn: [%lu][%lu]\n", (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -721,11 +719,12 @@ __bam_rsplit_read(recbuf, argpp) { __bam_rsplit_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_rsplit_args *)__db_malloc(sizeof(__bam_rsplit_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_rsplit_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -782,8 +781,7 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_adj; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -794,8 +792,8 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, + sizeof(indx) + sizeof(indx_copy) + sizeof(is_insert); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -826,7 +824,7 @@ int __bam_adj_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -870,7 +868,7 @@ __bam_adj_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); printf("\tis_insert: %lu\n", (u_long)argp->is_insert); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -884,11 +882,12 @@ __bam_adj_read(recbuf, argpp) { __bam_adj_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_adj_args *)__db_malloc(sizeof(__bam_adj_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_adj_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -941,8 +940,7 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_cadjust; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -953,8 +951,8 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, + sizeof(indx) + sizeof(adjust) + sizeof(total); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -985,7 +983,7 @@ int __bam_cadjust_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1029,7 +1027,7 @@ __bam_cadjust_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tadjust: %ld\n", (long)argp->adjust); printf("\ttotal: %ld\n", (long)argp->total); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1043,11 +1041,12 @@ __bam_cadjust_read(recbuf, argpp) { __bam_cadjust_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_cadjust_args *)__db_malloc(sizeof(__bam_cadjust_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_cadjust_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1097,8 +1096,7 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_cdel; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1107,8 +1105,8 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(*lsn) + sizeof(indx); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1135,7 +1133,7 @@ int __bam_cdel_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1177,7 +1175,7 @@ __bam_cdel_print(notused1, dbtp, lsnp, notused2, notused3) (u_long)argp->lsn.file, (u_long)argp->lsn.offset); printf("\tindx: %lu\n", (u_long)argp->indx); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1191,11 +1189,12 @@ __bam_cdel_read(recbuf, argpp) { __bam_cdel_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_cdel_args *)__db_malloc(sizeof(__bam_cdel_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_cdel_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1250,8 +1249,7 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, rectype = DB_bam_repl; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1265,8 +1263,8 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, + sizeof(u_int32_t) + (repl == NULL ? 
0 : repl->size) + sizeof(prefix) + sizeof(suffix); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1319,7 +1317,7 @@ int __bam_repl_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1382,7 +1380,7 @@ __bam_repl_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tprefix: %lu\n", (u_long)argp->prefix); printf("\tsuffix: %lu\n", (u_long)argp->suffix); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1396,11 +1394,12 @@ __bam_repl_read(recbuf, argpp) { __bam_repl_args *argp; u_int8_t *bp; + int ret; - argp = (__bam_repl_args *)__db_malloc(sizeof(__bam_repl_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__bam_repl_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); diff --git a/db2/common/db_appinit.c b/db2/common/db_appinit.c index 6ec007be0a..e02b1a872d 100644 --- a/db2/common/db_appinit.c +++ b/db2/common/db_appinit.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_appinit.c 10.52 (Sleepycat) 6/2/98"; +static const char sccsid[] = "@(#)db_appinit.c 10.66 (Sleepycat) 12/7/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,7 +16,6 @@ static const char sccsid[] = "@(#)db_appinit.c 10.52 (Sleepycat) 6/2/98"; #include <ctype.h> #include <errno.h> -#include <signal.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -34,10 +33,22 @@ static const char sccsid[] = "@(#)db_appinit.c 10.52 (Sleepycat) 6/2/98"; static int __db_home __P((DB_ENV *, const char *, u_int32_t)); static int __db_parse 
__P((DB_ENV *, char *)); -static int __db_tmp_dir __P((DB_ENV *, u_int32_t)); static int __db_tmp_open __P((DB_ENV *, u_int32_t, char *, int *)); /* + * This conflict array is used for concurrent db access (cdb). It + * uses the same locks as the db_rw_conflict array, but adds an IW + * mode to be used for write cursors. + */ +static u_int8_t const db_cdb_conflicts[] = { + /* N R W IW */ + /* N */ 0, 0, 0, 0, + /* R */ 0, 0, 1, 0, + /* W */ 0, 1, 1, 1, + /* IW */ 0, 0, 1, 1 +}; + +/* * db_version -- * Return version information. */ @@ -70,21 +81,24 @@ db_appinit(db_home, db_config, dbenv, flags) char * const *p; char *lp, buf[MAXPATHLEN * 2]; + fp = NULL; + /* Validate arguments. */ if (dbenv == NULL) return (EINVAL); - #ifdef HAVE_SPINLOCKS #define OKFLAGS \ - (DB_CREATE | DB_NOMMAP | DB_THREAD | DB_INIT_LOCK | DB_INIT_LOG | \ - DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_RECOVER | \ - DB_RECOVER_FATAL | DB_TXN_NOSYNC | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) + (DB_CREATE | DB_INIT_CDB | DB_INIT_LOCK | DB_INIT_LOG | \ + DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_NOMMAP | \ + DB_RECOVER | DB_RECOVER_FATAL | DB_THREAD | DB_TXN_NOSYNC | \ + DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) #else #define OKFLAGS \ - (DB_CREATE | DB_NOMMAP | DB_INIT_LOCK | DB_INIT_LOG | \ - DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_RECOVER | \ - DB_RECOVER_FATAL | DB_TXN_NOSYNC | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) + (DB_CREATE | DB_INIT_CDB | DB_INIT_LOCK | DB_INIT_LOG | \ + DB_INIT_MPOOL | DB_INIT_TXN | DB_MPOOL_PRIVATE | DB_NOMMAP | \ + DB_RECOVER | DB_RECOVER_FATAL | DB_TXN_NOSYNC | \ + DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) #endif if ((ret = __db_fchk(dbenv, "db_appinit", flags, OKFLAGS)) != 0) return (ret); @@ -97,8 +111,6 @@ db_appinit(db_home, db_config, dbenv, flags) if (LF_ISSET(DB_THREAD)) F_SET(dbenv, DB_ENV_THREAD); - fp = NULL; - /* Set the database home. 
*/ if ((ret = __db_home(dbenv, db_home, flags)) != 0) goto err; @@ -127,8 +139,17 @@ db_appinit(db_home, db_config, dbenv, flags) (void)strcat(buf, CONFIG_NAME); if ((fp = fopen(buf, "r")) != NULL) { while (fgets(buf, sizeof(buf), fp) != NULL) { - if ((lp = strchr(buf, '\n')) != NULL) - *lp = '\0'; + if ((lp = strchr(buf, '\n')) == NULL) { + __db_err(dbenv, + "%s: line too long", CONFIG_NAME); + ret = EINVAL; + goto err; + } + *lp = '\0'; + if (buf[0] == '\0' || + buf[0] == '#' || isspace(buf[0])) + continue; + if ((ret = __db_parse(dbenv, buf)) != 0) goto err; } @@ -138,11 +159,14 @@ db_appinit(db_home, db_config, dbenv, flags) } /* Set up the tmp directory path. */ - if (dbenv->db_tmp_dir == NULL && - (ret = __db_tmp_dir(dbenv, flags)) != 0) + if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(dbenv, flags)) != 0) goto err; - /* Indicate that the path names have been set. */ + /* + * Flag that the structure has been initialized by the application. + * Note, this must be set before calling into the subsystems as it + * is used when we're doing file naming. + */ F_SET(dbenv, DB_ENV_APPINIT); /* @@ -166,6 +190,18 @@ db_appinit(db_home, db_config, dbenv, flags) * Default permissions are read-write for both owner and group. 
*/ mode = __db_omode("rwrw--"); + if (LF_ISSET(DB_INIT_CDB)) { + if (LF_ISSET(DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN)) { + ret = EINVAL; + goto err; + } + F_SET(dbenv, DB_ENV_CDB); + dbenv->lk_conflicts = db_cdb_conflicts; + dbenv->lk_modes = DB_LOCK_RW_N + 1; + if ((ret = lock_open(NULL, LF_ISSET(DB_CREATE | DB_THREAD), + mode, dbenv, &dbenv->lk_info)) != 0) + goto err; + } if (LF_ISSET(DB_INIT_LOCK) && (ret = lock_open(NULL, LF_ISSET(DB_CREATE | DB_THREAD), mode, dbenv, &dbenv->lk_info)) != 0) @@ -232,28 +268,32 @@ db_appexit(dbenv) if (dbenv->tx_info && (t_ret = txn_close(dbenv->tx_info)) != 0) if (ret == 0) ret = t_ret; - if (dbenv->mp_info && (t_ret = memp_close(dbenv->mp_info)) != 0) + if (dbenv->lg_info && (t_ret = log_close(dbenv->lg_info)) != 0) if (ret == 0) ret = t_ret; - if (dbenv->lg_info && (t_ret = log_close(dbenv->lg_info)) != 0) + if (dbenv->mp_info && (t_ret = memp_close(dbenv->mp_info)) != 0) if (ret == 0) ret = t_ret; if (dbenv->lk_info && (t_ret = lock_close(dbenv->lk_info)) != 0) if (ret == 0) ret = t_ret; + /* Clear initialized flag (after subsystems, it affects naming). */ + F_CLR(dbenv, DB_ENV_APPINIT); + /* Free allocated memory. */ if (dbenv->db_home != NULL) - FREES(dbenv->db_home); + __os_freestr(dbenv->db_home); if ((p = dbenv->db_data_dir) != NULL) { for (; *p != NULL; ++p) - FREES(*p); - FREE(dbenv->db_data_dir, dbenv->data_cnt * sizeof(char **)); + __os_freestr(*p); + __os_free(dbenv->db_data_dir, + dbenv->data_cnt * sizeof(char **)); } if (dbenv->db_log_dir != NULL) - FREES(dbenv->db_log_dir); + __os_freestr(dbenv->db_log_dir); if (dbenv->db_tmp_dir != NULL) - FREES(dbenv->db_tmp_dir); + __os_freestr(dbenv->db_tmp_dir); return (ret); } @@ -261,7 +301,7 @@ db_appexit(dbenv) #define DB_ADDSTR(str) { \ if ((str) != NULL) { \ /* If leading slash, start over. 
*/ \ - if (__db_abspath(str)) { \ + if (__os_abspath(str)) { \ p = start; \ slash = 0; \ } \ @@ -317,10 +357,9 @@ __db_appname(dbenv, appname, dir, file, tmp_oflags, fdp, namep) * path, we're done. If the directory is, simply append the file and * return. */ - if (file != NULL && __db_abspath(file)) - return ((*namep = - (char *)__db_strdup(file)) == NULL ? ENOMEM : 0); - if (dir != NULL && __db_abspath(dir)) { + if (file != NULL && __os_abspath(file)) + return (__os_strdup(file, namep)); + if (dir != NULL && __os_abspath(dir)) { a = dir; goto done; } @@ -417,7 +456,7 @@ retry: switch (appname) { if (0) { tmp: if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_APPINIT)) { memset(&etmp, 0, sizeof(etmp)); - if ((ret = __db_tmp_dir(&etmp, DB_USE_ENVIRON)) != 0) + if ((ret = __os_tmpdir(&etmp, DB_USE_ENVIRON)) != 0) return (ret); tmp_free = 1; a = etmp.db_tmp_dir; @@ -437,12 +476,11 @@ done: len = * name. */ #define DB_TRAIL "XXXXXX" - if ((start = - (char *)__db_malloc(len + sizeof(DB_TRAIL) + 10)) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); + if ((ret = + __os_malloc(len + sizeof(DB_TRAIL) + 10, NULL, &start)) != 0) { if (tmp_free) - FREES(etmp.db_tmp_dir); - return (ENOMEM); + __os_freestr(etmp.db_tmp_dir); + return (ret); } slash = 0; @@ -452,28 +490,32 @@ done: len = DB_ADDSTR(file); *p = '\0'; + /* Discard any space allocated to find the temp directory. */ + if (tmp_free) { + __os_freestr(etmp.db_tmp_dir); + tmp_free = 0; + } + /* * If we're opening a data file, see if it exists. If it does, * return it, otherwise, try and find another one to open. */ - if (data_entry != -1 && __db_exists(start, NULL) != 0) { - FREES(start); + if (data_entry != -1 && __os_exists(start, NULL) != 0) { + __os_freestr(start); a = b = c = NULL; goto retry; } - /* Discard any space allocated to find the temp directory. */ - if (tmp_free) - FREES(etmp.db_tmp_dir); - /* Create the file if so requested. 
*/ if (tmp_create && (ret = __db_tmp_open(dbenv, tmp_oflags, start, fdp)) != 0) { - FREES(start); + __os_freestr(start); return (ret); } - if (namep != NULL) + if (namep == NULL) + __os_freestr(start); + else *namep = start; return (0); } @@ -511,11 +553,7 @@ __db_home(dbenv, db_home, flags) if (p == NULL) return (0); - if ((dbenv->db_home = (char *)__db_strdup(p)) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - return (ENOMEM); - } - return (0); + return (__os_strdup(p, &dbenv->db_home)); } /* @@ -530,152 +568,73 @@ __db_parse(dbenv, s) int ret; char *local_s, *name, *value, **p, *tp; - ret = 0; - /* * We need to strdup the argument in case the caller passed us * static data. */ - if ((local_s = (char *)__db_strdup(s)) == NULL) - return (ENOMEM); + if ((ret = __os_strdup(s, &local_s)) != 0) + return (ret); - tp = local_s; - while ((name = strsep(&tp, " \t")) != NULL && *name == '\0') + /* + * Name/value pairs are parsed as two white-space separated strings. + * Leading and trailing white-space is trimmed from the value, but + * it may contain embedded white-space. Note: we use the isspace(3) + * macro because it's more portable, but that means that you can use + * characters like form-feed to separate the strings. + */ + name = local_s; + for (tp = name; *tp != '\0' && !isspace(*tp); ++tp) + ; + if (*tp == '\0' || tp == name) + goto illegal; + *tp = '\0'; + for (++tp; isspace(*tp); ++tp) ; - if (name == NULL) + if (*tp == '\0') goto illegal; - while ((value = strsep(&tp, " \t")) != NULL && *value == '\0') + value = tp; + for (++tp; *tp != '\0'; ++tp) + ; + for (--tp; isspace(*tp); --tp) ; - if (value == NULL) { + if (tp == value) { illegal: ret = EINVAL; __db_err(dbenv, "illegal name-value pair: %s", s); goto err; } + *++tp = '\0'; #define DATA_INIT_CNT 20 /* Start with 20 data slots. 
*/ if (!strcmp(name, "DB_DATA_DIR")) { if (dbenv->db_data_dir == NULL) { - if ((dbenv->db_data_dir = - (char **)__db_calloc(DATA_INIT_CNT, - sizeof(char **))) == NULL) - goto nomem; + if ((ret = __os_calloc(DATA_INIT_CNT, + sizeof(char **), &dbenv->db_data_dir)) != 0) + goto err; dbenv->data_cnt = DATA_INIT_CNT; } else if (dbenv->data_next == dbenv->data_cnt - 1) { dbenv->data_cnt *= 2; - if ((dbenv->db_data_dir = - (char **)__db_realloc(dbenv->db_data_dir, - dbenv->data_cnt * sizeof(char **))) == NULL) - goto nomem; + if ((ret = __os_realloc(&dbenv->db_data_dir, + dbenv->data_cnt * sizeof(char **))) != 0) + goto err; } p = &dbenv->db_data_dir[dbenv->data_next++]; } else if (!strcmp(name, "DB_LOG_DIR")) { if (dbenv->db_log_dir != NULL) - FREES(dbenv->db_log_dir); + __os_freestr(dbenv->db_log_dir); p = &dbenv->db_log_dir; } else if (!strcmp(name, "DB_TMP_DIR")) { if (dbenv->db_tmp_dir != NULL) - FREES(dbenv->db_tmp_dir); + __os_freestr(dbenv->db_tmp_dir); p = &dbenv->db_tmp_dir; } else goto err; - if ((*p = (char *)__db_strdup(value)) == NULL) { -nomem: ret = ENOMEM; - __db_err(dbenv, "%s", strerror(ENOMEM)); - } + ret = __os_strdup(value, p); -err: FREES(local_s); +err: __os_freestr(local_s); return (ret); } -#ifdef macintosh -#include <TFileSpec.h> - -static char *sTempFolder; -#endif - -/* - * tmp -- - * Set the temporary directory path. - */ -static int -__db_tmp_dir(dbenv, flags) - DB_ENV *dbenv; - u_int32_t flags; -{ - static const char * list[] = { /* Ordered: see db_appinit(3). */ - "/var/tmp", - "/usr/tmp", - "/temp", /* WIN32. */ - "/tmp", - "C:/temp", /* WIN32. */ - "C:/tmp", /* WIN32. */ - NULL - }; - const char **lp, *p; - - /* Use the environment if it's permitted and initialized. 
*/ - p = NULL; -#ifdef HAVE_GETEUID - if (LF_ISSET(DB_USE_ENVIRON) || - (LF_ISSET(DB_USE_ENVIRON_ROOT) && getuid() == 0)) { -#else - if (LF_ISSET(DB_USE_ENVIRON)) { -#endif - if ((p = getenv("TMPDIR")) != NULL && p[0] == '\0') { - __db_err(dbenv, "illegal TMPDIR environment variable"); - return (EINVAL); - } - /* WIN32 */ - if (p == NULL && (p = getenv("TEMP")) != NULL && p[0] == '\0') { - __db_err(dbenv, "illegal TEMP environment variable"); - return (EINVAL); - } - /* WIN32 */ - if (p == NULL && (p = getenv("TMP")) != NULL && p[0] == '\0') { - __db_err(dbenv, "illegal TMP environment variable"); - return (EINVAL); - } - /* Macintosh */ - if (p == NULL && - (p = getenv("TempFolder")) != NULL && p[0] == '\0') { - __db_err(dbenv, - "illegal TempFolder environment variable"); - return (EINVAL); - } - } - -#ifdef macintosh - /* Get the path to the temporary folder. */ - if (p == NULL) { - FSSpec spec; - - if (!Special2FSSpec(kTemporaryFolderType, - kOnSystemDisk, 0, &spec)) { - p = FSp2FullPath(&spec); - sTempFolder = __db_malloc(strlen(p) + 1); - strcpy(sTempFolder, p); - p = sTempFolder; - } - } -#endif - - /* Step through the list looking for a possibility. */ - if (p == NULL) - for (lp = list; *lp != NULL; ++lp) - if (__db_exists(p = *lp, NULL) == 0) - break; - - if (p == NULL) - return (0); - - if ((dbenv->db_tmp_dir = (char *)__db_strdup(p)) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - return (ENOMEM); - } - return (0); -} - /* * __db_tmp_open -- * Create a temporary file. @@ -687,9 +646,6 @@ __db_tmp_open(dbenv, flags, path, fdp) char *path; int *fdp; { -#ifdef HAVE_SIGFILLSET - sigset_t set, oset; -#endif u_long pid; int mode, isdir, ret; const char *p; @@ -699,7 +655,7 @@ __db_tmp_open(dbenv, flags, path, fdp) * Check the target directory; if you have six X's and it doesn't * exist, this runs for a *very* long time. 
*/ - if ((ret = __db_exists(path, &isdir)) != 0) { + if ((ret = __os_exists(path, &isdir)) != 0) { __db_err(dbenv, "%s: %s", path, strerror(ret)); return (ret); } @@ -738,27 +694,9 @@ __db_tmp_open(dbenv, flags, path, fdp) LF_SET(DB_CREATE | DB_EXCL); mode = __db_omode("rw----"); - /* - * Try to open a file. We block every signal we can get our hands - * on so that, if we're interrupted at the wrong time, the temporary - * file isn't left around -- of course, if we drop core in-between - * the calls we'll hang forever, but that's probably okay. ;-} - */ -#ifdef HAVE_SIGFILLSET - if (LF_ISSET(DB_TEMPORARY)) - (void)sigfillset(&set); -#endif + /* Loop, trying to open a file. */ for (;;) { -#ifdef HAVE_SIGFILLSET - if (LF_ISSET(DB_TEMPORARY)) - (void)sigprocmask(SIG_BLOCK, &set, &oset); -#endif - ret = __db_open(path, flags, flags, mode, fdp); -#ifdef HAVE_SIGFILLSET - if (LF_ISSET(DB_TEMPORARY)) - (void)sigprocmask(SIG_SETMASK, &oset, NULL); -#endif - if (ret == 0) + if ((ret = __db_open(path, flags, flags, mode, fdp)) == 0) return (0); /* diff --git a/db2/common/db_apprec.c b/db2/common/db_apprec.c index 7f0cb3a212..5e8fec4659 100644 --- a/db2/common/db_apprec.c +++ b/db2/common/db_apprec.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. 
All rights reserved.\n"; -static const char sccsid[] = "@(#)db_apprec.c 10.30 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)db_apprec.c 10.33 (Sleepycat) 10/5/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -44,7 +44,8 @@ __db_apprec(dbenv, flags) { DBT data; DB_LOG *lp; - DB_LSN ckp_lsn, first_lsn, lsn; + DB_LSN ckp_lsn, first_lsn, lsn, open_lsn; + __txn_ckp_args *ckp_args; time_t now; u_int32_t is_thread; int ret; @@ -65,10 +66,16 @@ __db_apprec(dbenv, flags) /* * Recovery is done in three passes: + * Pass #0: + * We need to find the position from which we will open files + * We need to open files beginning with the last to next + * checkpoint because we might have crashed after writing the + * last checkpoint record, but before having written out all + * the open file information. * Pass #1: - * Read forward through the log from the last checkpoint to the - * end of the log, opening and closing files so that at the end - * of the log we have the "current" set of files open. + * Read forward through the log from the second to last checkpoint + * opening and closing files so that at the end of the log we have + * the "current" set of files open. * Pass #2: * Read backward through the log undoing any uncompleted TXNs. * If doing catastrophic recovery, we read to the beginning of @@ -84,33 +91,50 @@ __db_apprec(dbenv, flags) */ /* - * Find the last checkpoint in the log. This is the point from which - * we want to begin pass #1 (the TXN_OPENFILES pass). + * Find the second to last checkpoint in the log. This is the point + * from which we want to begin pass #1 (the TXN_OPENFILES pass). */ memset(&data, 0, sizeof(data)); + ckp_args = NULL; + if ((ret = log_get(lp, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) { /* * If we don't find a checkpoint, start from the beginning. * If that fails, we're done. Note, we do not require that * there be log records if we're performing recovery. 
*/ - if ((ret = log_get(lp, &ckp_lsn, &data, DB_FIRST)) != 0) { +first: if ((ret = log_get(lp, &ckp_lsn, &data, DB_FIRST)) != 0) { if (ret == DB_NOTFOUND) ret = 0; else __db_err(dbenv, "First log record not found"); goto out; } - } + open_lsn = ckp_lsn; + } else if ((ret = __txn_ckp_read(data.data, &ckp_args)) != 0) { + __db_err(dbenv, "Invalid checkpoint record at [%ld][%ld]\n", + (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset); + goto out; + } else if (IS_ZERO_LSN(ckp_args->last_ckp) || + (ret = log_get(lp, &ckp_args->last_ckp, &data, DB_SET)) != 0) + goto first; + else + open_lsn = ckp_args->last_ckp; /* * Now, ckp_lsn is either the lsn of the last checkpoint or the lsn - * of the first record in the log. Begin the TXN_OPENFILES pass from - * that lsn, and proceed to the end of the log. + * of the first record in the log. Open_lsn is the second to last + * checkpoint or the beinning of the log; begin the TXN_OPENFILES + * pass from that lsn, and proceed to the end of the log. */ - lsn = ckp_lsn; + lsn = open_lsn; for (;;) { - ret = __db_dispatch(lp, &data, &lsn, TXN_OPENFILES, txninfo); + if (dbenv->tx_recover != NULL) + ret = dbenv->tx_recover(lp, + &data, &lsn, TXN_OPENFILES, txninfo); + else + ret = __db_dispatch(lp, + &data, &lsn, TXN_OPENFILES, txninfo); if (ret != 0 && ret != DB_TXN_CKP) goto msgerr; if ((ret = log_get(lp, &lsn, &data, DB_NEXT)) != 0) { @@ -148,8 +172,12 @@ __db_apprec(dbenv, flags) for (ret = log_get(lp, &lsn, &data, DB_LAST); ret == 0 && log_compare(&lsn, &first_lsn) > 0; ret = log_get(lp, &lsn, &data, DB_PREV)) { - ret = __db_dispatch(lp, - &data, &lsn, TXN_BACKWARD_ROLL, txninfo); + if (dbenv->tx_recover != NULL) + ret = dbenv->tx_recover(lp, + &data, &lsn, TXN_BACKWARD_ROLL, txninfo); + else + ret = __db_dispatch(lp, + &data, &lsn, TXN_BACKWARD_ROLL, txninfo); if (ret != 0) { if (ret != DB_TXN_CKP) goto msgerr; @@ -165,7 +193,12 @@ __db_apprec(dbenv, flags) */ for (ret = log_get(lp, &lsn, &data, DB_NEXT); ret == 0; ret = log_get(lp, 
&lsn, &data, DB_NEXT)) { - ret = __db_dispatch(lp, &data, &lsn, TXN_FORWARD_ROLL, txninfo); + if (dbenv->tx_recover != NULL) + ret = dbenv->tx_recover(lp, + &data, &lsn, TXN_FORWARD_ROLL, txninfo); + else + ret = __db_dispatch(lp, + &data, &lsn, TXN_FORWARD_ROLL, txninfo); if (ret != 0) { if (ret != DB_TXN_CKP) goto msgerr; @@ -207,6 +240,8 @@ msgerr: __db_err(dbenv, "Recovery function for LSN %lu %lu failed", out: F_SET(lp, is_thread); __db_txnlist_end(txninfo); + if (ckp_args != NULL) + __os_free(ckp_args, sizeof(*ckp_args)); return (ret); } diff --git a/db2/common/db_err.c b/db2/common/db_err.c index 98a414279e..e935ddfcc5 100644 --- a/db2/common/db_err.c +++ b/db2/common/db_err.c @@ -8,13 +8,15 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_err.c 10.25 (Sleepycat) 5/2/98"; +static const char sccsid[] = "@(#)db_err.c 10.42 (Sleepycat) 11/24/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <stdio.h> +#include <string.h> #ifdef __STDC__ #include <stdarg.h> @@ -24,10 +26,67 @@ static const char sccsid[] = "@(#)db_err.c 10.25 (Sleepycat) 5/2/98"; #endif #include "db_int.h" +#include "shqueue.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" +#include "log.h" +#include "log_ext.h" +#include "mp.h" +#include "mp_ext.h" +#include "txn.h" +#include "txn_ext.h" #include "common_ext.h" +#include "clib_ext.h" -static int __db_keyempty __P((const DB_ENV *)); -static int __db_rdonly __P((const DB_ENV *, const char *)); +/* + * __db_fchk -- + * General flags checking routine. + * + * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t)); + */ +int +__db_fchk(dbenv, name, flags, ok_flags) + DB_ENV *dbenv; + const char *name; + u_int32_t flags, ok_flags; +{ + return (flags & ~ok_flags ? __db_ferr(dbenv, name, 0) : 0); +} + +/* + * __db_fcchk -- + * General combination flags checking routine. 
+ * + * PUBLIC: int __db_fcchk + * PUBLIC: __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t)); + */ +int +__db_fcchk(dbenv, name, flags, flag1, flag2) + DB_ENV *dbenv; + const char *name; + u_int32_t flags, flag1, flag2; +{ + return ((flags & flag1) && + (flags & flag2) ? __db_ferr(dbenv, name, 1) : 0); +} + +/* + * __db_ferr -- + * Common flag errors. + * + * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int)); + */ +int +__db_ferr(dbenv, name, iscombo) + const DB_ENV *dbenv; + const char *name; + int iscombo; +{ + __db_err(dbenv, "illegal flag %sspecified to %s", + iscombo ? "combination " : "", name); + return (EINVAL); +} /* * __db_err -- @@ -55,561 +114,98 @@ __db_err(dbenv, fmt, va_alist) if (dbenv == NULL) return; + if (dbenv->db_errcall != NULL) { #ifdef __STDC__ - va_start(ap, fmt); + va_start(ap, fmt); #else - va_start(ap); + va_start(ap); #endif - if (dbenv->db_errcall != NULL) { (void)vsnprintf(errbuf, sizeof(errbuf), fmt, ap); dbenv->db_errcall(dbenv->db_errpfx, errbuf); + va_end(ap); } if (dbenv->db_errfile != NULL) { if (dbenv->db_errpfx != NULL) (void)fprintf(dbenv->db_errfile, "%s: ", dbenv->db_errpfx); +#ifdef __STDC__ + va_start(ap, fmt); +#else + va_start(ap); +#endif (void)vfprintf(dbenv->db_errfile, fmt, ap); (void)fprintf(dbenv->db_errfile, "\n"); (void)fflush(dbenv->db_errfile); + va_end(ap); } - va_end(ap); -} - -/* - * XXX - * Provide ANSI C prototypes for the panic functions. Some compilers, (e.g., - * MS VC 4.2) get upset if they aren't here, even though the K&R declaration - * appears before the assignment in the __db__panic() call. 
- */ -static int __db_ecursor __P((DB *, DB_TXN *, DBC **)); -static int __db_edel __P((DB *, DB_TXN *, DBT *, u_int32_t)); -static int __db_efd __P((DB *, int *)); -static int __db_egp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -static int __db_estat __P((DB *, void *, void *(*)(size_t), u_int32_t)); -static int __db_esync __P((DB *, u_int32_t)); - -/* - * __db_ecursor -- - * After-panic cursor routine. - */ -static int -__db_ecursor(a, b, c) - DB *a; - DB_TXN *b; - DBC **c; -{ - COMPQUIET(a, NULL); - COMPQUIET(b, NULL); - COMPQUIET(c, NULL); - - return (EPERM); -} - -/* - * __db_edel -- - * After-panic delete routine. - */ -static int -__db_edel(a, b, c, d) - DB *a; - DB_TXN *b; - DBT *c; - u_int32_t d; -{ - COMPQUIET(a, NULL); - COMPQUIET(b, NULL); - COMPQUIET(c, NULL); - COMPQUIET(d, 0); - - return (EPERM); } /* - * __db_efd -- - * After-panic fd routine. - */ -static int -__db_efd(a, b) - DB *a; - int *b; -{ - COMPQUIET(a, NULL); - COMPQUIET(b, NULL); - - return (EPERM); -} - -/* - * __db_egp -- - * After-panic get/put routine. - */ -static int -__db_egp(a, b, c, d, e) - DB *a; - DB_TXN *b; - DBT *c, *d; - u_int32_t e; -{ - COMPQUIET(a, NULL); - COMPQUIET(b, NULL); - COMPQUIET(c, NULL); - COMPQUIET(d, NULL); - COMPQUIET(e, 0); - - return (EPERM); -} - -/* - * __db_estat -- - * After-panic stat routine. - */ -static int -__db_estat(a, b, c, d) - DB *a; - void *b; - void *(*c) __P((size_t)); - u_int32_t d; -{ - COMPQUIET(a, NULL); - COMPQUIET(b, NULL); - COMPQUIET(c, NULL); - COMPQUIET(d, 0); - - return (EPERM); -} - -/* - * __db_esync -- - * After-panic sync routine. - */ -static int -__db_esync(a, b) - DB *a; - u_int32_t b; -{ - COMPQUIET(a, NULL); - COMPQUIET(b, 0); - - return (EPERM); -} - -/* - * __db_panic -- - * Lock out the tree due to unrecoverable error. + * __db_pgerr -- + * Error when unable to retrieve a specified page. 
* - * PUBLIC: int __db_panic __P((DB *)); + * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t)); */ int -__db_panic(dbp) +__db_pgerr(dbp, pgno) DB *dbp; + db_pgno_t pgno; { /* - * XXX - * We should shut down all of the process's cursors, too. - * - * We should call mpool and have it shut down the file, so we get - * other processes sharing this file as well. - * - * Chaos reigns within. - * Reflect, repent, and reboot. - * Order shall return. + * Three things are certain: + * Death, taxes, and lost data. + * Guess which has occurred. */ - dbp->cursor = __db_ecursor; - dbp->del = __db_edel; - dbp->fd = __db_efd; - dbp->get = __db_egp; - dbp->put = __db_egp; - dbp->stat = __db_estat; - dbp->sync = __db_esync; - - return (EPERM); + __db_err(dbp->dbenv, + "unable to create/retrieve page %lu", (u_long)pgno); + return (__db_panic(dbp->dbenv, EIO)); } -/* Check for invalid flags. */ -#undef DB_CHECK_FLAGS -#define DB_CHECK_FLAGS(dbenv, name, flags, ok_flags) \ - if ((flags) & ~(ok_flags)) \ - return (__db_ferr(dbenv, name, 0)); -/* Check for invalid flag combinations. */ -#undef DB_CHECK_FCOMBO -#define DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2) \ - if ((flags) & (flag1) && (flags) & (flag2)) \ - return (__db_ferr(dbenv, name, 1)); - /* - * __db_fchk -- - * General flags checking routine. + * __db_pgfmt -- + * Error when a page has the wrong format. * - * PUBLIC: int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t)); + * PUBLIC: int __db_pgfmt __P((DB *, db_pgno_t)); */ int -__db_fchk(dbenv, name, flags, ok_flags) - DB_ENV *dbenv; - const char *name; - u_int32_t flags, ok_flags; +__db_pgfmt(dbp, pgno) + DB *dbp; + db_pgno_t pgno; { - DB_CHECK_FLAGS(dbenv, name, flags, ok_flags); - return (0); + __db_err(dbp->dbenv, + "page %lu: illegal page type or format", (u_long)pgno); + return (__db_panic(dbp->dbenv, EINVAL)); } /* - * __db_fcchk -- - * General combination flags checking routine. + * __db_panic -- + * Lock out the tree due to unrecoverable error. 
* - * PUBLIC: int __db_fcchk - * PUBLIC: __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t)); + * PUBLIC: int __db_panic __P((DB_ENV *, int)); */ int -__db_fcchk(dbenv, name, flags, flag1, flag2) +__db_panic(dbenv, errval) DB_ENV *dbenv; - const char *name; - u_int32_t flags, flag1, flag2; + int errval; { - DB_CHECK_FCOMBO(dbenv, name, flags, flag1, flag2); - return (0); -} + if (dbenv != NULL) { + dbenv->db_panic = errval; -/* - * __db_cdelchk -- - * Common cursor delete argument checking routine. - * - * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int)); - */ -int -__db_cdelchk(dbp, flags, isrdonly, isvalid) - const DB *dbp; - u_int32_t flags; - int isrdonly, isvalid; -{ - /* Check for changes to a read-only tree. */ - if (isrdonly) - return (__db_rdonly(dbp->dbenv, "c_del")); + (void)__log_panic(dbenv); + (void)__memp_panic(dbenv); + (void)__lock_panic(dbenv); + (void)__txn_panic(dbenv); - /* Check for invalid dbc->c_del() function flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "c_del", flags, 0); - - /* - * The cursor must be initialized, return -1 for an invalid cursor, - * otherwise 0. - */ - return (isvalid ? 0 : EINVAL); -} + __db_err(dbenv, "PANIC: %s", strerror(errval)); -/* - * __db_cgetchk -- - * Common cursor get argument checking routine. - * - * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int)); - */ -int -__db_cgetchk(dbp, key, data, flags, isvalid) - const DB *dbp; - DBT *key, *data; - u_int32_t flags; - int isvalid; -{ - int key_einval, key_flags; - - key_flags = key_einval = 0; - - /* Check for invalid dbc->c_get() function flags. 
*/ - switch (flags) { - case DB_CURRENT: - case DB_FIRST: - case DB_LAST: - case DB_NEXT: - case DB_PREV: - key_flags = 1; - break; - case DB_SET_RANGE: - key_einval = key_flags = 1; - break; - case DB_SET: - key_einval = 1; - break; - case DB_GET_RECNO: - if (!F_ISSET(dbp, DB_BT_RECNUM)) - goto err; - break; - case DB_SET_RECNO: - if (!F_ISSET(dbp, DB_BT_RECNUM)) - goto err; - key_einval = key_flags = 1; - break; - default: -err: return (__db_ferr(dbp->dbenv, "c_get", 0)); + if (dbenv->db_paniccall != NULL) + dbenv->db_paniccall(dbenv, errval); } - /* Check for invalid key/data flags. */ - if (key_flags) - DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, - DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); - DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, - DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); - - /* Check dbt's for valid flags when multi-threaded. */ - if (F_ISSET(dbp, DB_AM_THREAD)) { - if (!F_ISSET(data, DB_DBT_USERMEM | DB_DBT_MALLOC)) - return (__db_ferr(dbp->dbenv, "threaded data", 1)); - if (key_flags && - !F_ISSET(key, DB_DBT_USERMEM | DB_DBT_MALLOC)) - return (__db_ferr(dbp->dbenv, "threaded key", 1)); - } - - /* Check for missing keys. */ - if (key_einval && (key->data == NULL || key->size == 0)) - return (__db_keyempty(dbp->dbenv)); - /* - * The cursor must be initialized for DB_CURRENT, return -1 for an - * invalid cursor, otherwise 0. + * Chaos reigns within. + * Reflect, repent, and reboot. + * Order shall return. */ - return (isvalid || flags != DB_CURRENT ? 0 : EINVAL); -} - -/* - * __db_cputchk -- - * Common cursor put argument checking routine. - * - * PUBLIC: int __db_cputchk __P((const DB *, - * PUBLIC: const DBT *, DBT *, u_int32_t, int, int)); - */ -int -__db_cputchk(dbp, key, data, flags, isrdonly, isvalid) - const DB *dbp; - const DBT *key; - DBT *data; - u_int32_t flags; - int isrdonly, isvalid; -{ - int key_einval, key_flags; - - /* Check for changes to a read-only tree. 
*/ - if (isrdonly) - return (__db_rdonly(dbp->dbenv, "c_put")); - - /* Check for invalid dbc->c_put() function flags. */ - key_einval = key_flags = 0; - switch (flags) { - case DB_AFTER: - case DB_BEFORE: - if (dbp->type == DB_RECNO && !F_ISSET(dbp, DB_RE_RENUMBER)) - goto err; - if (dbp->type != DB_RECNO && !F_ISSET(dbp, DB_AM_DUP)) - goto err; - break; - case DB_CURRENT: - break; - case DB_KEYFIRST: - case DB_KEYLAST: - if (dbp->type == DB_RECNO) - goto err; - key_einval = key_flags = 1; - break; - default: -err: return (__db_ferr(dbp->dbenv, "c_put", 0)); - } - - /* Check for invalid key/data flags. */ - if (key_flags) - DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, - DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); - DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, - DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); - - /* Check for missing keys. */ - if (key_einval && (key->data == NULL || key->size == 0)) - return (__db_keyempty(dbp->dbenv)); - - /* - * The cursor must be initialized for anything other than DB_KEYFIRST - * and DB_KEYLAST, return -1 for an invalid cursor, otherwise 0. - */ - return (isvalid || - (flags != DB_KEYFIRST && flags != DB_KEYLAST) ? 0 : EINVAL); -} - -/* - * __db_delchk -- - * Common delete argument checking routine. - * - * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int)); - */ -int -__db_delchk(dbp, key, flags, isrdonly) - const DB *dbp; - DBT *key; - u_int32_t flags; - int isrdonly; -{ - /* Check for changes to a read-only tree. */ - if (isrdonly) - return (__db_rdonly(dbp->dbenv, "delete")); - - /* Check for invalid db->del() function flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "delete", flags, 0); - - /* Check for missing keys. */ - if (key->data == NULL || key->size == 0) - return (__db_keyempty(dbp->dbenv)); - - return (0); -} - -/* - * __db_getchk -- - * Common get argument checking routine. 
- * - * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t)); - */ -int -__db_getchk(dbp, key, data, flags) - const DB *dbp; - const DBT *key; - DBT *data; - u_int32_t flags; -{ - /* Check for invalid db->get() function flags. */ - DB_CHECK_FLAGS(dbp->dbenv, - "get", flags, F_ISSET(dbp, DB_BT_RECNUM) ? DB_SET_RECNO : 0); - - /* Check for invalid key/data flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, 0); - DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, - DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); - DB_CHECK_FCOMBO(dbp->dbenv, - "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM); - if (F_ISSET(dbp, DB_AM_THREAD) && - !F_ISSET(data, DB_DBT_MALLOC | DB_DBT_USERMEM)) - return (__db_ferr(dbp->dbenv, "threaded data", 1)); - - /* Check for missing keys. */ - if (key->data == NULL || key->size == 0) - return (__db_keyempty(dbp->dbenv)); - - return (0); -} - -/* - * __db_putchk -- - * Common put argument checking routine. - * - * PUBLIC: int __db_putchk - * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int, int)); - */ -int -__db_putchk(dbp, key, data, flags, isrdonly, isdup) - const DB *dbp; - DBT *key; - const DBT *data; - u_int32_t flags; - int isrdonly, isdup; -{ - /* Check for changes to a read-only tree. */ - if (isrdonly) - return (__db_rdonly(dbp->dbenv, "put")); - - /* Check for invalid db->put() function flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "put", flags, - DB_NOOVERWRITE | (dbp->type == DB_RECNO ? DB_APPEND : 0)); - - /* Check for invalid key/data flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "key", key->flags, 0); - DB_CHECK_FLAGS(dbp->dbenv, "data", data->flags, - DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL); - DB_CHECK_FCOMBO(dbp->dbenv, - "data", data->flags, DB_DBT_MALLOC, DB_DBT_USERMEM); - - /* Check for missing keys. */ - if (key->data == NULL || key->size == 0) - return (__db_keyempty(dbp->dbenv)); - - /* Check for partial puts in the presence of duplicates. 
*/ - if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) { - __db_err(dbp->dbenv, -"a partial put in the presence of duplicates requires a cursor operation"); - return (EINVAL); - } - - return (0); -} - -/* - * __db_statchk -- - * Common stat argument checking routine. - * - * PUBLIC: int __db_statchk __P((const DB *, u_int32_t)); - */ -int -__db_statchk(dbp, flags) - const DB *dbp; - u_int32_t flags; -{ - /* Check for invalid db->stat() function flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "stat", flags, DB_RECORDCOUNT); - - if (LF_ISSET(DB_RECORDCOUNT) && - dbp->type == DB_BTREE && !F_ISSET(dbp, DB_BT_RECNUM)) - return (__db_ferr(dbp->dbenv, "stat", 0)); - - return (0); -} - -/* - * __db_syncchk -- - * Common sync argument checking routine. - * - * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t)); - */ -int -__db_syncchk(dbp, flags) - const DB *dbp; - u_int32_t flags; -{ - /* Check for invalid db->sync() function flags. */ - DB_CHECK_FLAGS(dbp->dbenv, "sync", flags, 0); - - return (0); -} - -/* - * __db_ferr -- - * Common flag errors. - * - * PUBLIC: int __db_ferr __P((const DB_ENV *, const char *, int)); - */ -int -__db_ferr(dbenv, name, iscombo) - const DB_ENV *dbenv; - const char *name; - int iscombo; -{ - __db_err(dbenv, "illegal flag %sspecified to %s", - iscombo ? "combination " : "", name); - return (EINVAL); -} - -/* - * __db_rdonly -- - * Common readonly message. - */ -static int -__db_rdonly(dbenv, name) - const DB_ENV *dbenv; - const char *name; -{ - __db_err(dbenv, "%s: attempt to modify a read-only tree", name); - return (EACCES); -} - -/* - * __db_keyempty -- - * Common missing or empty key value message. 
- */ -static int -__db_keyempty(dbenv) - const DB_ENV *dbenv; -{ - __db_err(dbenv, "missing or empty key value specified"); - return (EINVAL); + return (DB_RUNRECOVERY); } diff --git a/db2/common/db_region.c b/db2/common/db_region.c index 284af6176a..12abfa524d 100644 --- a/db2/common/db_region.c +++ b/db2/common/db_region.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_region.c 10.46 (Sleepycat) 5/26/98"; +static const char sccsid[] = "@(#)db_region.c 10.53 (Sleepycat) 11/10/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -46,7 +46,7 @@ __db_rattach(infop) ret = retry_cnt = 0; /* Round off the requested size to the next page boundary. */ - DB_ROUNDOFF(infop->size); + DB_ROUNDOFF(infop->size, DB_VMPAGESIZE); /* Some architectures have hard limits on the maximum region size. */ #ifdef DB_REGIONSIZE_MAX @@ -61,7 +61,7 @@ loop: infop->addr = NULL; infop->fd = -1; infop->segid = INVALID_SEGID; if (infop->name != NULL) { - FREES(infop->name); + __os_freestr(infop->name); infop->name = NULL; } F_CLR(infop, REGION_CANGROW | REGION_CREATED); @@ -74,6 +74,11 @@ loop: infop->addr = NULL; * (Theoretically, we could probably get a file descriptor to lock * other types of shared regions, but I don't see any reason to * bother.) + * + * Since we may be using shared memory regions, e.g., shmget(2), + * and not mmap of regular files, the backing file may be only a + * few tens of bytes in length. So, this depends on the ability + * to fcntl lock file offsets much larger than the physical file. */ malloc_possible = 0; #endif @@ -91,15 +96,16 @@ loop: infop->addr = NULL; * than either anonymous memory or a shared file. 
*/ if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) { - if ((infop->addr = __db_malloc(infop->size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0) + return (ret); /* - * It's sometimes significantly faster to page-fault in all - * of the region's pages before we run the application, as - * we can see fairly nasty side-effects when we page-fault - * while holding various locks, i.e., the lock takes a long - * time, and other threads convoy behind the lock holder. + * It's sometimes significantly faster to page-fault in all of + * the region's pages before we run the application, as we see + * nasty side-effects when we page-fault while holding various + * locks, i.e., the lock takes a long time to acquire because + * of the underlying page fault, and the other threads convoy + * behind the lock holder. */ if (DB_GLOBAL(db_region_init)) for (p = infop->addr; @@ -159,7 +165,7 @@ loop: infop->addr = NULL; * 3. Memory backed by a regular file (mmap(2)). * * We instantiate a backing file in all cases, which contains at least - * the RLAYOUT structure, and in case #4, contains the actual region. + * the RLAYOUT structure, and in case #3, contains the actual region. * This is necessary for a couple of reasons: * * First, the mpool region uses temporary files to name regions, and @@ -218,7 +224,7 @@ loop: infop->addr = NULL; * And yes, this makes me want to take somebody and kill them, * but I can't think of any other solution. 
*/ - if ((ret = __db_ioinfo(infop->name, + if ((ret = __os_ioinfo(infop->name, infop->fd, &mbytes, &bytes, NULL)) != 0) goto errmsg; size = mbytes * MEGABYTE + bytes; @@ -233,7 +239,7 @@ loop: infop->addr = NULL; if (size < sizeof(RLAYOUT)) goto retry; if ((ret = - __db_read(infop->fd, &rl, sizeof(rl), &nr)) != 0) + __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0) goto retry; if (rl.valid != DB_REGIONMAGIC) goto retry; @@ -284,6 +290,7 @@ loop: infop->addr = NULL; } else goto err; } + region_init: /* * Initialize the common region information. @@ -321,6 +328,7 @@ region_init: rlp->refcnt = 1; rlp->size = infop->size; db_version(&rlp->majver, &rlp->minver, &rlp->patch); + rlp->panic = 0; rlp->segid = infop->segid; rlp->flags = 0; if (F_ISSET(infop, REGION_ANONYMOUS)) @@ -347,13 +355,19 @@ region_init: * the file. */ if (F_ISSET(infop, REGION_ANONYMOUS)) { - if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, 0)) != 0) + if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0) goto err; if ((ret = - __db_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0) + __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0) goto err; } } else { + /* Check to see if the region has had catastrophic failure. */ + if (rlp->panic) { + ret = DB_RUNRECOVERY; + goto err; + } + /* * Check the valid flag to ensure the region is initialized. * If the valid flag has not been set, the mutex may not have @@ -380,18 +394,6 @@ region_init: } /* - * Problem #2: We want a bigger region than has previously been - * created. Detected by checking if the region is smaller than - * our caller requested. If it is, we grow the region, (which - * does the detach and re-attach for us). - */ - if (grow_region != 0 && - (ret = __db_rgrow(infop, grow_region)) != 0) { - (void)__db_mutex_unlock(&rlp->lock, infop->fd); - goto err; - } - - /* * Problem #3: when we checked the size of the file, it was * still growing as part of creation. Detected by the fact * that infop->size isn't the same size as the region. 
@@ -419,16 +421,16 @@ retry: /* Discard the region. */ /* Discard the backing file. */ if (infop->fd != -1) { - (void)__db_close(infop->fd); + (void)__os_close(infop->fd); infop->fd = -1; if (F_ISSET(infop, REGION_CREATED)) - (void)__db_unlink(infop->name); + (void)__os_unlink(infop->name); } /* Discard the name. */ if (infop->name != NULL) { - FREES(infop->name); + __os_freestr(infop->name); infop->name = NULL; } @@ -438,7 +440,7 @@ retry: /* Discard the region. */ */ if (ret == 0) { if (++retry_cnt <= 3) { - __db_sleep(retry_cnt * 2, 0); + __os_sleep(retry_cnt * 2, 0); goto loop; } ret = EAGAIN; @@ -481,10 +483,11 @@ retry: /* Discard the region. */ F_SET(infop, REGION_REMOVED); F_CLR(infop, REGION_CANGROW); - (void)__db_close(infop->fd); - (void)__db_unlink(infop->name); + (void)__os_close(infop->fd); + (void)__os_unlink(infop->name); } } + return (ret); } @@ -514,7 +517,7 @@ __db_rdetach(infop) * action required is freeing the memory. */ if (F_ISSET(infop, REGION_MALLOC)) { - __db_free(infop->addr); + __os_free(infop->addr, 0); goto done; } @@ -549,7 +552,7 @@ __db_rdetach(infop) (void)__db_mutex_unlock(&rlp->lock, infop->fd); /* Close the backing file descriptor. */ - (void)__db_close(infop->fd); + (void)__os_close(infop->fd); infop->fd = -1; /* Discard our mapping of the region. */ @@ -561,13 +564,13 @@ __db_rdetach(infop) if ((t_ret = __db_unlinkregion(infop->name, infop) != 0) && ret == 0) ret = t_ret; - if ((t_ret = __db_unlink(infop->name) != 0) && ret == 0) + if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0) ret = t_ret; } done: /* Discard the name. */ if (infop->name != NULL) { - FREES(infop->name); + __os_freestr(infop->name); infop->name = NULL; } @@ -629,8 +632,8 @@ __db_runlink(infop, force) * (REGION_PRIVATE) ones, regardless of whether or not it's used to * back the region. If that file doesn't exist, we're done. 
*/ - if (__db_exists(name, NULL) != 0) { - FREES(name); + if (__os_exists(name, NULL) != 0) { + __os_freestr(name); return (0); } @@ -641,12 +644,12 @@ __db_runlink(infop, force) */ if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0) goto errmsg; - if ((ret = __db_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) + if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0) goto errmsg; size = mbytes * MEGABYTE + bytes; if (size <= sizeof(RLAYOUT)) { - if ((ret = __db_read(fd, &rl, sizeof(rl), &nr)) != 0) + if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0) goto errmsg; if (rl.valid != DB_REGIONMAGIC) { __db_err(infop->dbenv, @@ -673,16 +676,16 @@ __db_runlink(infop, force) * because some architectures (e.g., Win32) won't unlink a file if * open file descriptors remain. */ - (void)__db_close(fd); - if ((t_ret = __db_unlink(name)) != 0 && ret == 0) + (void)__os_close(fd); + if ((t_ret = __os_unlink(name)) != 0 && ret == 0) ret = t_ret; if (0) { errmsg: __db_err(infop->dbenv, "%s: %s", name, strerror(ret)); -err: (void)__db_close(fd); +err: (void)__os_close(fd); } - FREES(name); + __os_freestr(name); return (ret); } @@ -715,7 +718,7 @@ __db_rgrow(infop, new_size) * determine the additional space required. */ rlp = (RLAYOUT *)infop->addr; - DB_ROUNDOFF(new_size); + DB_ROUNDOFF(new_size, DB_VMPAGESIZE); increment = new_size - rlp->size; if ((ret = __db_growregion(infop, increment)) != 0) @@ -745,7 +748,7 @@ __db_growregion(infop, increment) char buf[DB_VMPAGESIZE]; /* Seek to the end of the region. */ - if ((ret = __db_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0) + if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0) goto err; /* Write nuls to the new bytes. */ @@ -760,7 +763,7 @@ __db_growregion(infop, increment) /* Extend the region by writing each new page. 
*/ for (i = 0; i < increment; i += DB_VMPAGESIZE) { if ((ret = - __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0) + __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0) goto err; if (nw != sizeof(buf)) goto eio; @@ -776,36 +779,44 @@ __db_growregion(infop, increment) */ pages = (increment - DB_VMPAGESIZE) / MEGABYTE; relative = (increment - DB_VMPAGESIZE) % MEGABYTE; - if ((ret = __db_seek(infop->fd, + if ((ret = __os_seek(infop->fd, MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0) goto err; - if ((ret = __db_write(infop->fd, buf, sizeof(buf), &nw)) != 0) + if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0) goto err; if (nw != sizeof(buf)) goto eio; /* - * It's sometimes significantly faster to page-fault in all - * of the region's pages before we run the application, as - * we can see fairly nasty side-effects when we page-fault - * while holding various locks, i.e., the lock takes a long - * time, and other threads convoy behind the lock holder. + * It's sometimes significantly faster to page-fault in all of + * the region's pages before we run the application, as we see + * nasty side-effects when we page-fault while holding various + * locks, i.e., the lock takes a long time to acquire because + * of the underlying page fault, and the other threads convoy + * behind the lock holder. + * + * We also use REGION_INIT to guarantee that there is enough + * disk space for the region, so we also write a byte to each + * page. Reading the byte is insufficient as some systems + * (e.g., Solaris) do not instantiate disk pages to satisfy + * a read, and so we don't know if there is enough disk space + * or not. */ if (DB_GLOBAL(db_region_init)) { pages = increment / MEGABYTE; relative = increment % MEGABYTE; - if ((ret = __db_seek(infop->fd, + if ((ret = __os_seek(infop->fd, MEGABYTE, pages, relative, 1, SEEK_END)) != 0) goto err; - /* Read a byte from each page. */ + /* Write a byte to each page. 
*/ for (i = 0; i < increment; i += DB_VMPAGESIZE) { if ((ret = - __db_read(infop->fd, buf, 1, &nr)) != 0) + __os_write(infop->fd, buf, 1, &nr)) != 0) goto err; if (nr != 1) goto eio; - if ((ret = __db_seek(infop->fd, + if ((ret = __os_seek(infop->fd, 0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0) goto err; } diff --git a/db2/common/db_salloc.c b/db2/common/db_salloc.c index c02d7e18e9..d58b79f3c4 100644 --- a/db2/common/db_salloc.c +++ b/db2/common/db_salloc.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_salloc.c 10.13 (Sleepycat) 5/10/98"; +static const char sccsid[] = "@(#)db_salloc.c 10.14 (Sleepycat) 11/16/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -170,7 +170,7 @@ __db_shalloc_free(regionp, ptr) /* Trash the returned memory. */ #ifdef DIAGNOSTIC - memset(ptr, 0xff, free_size); + memset(ptr, 0xdb, free_size); #endif /* @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)db.h.src 10.131 (Sleepycat) 6/2/98 + * @(#)db.h 10.174 (Sleepycat) 1/3/99 */ #ifndef _DB_H_ @@ -56,34 +56,20 @@ * We also provide the standard u_int, u_long etc., if they're not provided * by the system. */ -#ifndef __BIT_TYPES_DEFINED__ -#define __BIT_TYPES_DEFINED__ - - - - - -#endif - - - - - #define DB_VERSION_MAJOR 2 -#define DB_VERSION_MINOR 4 -#define DB_VERSION_PATCH 14 -#define DB_VERSION_STRING "Sleepycat Software: DB 2.4.14: (6/2/98)" +#define DB_VERSION_MINOR 7 +#define DB_VERSION_PATCH 5 +#define DB_VERSION_STRING "Sleepycat Software: Berkeley DB 2.7.5: (04/18/99)" typedef u_int32_t db_pgno_t; /* Page number type. */ typedef u_int16_t db_indx_t; /* Page offset type. */ #define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ typedef u_int32_t db_recno_t; /* Record number type. */ -typedef size_t DB_LOCK; /* Object returned by lock manager. */ #define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ -#define DB_FILE_ID_LEN 20 /* DB file ID length. 
*/ +typedef size_t DB_LOCK; /* Object returned by lock manager. */ /* Forward structure declarations, so applications get type checking. */ struct __db; typedef struct __db DB; @@ -93,6 +79,7 @@ struct __db; typedef struct __db DB; struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT; struct __db_dbt; typedef struct __db_dbt DBT; struct __db_env; typedef struct __db_env DB_ENV; +struct __db_ilock; typedef struct __db_ilock DB_LOCK_ILOCK; struct __db_info; typedef struct __db_info DB_INFO; struct __db_lock_stat; typedef struct __db_lock_stat DB_LOCK_STAT; struct __db_lockregion; typedef struct __db_lockregion DB_LOCKREGION; @@ -121,8 +108,7 @@ struct __db_dbt { u_int32_t dlen; /* RO: get/put record length. */ u_int32_t doff; /* RO: get/put record offset. */ -#define DB_DBT_INTERNAL 0x01 /* Perform any mallocs using regular - malloc, not the user's malloc. */ +#define DB_DBT_INTERNAL 0x01 /* Ignore user's malloc (internal). */ #define DB_DBT_MALLOC 0x02 /* Return in allocated memory. */ #define DB_DBT_PARTIAL 0x04 /* Partial put/get. */ #define DB_DBT_USERMEM 0x08 /* Return in user's memory. */ @@ -130,38 +116,36 @@ struct __db_dbt { }; /* - * DB internal configuration. + * DB run-time interface configuration. * * There are a set of functions that the application can replace with its * own versions, and some other knobs which can be turned at run-time. */ -#define DB_FUNC_CALLOC 1 /* DELETED: ANSI C calloc. */ -#define DB_FUNC_CLOSE 2 /* POSIX 1003.1 close. */ -#define DB_FUNC_DIRFREE 3 /* DB: free directory list. */ -#define DB_FUNC_DIRLIST 4 /* DB: create directory list. */ -#define DB_FUNC_EXISTS 5 /* DB: return if file exists. */ -#define DB_FUNC_FREE 6 /* ANSI C free. */ -#define DB_FUNC_FSYNC 7 /* POSIX 1003.1 fsync. */ -#define DB_FUNC_IOINFO 8 /* DB: return file I/O information. */ -#define DB_FUNC_MALLOC 9 /* ANSI C malloc. */ -#define DB_FUNC_MAP 10 /* DB: map file into shared memory. */ -#define DB_FUNC_OPEN 11 /* POSIX 1003.1 open. 
*/ -#define DB_FUNC_READ 12 /* POSIX 1003.1 read. */ -#define DB_FUNC_REALLOC 13 /* ANSI C realloc. */ +#define DB_FUNC_CLOSE 1 /* POSIX 1003.1 close. */ +#define DB_FUNC_DIRFREE 2 /* DB: free directory list. */ +#define DB_FUNC_DIRLIST 3 /* DB: create directory list. */ +#define DB_FUNC_EXISTS 4 /* DB: return if file exists. */ +#define DB_FUNC_FREE 5 /* ANSI C free. */ +#define DB_FUNC_FSYNC 6 /* POSIX 1003.1 fsync. */ +#define DB_FUNC_IOINFO 7 /* DB: return file I/O information. */ +#define DB_FUNC_MALLOC 8 /* ANSI C malloc. */ +#define DB_FUNC_MAP 9 /* DB: map file into shared memory. */ +#define DB_FUNC_OPEN 10 /* POSIX 1003.1 open. */ +#define DB_FUNC_READ 11 /* POSIX 1003.1 read. */ +#define DB_FUNC_REALLOC 12 /* ANSI C realloc. */ +#define DB_FUNC_RUNLINK 13 /* DB: remove a shared region. */ #define DB_FUNC_SEEK 14 /* POSIX 1003.1 lseek. */ #define DB_FUNC_SLEEP 15 /* DB: sleep secs/usecs. */ -#define DB_FUNC_STRDUP 16 /* DELETED: DB: strdup(3). */ -#define DB_FUNC_UNLINK 17 /* POSIX 1003.1 unlink. */ -#define DB_FUNC_UNMAP 18 /* DB: unmap shared memory file. */ -#define DB_FUNC_WRITE 19 /* POSIX 1003.1 write. */ -#define DB_FUNC_YIELD 20 /* DB: yield thread to scheduler. */ -#define DB_TSL_SPINS 21 /* DB: initialize spin count. */ -#define DB_FUNC_RUNLINK 22 /* DB: remove a shared region. */ -#define DB_REGION_ANON 23 /* DB: anonymous, unnamed regions. */ -#define DB_REGION_INIT 24 /* DB: page-fault regions in create. */ -#define DB_REGION_NAME 25 /* DB: anonymous, named regions. */ -#define DB_MUTEXLOCKS 26 /* DB: turn off all mutex locks. */ -#define DB_PAGEYIELD 27 /* DB: yield the CPU on pool get. */ +#define DB_FUNC_UNLINK 16 /* POSIX 1003.1 unlink. */ +#define DB_FUNC_UNMAP 17 /* DB: unmap shared memory file. */ +#define DB_FUNC_WRITE 18 /* POSIX 1003.1 write. */ +#define DB_FUNC_YIELD 19 /* DB: yield thread to scheduler. */ +#define DB_MUTEXLOCKS 20 /* DB: turn off all mutex locks. */ +#define DB_PAGEYIELD 21 /* DB: yield the CPU on pool get. 
*/ +#define DB_REGION_ANON 22 /* DB: anonymous, unnamed regions. */ +#define DB_REGION_INIT 23 /* DB: page-fault regions in create. */ +#define DB_REGION_NAME 24 /* DB: anonymous, named regions. */ +#define DB_TSL_SPINS 25 /* DB: initialize spin count. */ /* * Database configuration and initialization. @@ -177,29 +161,18 @@ struct __db_dbt { * Flags understood by db_appinit(3). */ /* 0x000007 COMMON MASK. */ -#define DB_INIT_LOCK 0x000008 /* Initialize locking. */ -#define DB_INIT_LOG 0x000010 /* Initialize logging. */ -#define DB_INIT_MPOOL 0x000020 /* Initialize mpool. */ -#define DB_INIT_TXN 0x000040 /* Initialize transactions. */ -#define DB_MPOOL_PRIVATE 0x000080 /* Mpool: private memory pool. */ -#define __UNUSED_100 0x000100 +#define DB_INIT_CDB 0x000008 /* Concurrent Access Methods. */ +#define DB_INIT_LOCK 0x000010 /* Initialize locking. */ +#define DB_INIT_LOG 0x000020 /* Initialize logging. */ +#define DB_INIT_MPOOL 0x000040 /* Initialize mpool. */ +#define DB_INIT_TXN 0x000080 /* Initialize transactions. */ +#define DB_MPOOL_PRIVATE 0x000100 /* Mpool: private memory pool. */ #define DB_RECOVER 0x000200 /* Run normal recovery. */ #define DB_RECOVER_FATAL 0x000400 /* Run catastrophic recovery. */ #define DB_TXN_NOSYNC 0x000800 /* Do not sync log on commit. */ #define DB_USE_ENVIRON 0x001000 /* Use the environment. */ #define DB_USE_ENVIRON_ROOT 0x002000 /* Use the environment if root. */ -/* CURRENTLY UNUSED LOCK FLAGS. */ -#define DB_TXN_LOCK_2PL 0x000000 /* Two-phase locking. */ -#define DB_TXN_LOCK_OPTIMIST 0x000000 /* Optimistic locking. */ -#define DB_TXN_LOCK_MASK 0x000000 /* Lock flags mask. */ - -/* CURRENTLY UNUSED LOG FLAGS. */ -#define DB_TXN_LOG_REDO 0x000000 /* Redo-only logging. */ -#define DB_TXN_LOG_UNDO 0x000000 /* Undo-only logging. */ -#define DB_TXN_LOG_UNDOREDO 0x000000 /* Undo/redo write-ahead logging. */ -#define DB_TXN_LOG_MASK 0x000000 /* Log flags mask. */ - /* * Flags understood by db_open(3). 
* @@ -207,23 +180,22 @@ struct __db_dbt { * DB_SEQUENTIAL is currently internal, but may be exported some day. */ /* 0x000007 COMMON MASK. */ -/* 0x003fff ALREADY USED. */ -#define __UNUSED_4000 0x004000 -#define DB_EXCL 0x008000 /* O_EXCL: exclusive open. */ -#define DB_RDONLY 0x010000 /* O_RDONLY: read-only. */ -#define DB_SEQUENTIAL 0x020000 /* Indicate sequential access. */ -#define DB_TEMPORARY 0x040000 /* Remove on last close. */ -#define DB_TRUNCATE 0x080000 /* O_TRUNCATE: replace existing DB. */ +/* 0x001fff ALREADY USED. */ +#define DB_EXCL 0x002000 /* O_EXCL: exclusive open (internal). */ +#define DB_RDONLY 0x004000 /* O_RDONLY: read-only. */ +#define DB_SEQUENTIAL 0x008000 /* Sequential access (internal). */ +#define DB_TEMPORARY 0x010000 /* Remove on last close (internal). */ +#define DB_TRUNCATE 0x020000 /* O_TRUNCATE: replace existing DB. */ /* * Deadlock detector modes; used in the DBENV structure to configure the * locking subsystem. */ -#define DB_LOCK_NORUN 0x0 -#define DB_LOCK_DEFAULT 0x1 /* Default policy. */ -#define DB_LOCK_OLDEST 0x2 /* Abort oldest transaction. */ -#define DB_LOCK_RANDOM 0x3 /* Abort random transaction. */ -#define DB_LOCK_YOUNGEST 0x4 /* Abort youngest transaction. */ +#define DB_LOCK_NORUN 0 +#define DB_LOCK_DEFAULT 1 /* Default policy. */ +#define DB_LOCK_OLDEST 2 /* Abort oldest transaction. */ +#define DB_LOCK_RANDOM 3 /* Abort random transaction. */ +#define DB_LOCK_YOUNGEST 4 /* Abort youngest transaction. */ struct __db_env { int db_lorder; /* Byte order. */ @@ -233,6 +205,8 @@ struct __db_env { FILE *db_errfile; /* Error message file stream. */ const char *db_errpfx; /* Error message prefix. */ int db_verbose; /* Generate debugging messages. */ + int db_panic; /* Panic flag, callback function. */ + void (*db_paniccall) __P((DB_ENV *, int)); /* User paths. */ char *db_home; /* Database home. */ @@ -245,7 +219,7 @@ struct __db_env { /* Locking. */ DB_LOCKTAB *lk_info; /* Return from lock_open(). 
*/ - u_int8_t *lk_conflicts; /* Two dimensional conflict matrix. */ + const u_int8_t *lk_conflicts; /* Two dimensional conflict matrix. */ u_int32_t lk_modes; /* Number of lock modes in table. */ u_int32_t lk_max; /* Maximum number of locks. */ u_int32_t lk_detect; /* Deadlock detect on all conflicts. */ @@ -265,9 +239,25 @@ struct __db_env { int (*tx_recover) /* Dispatch function for recovery. */ __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + /* + * XA support. + * + * !!! + * Explicit representations of structures in queue.h. + * + * TAILQ_ENTRY(__db_env); + */ + struct { + struct __db_env *tqe_next; + struct __db_env **tqe_prev; + } links; + int xa_rmid; /* XA Resource Manager ID. */ + DB_TXN *xa_txn; /* XA Current transaction. */ + #define DB_ENV_APPINIT 0x01 /* Paths initialized by db_appinit(). */ -#define DB_ENV_STANDALONE 0x02 /* Test: freestanding environment. */ -#define DB_ENV_THREAD 0x04 /* DB_ENV is multi-threaded. */ +#define DB_ENV_CDB 0x02 /* Concurrent DB product. */ +#define DB_ENV_STANDALONE 0x04 /* Test: freestanding environment. */ +#define DB_ENV_THREAD 0x08 /* DB_ENV is multi-threaded. */ u_int32_t flags; /* Flags. */ }; @@ -275,7 +265,7 @@ struct __db_env { * Access methods. *******************************************************/ /* - * XXX + * !!! * Changes here must be reflected in java/src/com/sleepycat/db/Db.java. */ typedef enum { @@ -304,6 +294,8 @@ struct __db_info { /* Local heap allocation. */ void *(*db_malloc) __P((size_t)); + int (*dup_compare) /* Duplicate compare function. */ + __P((const DBT *, const DBT *)); /* Btree access method. */ u_int32_t bt_maxkey; /* Maximum keys per page. */ @@ -327,44 +319,51 @@ struct __db_info { #define DB_DELIMITER 0x0001 /* Recno: re_delim set. */ #define DB_DUP 0x0002 /* Btree, Hash: duplicate keys. */ -#define DB_FIXEDLEN 0x0004 /* Recno: fixed-length records. */ -#define DB_PAD 0x0008 /* Recno: re_pad set. */ -#define DB_RECNUM 0x0010 /* Btree: record numbers. 
*/ -#define DB_RENUMBER 0x0020 /* Recno: renumber on insert/delete. */ -#define DB_SNAPSHOT 0x0040 /* Recno: snapshot the input. */ +#define DB_DUPSORT 0x0004 /* Btree, Hash: duplicate keys. */ +#define DB_FIXEDLEN 0x0008 /* Recno: fixed-length records. */ +#define DB_PAD 0x0010 /* Recno: re_pad set. */ +#define DB_RECNUM 0x0020 /* Btree: record numbers. */ +#define DB_RENUMBER 0x0040 /* Recno: renumber on insert/delete. */ +#define DB_SNAPSHOT 0x0080 /* Recno: snapshot the input. */ u_int32_t flags; }; /* - * DB access method and cursor operation codes. These are implemented as - * bit fields for future flexibility, but currently only a single one may - * be specified to any function. + * DB access method and cursor operation values. Each value is an operation + * code to which additional bit flags are added. */ -#define DB_AFTER 0x000001 /* c_put() */ -#define DB_APPEND 0x000002 /* put() */ -#define DB_BEFORE 0x000004 /* c_put() */ -#define DB_CHECKPOINT 0x000008 /* log_put(), log_get() */ -#define DB_CURRENT 0x000010 /* c_get(), c_put(), log_get() */ -#define DB_FIRST 0x000020 /* c_get(), log_get() */ -#define DB_FLUSH 0x000040 /* log_put() */ -#define DB_GET_RECNO 0x000080 /* get(), c_get() */ -#define DB_KEYFIRST 0x000100 /* c_put() */ -#define DB_KEYLAST 0x000200 /* c_put() */ -#define DB_LAST 0x000400 /* c_get(), log_get() */ -#define DB_NEXT 0x000800 /* c_get(), log_get() */ -#define DB_NOOVERWRITE 0x001000 /* put() */ -#define DB_NOSYNC 0x002000 /* close() */ -#define DB_PREV 0x004000 /* c_get(), log_get() */ -#define DB_RECORDCOUNT 0x008000 /* stat() */ -#define DB_SET 0x010000 /* c_get(), log_get() */ -#define DB_SET_RANGE 0x020000 /* c_get() */ -#define DB_SET_RECNO 0x040000 /* c_get() */ -#define DB_CURLSN 0x080000 /* log_put() */ +#define DB_AFTER 1 /* c_put() */ +#define DB_APPEND 2 /* put() */ +#define DB_BEFORE 3 /* c_put() */ +#define DB_CHECKPOINT 4 /* log_put(), log_get() */ +#define DB_CURLSN 5 /* log_put() */ +#define DB_CURRENT 6 /* c_get(), 
c_put(), log_get() */ +#define DB_FIRST 7 /* c_get(), log_get() */ +#define DB_FLUSH 8 /* log_put() */ +#define DB_GET_BOTH 9 /* get(), c_get() */ +#define DB_GET_RECNO 10 /* c_get() */ +#define DB_JOIN_ITEM 11 /* c_get(); do not do primary lookup */ +#define DB_KEYFIRST 12 /* c_put() */ +#define DB_KEYLAST 13 /* c_put() */ +#define DB_LAST 14 /* c_get(), log_get() */ +#define DB_NEXT 15 /* c_get(), log_get() */ +#define DB_NEXT_DUP 16 /* c_get() */ +#define DB_NOOVERWRITE 17 /* put() */ +#define DB_NOSYNC 18 /* close() */ +#define DB_PREV 19 /* c_get(), log_get() */ +#define DB_RECORDCOUNT 20 /* stat() */ +#define DB_SET 21 /* c_get(), log_get() */ +#define DB_SET_RANGE 22 /* c_get() */ +#define DB_SET_RECNO 23 /* get(), c_get() */ +#define DB_WRITELOCK 24 /* cursor() (internal) */ + +#define DB_OPFLAGS_MASK 0x1f /* Mask for operations flags. */ +#define DB_RMW 0x80000000 /* Acquire write flag immediately. */ /* * DB (user visible) error return codes. * - * XXX + * !!! * Changes to any of the user visible error return codes must be reflected * in java/src/com/sleepycat/db/Db.java. */ @@ -376,93 +375,84 @@ struct __db_info { #define DB_LOCK_NOTGRANTED ( -5) /* Lock unavailable, no-wait set. */ #define DB_LOCK_NOTHELD ( -6) /* Lock not held by locker. */ #define DB_NOTFOUND ( -7) /* Key/data pair not found (EOF). */ +#define DB_RUNRECOVERY ( -8) /* Panic return. */ /* DB (private) error return codes. */ -#define DB_DELETED ( -8) /* Recovery file marked deleted. */ -#define DB_NEEDSPLIT ( -9) /* Page needs to be split. */ -#define DB_REGISTERED (-10) /* Entry was previously registered. */ +#define DB_DELETED ( -9) /* Recovery file marked deleted. */ +#define DB_NEEDSPLIT (-10) /* Page needs to be split. */ #define DB_SWAPBYTES (-11) /* Database needs byte swapping. */ -#define DB_TXN_CKP (-12) /* Encountered ckp record in log. */ +#define DB_TXN_CKP (-12) /* Encountered ckp record in log. */ -struct __db_ilock { /* Internal DB access method lock. 
*/ - db_pgno_t pgno; /* Page being locked. */ - /* File id. */ - u_int8_t fileid[DB_FILE_ID_LEN]; -}; +#define DB_FILE_ID_LEN 20 /* DB file ID length. */ /* DB access method description structure. */ struct __db { void *mutexp; /* Synchronization for free threading */ + + /* Documented, returned information. */ DBTYPE type; /* DB access method. */ + int byteswapped; /* Database byte order is swapped. */ + DB_ENV *dbenv; /* DB_ENV structure. */ DB_ENV *mp_dbenv; /* DB_ENV for local mpool creation. */ - DB *master; /* Original DB created by db_open. */ void *internal; /* Access method private. */ DB_MPOOL *mp; /* The access method's mpool. */ DB_MPOOLFILE *mpf; /* The access method's mpool file. */ /* - * XXX + * !!! * Explicit representations of structures in queue.h. * - * TAILQ_HEAD(curs_queue, __dbc); + * TAILQ_HEAD(free_queue, __dbc); + * TAILQ_HEAD(active_queue, __dbc); */ struct { struct __dbc *tqh_first; struct __dbc **tqh_last; - } curs_queue; - - /* - * XXX - * Explicit representations of structures in queue.h. - * - * LIST_HEAD(handleq, __db); - * LIST_ENTRY(__db); - */ - struct { - struct __db *lh_first; - } handleq; /* List of handles for this DB. */ + } free_queue; struct { - struct __db *le_next; - struct __db **le_prev; - } links; /* Links for the handle list. */ + struct __dbc *tqh_first; + struct __dbc **tqh_last; + } active_queue; + u_int8_t fileid[DB_FILE_ID_LEN]; /* Uniquely identify this file for + locking. */ u_int32_t log_fileid; /* Logging file id. */ - - DB_TXN *txn; /* Current transaction. */ - u_int32_t locker; /* Default process' locker id. */ - DBT lock_dbt; /* DBT referencing lock. */ - struct __db_ilock lock; /* Lock. */ - size_t pgsize; /* Logical page size of file. */ /* Local heap allocation. */ void *(*db_malloc) __P((size_t)); + int (*dup_compare) /* Duplicate compare function. */ + __P((const DBT *, const DBT *)); + u_int32_t (*h_hash) /* Hash function. */ + __P((const void *, u_int32_t)); /* Functions. 
*/ + int (*am_close) __P((DB *)); int (*close) __P((DB *, u_int32_t)); - int (*cursor) __P((DB *, DB_TXN *, DBC **)); + int (*cursor) __P((DB *, DB_TXN *, DBC **, u_int32_t)); int (*del) __P((DB *, DB_TXN *, DBT *, u_int32_t)); int (*fd) __P((DB *, int *)); int (*get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); + int (*join) __P((DB *, DBC **, u_int32_t, DBC **)); int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); int (*stat) __P((DB *, void *, void *(*)(size_t), u_int32_t)); int (*sync) __P((DB *, u_int32_t)); -#define DB_AM_DUP 0x000001 /* DB_DUP (internal). */ -#define DB_AM_INMEM 0x000002 /* In-memory; no sync on close. */ -#define DB_AM_LOCKING 0x000004 /* Perform locking. */ -#define DB_AM_LOGGING 0x000008 /* Perform logging. */ -#define DB_AM_MLOCAL 0x000010 /* Database memory pool is local. */ -#define DB_AM_PGDEF 0x000020 /* Page size was defaulted. */ -#define DB_AM_RDONLY 0x000040 /* Database is readonly. */ -#define DB_AM_RECOVER 0x000080 /* In recovery (do not log or lock). */ +#define DB_AM_CDB 0x000001 /* Concurrent Access Methods. */ +#define DB_AM_DUP 0x000002 /* DB_DUP (internal). */ +#define DB_AM_INMEM 0x000004 /* In-memory; no sync on close. */ +#define DB_AM_LOCKING 0x000008 /* Perform locking. */ +#define DB_AM_LOGGING 0x000010 /* Perform logging. */ +#define DB_AM_MLOCAL 0x000020 /* Database memory pool is local. */ +#define DB_AM_PGDEF 0x000040 /* Page size was defaulted. */ +#define DB_AM_RDONLY 0x000080 /* Database is readonly. */ #define DB_AM_SWAP 0x000100 /* Pages need to be byte-swapped. */ #define DB_AM_THREAD 0x000200 /* DB is multi-threaded. */ -#define DB_BT_RECNUM 0x000400 /* DB_RECNUM (internal) */ -#define DB_HS_DIRTYMETA 0x000800 /* Hash: Metadata page modified. */ +#define DB_BT_RECNUM 0x000400 /* DB_RECNUM (internal). */ +#define DB_DBM_ERROR 0x000800 /* Error in DBM/NDBM database. */ #define DB_RE_DELIMITER 0x001000 /* DB_DELIMITER (internal). */ #define DB_RE_FIXEDLEN 0x002000 /* DB_FIXEDLEN (internal). 
*/ #define DB_RE_PAD 0x004000 /* DB_PAD (internal). */ @@ -471,13 +461,18 @@ struct __db { u_int32_t flags; }; +struct __db_ilock { /* Internal DB access method lock. */ + db_pgno_t pgno; /* Page being locked. */ + u_int8_t fileid[DB_FILE_ID_LEN];/* File id. */ +}; + /* Cursor description structure. */ struct __dbc { DB *dbp; /* Related DB access method. */ DB_TXN *txn; /* Associated transaction. */ /* - * XXX + * !!! * Explicit representations of structures in queue.h. * * TAILQ_ENTRY(__dbc); @@ -487,12 +482,30 @@ struct __dbc { struct __dbc **tqe_prev; } links; + u_int32_t lid; /* Default process' locker id. */ + u_int32_t locker; /* Locker for this operation. */ + DBT lock_dbt; /* DBT referencing lock. */ + DB_LOCK_ILOCK lock; /* Object to be locked. */ + DB_LOCK mylock; /* Lock held on this cursor. */ + + DBT rkey; /* Returned key. */ + DBT rdata; /* Returned data. */ + + int (*c_am_close) __P((DBC *)); + int (*c_am_destroy) __P((DBC *)); + int (*c_close) __P((DBC *)); + int (*c_del) __P((DBC *, u_int32_t)); + int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t)); + int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t)); + void *internal; /* Access method private. */ - int (*c_close) __P((DBC *)); - int (*c_del) __P((DBC *, u_int32_t)); - int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t)); - int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t)); +#define DBC_CONTINUE 0x001 /* Continue dup search: next item. */ +#define DBC_KEYSET 0x002 /* Continue dup search: current item. */ +#define DBC_RECOVER 0x004 /* In recovery (do not log or lock). */ +#define DBC_RMW 0x008 /* Acquire write flag in read op. */ +#define DBC_WRITER 0x010 /* Cursor immediately writing (CDB). */ + u_int32_t flags; }; /* Btree/recno statistics structure. */ @@ -510,24 +523,36 @@ struct __db_bt_stat { u_int32_t bt_dup_pg; /* Duplicate pages. */ u_int32_t bt_over_pg; /* Overflow pages. */ u_int32_t bt_free; /* Pages on the free list. */ - u_int32_t bt_freed; /* Pages freed for reuse. 
*/ u_int32_t bt_int_pgfree; /* Bytes free in internal pages. */ u_int32_t bt_leaf_pgfree; /* Bytes free in leaf pages. */ u_int32_t bt_dup_pgfree; /* Bytes free in duplicate pages. */ u_int32_t bt_over_pgfree; /* Bytes free in overflow pages. */ - u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ - u_int32_t bt_split; /* Total number of splits. */ - u_int32_t bt_rootsplit; /* Root page splits. */ - u_int32_t bt_fastsplit; /* Fast splits. */ - u_int32_t bt_added; /* Items added. */ - u_int32_t bt_deleted; /* Items deleted. */ - u_int32_t bt_get; /* Items retrieved. */ - u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ - u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ u_int32_t bt_magic; /* Magic number. */ u_int32_t bt_version; /* Version number. */ }; +/* Hash statistics structure. */ +struct __db_h_stat { + u_int32_t hash_accesses; /* Number of accesses to this table. */ + u_int32_t hash_collisions; /* Number of collisions on search. */ + u_int32_t hash_expansions; /* Number of times we added a bucket. */ + u_int32_t hash_overflows; /* Number of overflow pages. */ + u_int32_t hash_bigpages; /* Number of big key/data pages. */ + u_int32_t hash_dup; /* Number of dup pages. */ + u_int32_t hash_free; /* Pages on the free list. */ + u_int32_t hash_bfree; /* Bytes free on bucket pages. */ + u_int32_t hash_dup_free; /* Bytes free on duplicate pages. */ + u_int32_t hash_big_bfree; /* Bytes free on big item pages. */ + u_int32_t hash_buckets; /* Number of hash buckets. */ + u_int32_t hash_put; /* Number of puts. */ + u_int32_t hash_deleted; /* Number of deletes. */ + u_int32_t hash_get; /* Number of gets. */ + u_int32_t hash_magic; /* Magic number. */ + u_int32_t hash_version; /* Version number. */ + u_int32_t hash_pagesize; /* Page size. */ + u_int32_t hash_nrecs; /* Number of records. 
*/ +}; + #if defined(__cplusplus) extern "C" { #endif @@ -538,6 +563,8 @@ int db_open __P((const char *, DBTYPE, u_int32_t, int, DB_ENV *, DB_INFO *, DB **)); int db_value_set __P((int, int)); char *db_version __P((int *, int *, int *)); +int db_xa_open __P((const char *, + DBTYPE, u_int32_t, int, DB_INFO *, DB **)); #if defined(__cplusplus) } #endif @@ -548,8 +575,10 @@ char *db_version __P((int *, int *, int *)); #define DB_LOCKVERSION 1 #define DB_LOCKMAGIC 0x090193 -/* Flag values for lock_vec(). */ +/* Flag values for lock_vec(), lock_get(). */ #define DB_LOCK_NOWAIT 0x01 /* Don't wait on unavailable lock. */ +#define DB_LOCK_UPGRADE 0x02 /* Upgrade an existing lock instead + of granting a new one (internal). */ /* Flag values for lock_detect(). */ #define DB_LOCK_CONFLICT 0x01 /* Run on any conflict. */ @@ -557,12 +586,13 @@ char *db_version __P((int *, int *, int *)); /* * Request types. * - * XXX + * !!! * Changes here must be reflected in java/src/com/sleepycat/db/Db.java. */ typedef enum { DB_LOCK_DUMP=0, /* Display held locks. */ DB_LOCK_GET, /* Get the lock. */ + DB_LOCK_INHERIT, /* Pass locks to parent. */ DB_LOCK_PUT, /* Release the lock. */ DB_LOCK_PUT_ALL, /* Release locker's locks. */ DB_LOCK_PUT_OBJ /* Release locker's locks on obj. */ @@ -571,15 +601,20 @@ typedef enum { /* * Simple R/W lock modes and for multi-granularity intention locking. * - * XXX + * !!! + * These values are NOT random, as they are used as an index into the lock + * conflicts arrays, i.e., DB_LOCK_IWRITE must be == 3, and DB_LOCK_IREAD + * must be == 4. + * + * !!! * Changes here must be reflected in java/src/com/sleepycat/db/Db.java. */ typedef enum { DB_LOCK_NG=0, /* Not granted. */ DB_LOCK_READ, /* Shared/read. */ DB_LOCK_WRITE, /* Exclusive/write. */ - DB_LOCK_IREAD, /* Intent to share/read. */ DB_LOCK_IWRITE, /* Intent exclusive/write. */ + DB_LOCK_IREAD, /* Intent to share/read. */ DB_LOCK_IWR /* Intent to read and write. 
*/ } db_lockmode_t; @@ -647,10 +682,14 @@ int lock_id __P((DB_LOCKTAB *, u_int32_t *)); int lock_open __P((const char *, u_int32_t, int, DB_ENV *, DB_LOCKTAB **)); int lock_put __P((DB_LOCKTAB *, DB_LOCK)); +int lock_tget __P((DB_LOCKTAB *, + DB_TXN *, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *)); int lock_stat __P((DB_LOCKTAB *, DB_LOCK_STAT **, void *(*)(size_t))); int lock_unlink __P((const char *, int, DB_ENV *)); int lock_vec __P((DB_LOCKTAB *, u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **)); +int lock_tvec __P((DB_LOCKTAB *, + DB_TXN *, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **)); #if defined(__cplusplus) } #endif @@ -890,6 +929,7 @@ typedef struct { * 4BSD replaced the dbm interface with ndbm, and are not support here. */ #define dbminit(a) __db_dbm_init(a) +#define dbmclose __db_dbm_close #if !defined(__cplusplus) #define delete(a) __db_dbm_delete(a) #endif @@ -902,12 +942,13 @@ typedef struct { #if defined(__cplusplus) extern "C" { #endif -int __db_dbm_init __P((char *)); -int __db_dbm_delete __P((datum)); +int __db_dbm_close __P((void)); int __db_dbm_dbrdonly __P((void)); +int __db_dbm_delete __P((datum)); int __db_dbm_dirf __P((void)); datum __db_dbm_fetch __P((datum)); datum __db_dbm_firstkey __P((void)); +int __db_dbm_init __P((char *)); datum __db_dbm_nextkey __P((datum)); int __db_dbm_pagf __P((void)); int __db_dbm_store __P((datum, datum)); diff --git a/db2/db/db.c b/db2/db/db.c index 70c6c5443b..2b4c270324 100644 --- a/db2/db/db.c +++ b/db2/db/db.c @@ -44,7 +44,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db.c 10.57 (Sleepycat) 5/7/98"; +static const char sccsid[] = "@(#)db.c 10.75 (Sleepycat) 12/3/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -67,9 +67,6 @@ static const char sccsid[] = "@(#)db.c 10.57 (Sleepycat) 5/7/98"; #include "db_am.h" #include "common_ext.h" -static int db_close __P((DB *, u_int32_t)); -static int db_fd __P((DB *, int *)); - /* * If the metadata page has the flag set, 
set the local flag. If the page * does NOT have the flag set, return EINVAL if the user's dbinfo argument @@ -87,11 +84,6 @@ static int db_fd __P((DB *, int *)); } \ } -#ifdef _LIBC -#define db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) \ - __nss_db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) -#endif - /* * db_open -- * Main library interface to the DB access methods. @@ -141,9 +133,10 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) /* * Specifying a cachesize to db_open(3), after creating an - * environment, is a common mistake. + * environment with DB_INIT_MPOOL, is a common mistake. */ - if (dbinfo != NULL && dbinfo->db_cachesize != 0) { + if (dbenv->mp_info != NULL && + dbinfo != NULL && dbinfo->db_cachesize != 0) { __db_err(dbenv, "cachesize will be ignored if environment exists"); return (EINVAL); @@ -156,12 +149,16 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) real_name = NULL; /* Allocate the DB structure, reference the DB_ENV structure. */ - if ((dbp = (DB *)__db_calloc(1, sizeof(DB))) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - return (ENOMEM); - } + if ((ret = __os_calloc(1, sizeof(DB), &dbp)) != 0) + return (ret); dbp->dbenv = dbenv; + /* Random initialization. */ + TAILQ_INIT(&dbp->free_queue); + TAILQ_INIT(&dbp->active_queue); + if ((ret = __db_init_wrapper(dbp)) != 0) + goto err; + /* Convert the db_open(3) flags. */ if (LF_ISSET(DB_RDONLY)) F_SET(dbp, DB_AM_RDONLY); @@ -192,21 +189,16 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) } /* - * Always set the master and initialize the queues, so we can - * use these fields without checking the thread bit. - */ - dbp->master = dbp; - LIST_INIT(&dbp->handleq); - LIST_INSERT_HEAD(&dbp->handleq, dbp, links); - TAILQ_INIT(&dbp->curs_queue); - - /* * Set based on the dbenv fields, although no logging or transactions * are possible for temporary files. 
*/ if (dbenv != NULL) { - if (dbenv->lk_info != NULL) - F_SET(dbp, DB_AM_LOCKING); + if (dbenv->lk_info != NULL) { + if (F_ISSET(dbenv, DB_ENV_CDB)) + F_SET(dbp, DB_AM_CDB); + else + F_SET(dbp, DB_AM_LOCKING); + } if (fname != NULL && dbenv->lg_info != NULL) F_SET(dbp, DB_AM_LOGGING); } @@ -215,9 +207,29 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) if (dbinfo == NULL) { dbp->pgsize = 0; dbp->db_malloc = NULL; + dbp->dup_compare = NULL; } else { + /* + * We don't want anything that's not a power-of-2, as we rely + * on that for alignment of various types on the pages. + */ + if ((dbp->pgsize = dbinfo->db_pagesize) != 0 && + (u_int32_t)1 << __db_log2(dbp->pgsize) != dbp->pgsize) { + __db_err(dbenv, "page sizes must be a power-of-2"); + goto einval; + } dbp->pgsize = dbinfo->db_pagesize; dbp->db_malloc = dbinfo->db_malloc; + if (F_ISSET(dbinfo, DB_DUPSORT)) { + if (F_ISSET(dbinfo, DB_DUP)) + dbp->dup_compare = dbinfo->dup_compare == NULL ? + __bam_defcmp : dbinfo->dup_compare; + else { + __db_err(dbenv, "DB_DUPSORT requires DB_DUP"); + goto einval; + } + F_CLR(dbinfo, DB_DUPSORT); + } } /* Fill in the default file mode. */ @@ -235,6 +247,7 @@ db_open(fname, type, flags, mode, dbenv, dbinfo, dbpp) default: goto err; } + dbp->byteswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0; /* * If we have a file name, try and read the first page, figure out @@ -289,7 +302,7 @@ open_retry: if (LF_ISSET(DB_CREATE)) { * sizes, we limit the default pagesize to 16K. */ if (dbp->pgsize == 0) { - if ((ret = __db_ioinfo(real_name, + if ((ret = __os_ioinfo(real_name, fd, NULL, NULL, &iopsize)) != 0) { __db_err(dbenv, "%s: %s", real_name, strerror(ret)); @@ -299,6 +312,14 @@ open_retry: if (LF_ISSET(DB_CREATE)) { iopsize = 512; if (iopsize > 16 * 1024) iopsize = 16 * 1024; + + /* + * Sheer paranoia, but we don't want anything that's + * not a power-of-2, as we rely on that for alignment + * of various types on the pages. 
+ */ + DB_ROUNDOFF(iopsize, 512); + dbp->pgsize = iopsize; F_SET(dbp, DB_AM_PGDEF); } @@ -308,11 +329,11 @@ open_retry: if (LF_ISSET(DB_CREATE)) { * that the meta-data for all access methods fits in 512 * bytes, and that no database will be smaller than that. */ - if ((ret = __db_read(fd, mbuf, sizeof(mbuf), &nr)) != 0) + if ((ret = __os_read(fd, mbuf, sizeof(mbuf), &nr)) != 0) goto err; /* The fd is no longer needed. */ - (void)__db_close(fd); + (void)__os_close(fd); fd = -1; if (nr != sizeof(mbuf)) { @@ -337,7 +358,7 @@ open_retry: if (LF_ISSET(DB_CREATE)) { */ if (retry_cnt++ < 3 && !LF_ISSET(DB_CREATE | DB_TRUNCATE)) { - __db_sleep(1, 0); + __os_sleep(1, 0); goto open_retry; } if (type == DB_UNKNOWN) { @@ -396,7 +417,7 @@ retry: switch (((BTMETA *)mbuf)->magic) { /* Copy the file's unique id. */ need_fileid = 0; - memcpy(dbp->lock.fileid, btm->uid, DB_FILE_ID_LEN); + memcpy(dbp->fileid, btm->uid, DB_FILE_ID_LEN); break; case DB_HASHMAGIC: if (type != DB_HASH && type != DB_UNKNOWN) @@ -425,7 +446,7 @@ retry: switch (((BTMETA *)mbuf)->magic) { /* Copy the file's unique id. 
*/ need_fileid = 0; - memcpy(dbp->lock.fileid, hashm->uid, DB_FILE_ID_LEN); + memcpy(dbp->fileid, hashm->uid, DB_FILE_ID_LEN); break; default: if (swapped) { @@ -489,11 +510,9 @@ empty: /* F_SET(dbp, DB_AM_MLOCAL); if (dbenv == NULL) { - if ((dbp->mp_dbenv = - (DB_ENV *)__db_calloc(sizeof(DB_ENV), 1)) == NULL) { - ret = ENOMEM; + if ((ret = __os_calloc(1, + sizeof(DB_ENV), &dbp->mp_dbenv)) != 0) goto err; - } envp = dbp->mp_dbenv; restore = 0; @@ -554,20 +573,20 @@ empty: /* */ if (need_fileid) { if (fname == NULL) { - memset(dbp->lock.fileid, 0, DB_FILE_ID_LEN); + memset(dbp->fileid, 0, DB_FILE_ID_LEN); if (F_ISSET(dbp, DB_AM_LOCKING) && (ret = lock_id(dbenv->lk_info, - (u_int32_t *)dbp->lock.fileid)) != 0) + (u_int32_t *)dbp->fileid)) != 0) goto err; } else - if ((ret = __db_fileid(dbenv, - real_name, 1, dbp->lock.fileid)) != 0) + if ((ret = __os_fileid(dbenv, + real_name, 1, dbp->fileid)) != 0) goto err; } /* No further use for the real name. */ if (real_name != NULL) - FREES(real_name); + __os_freestr(real_name); real_name = NULL; /* @@ -595,7 +614,7 @@ empty: /* memset(&finfo, 0, sizeof(finfo)); finfo.ftype = ftype; finfo.pgcookie = &pgcookie; - finfo.fileid = dbp->lock.fileid; + finfo.fileid = dbp->fileid; finfo.lsn_offset = 0; finfo.clear_len = DB_PAGE_CLEAR_LEN; if ((ret = memp_fopen(dbp->mp, fname, @@ -605,12 +624,21 @@ empty: /* /* * XXX - * Truly spectacular layering violation. We need a per-thread mutex - * that lives in shared memory (thanks, HP-UX!) and so we acquire a - * pointer to the mpool one. + * We need a per-thread mutex that lives in shared memory -- HP-UX + * can't allocate mutexes in malloc'd memory. Allocate it from the + * shared memory region, since it's the only one that is guaranteed + * to exist. 
*/ - if (F_ISSET(dbp, DB_AM_THREAD)) - dbp->mutexp = dbp->mpf->mutexp; + if (F_ISSET(dbp, DB_AM_THREAD)) { + if ((ret = __memp_reg_alloc(dbp->mp, + sizeof(db_mutex_t), NULL, &dbp->mutexp)) != 0) + goto err; + /* + * Since we only get here if DB_THREAD was specified, we know + * we have spinlocks and no file offset argument is needed. + */ + (void)__db_mutex_init(dbp->mutexp, 0); + } /* Get a log file id. */ if (F_ISSET(dbp, DB_AM_LOGGING) && @@ -618,18 +646,6 @@ empty: /* dbp, fname, type, &dbp->log_fileid)) != 0) goto err; - /* - * Get a locker id for this DB, and build the lock cookie: the first - * db_pgno_t bytes are the page number, the next N bytes are the file - * id. - */ - if (F_ISSET(dbp, DB_AM_LOCKING)) { - if ((ret = lock_id(dbenv->lk_info, &dbp->locker)) != 0) - goto err; - dbp->lock_dbt.size = sizeof(dbp->lock); - dbp->lock_dbt.data = &dbp->lock; - } - /* Call the real open function. */ switch (type) { case DB_BTREE: @@ -639,7 +655,7 @@ empty: /* if (dbinfo != NULL && (ret = __db_fcchk(dbenv, "db_open", dbinfo->flags, DB_DUP, DB_RECNUM)) != 0) goto err; - if ((ret = __bam_open(dbp, type, dbinfo)) != 0) + if ((ret = __bam_open(dbp, dbinfo)) != 0) goto err; break; case DB_HASH: @@ -655,24 +671,20 @@ empty: /* if (dbinfo != NULL && (ret = __db_fchk(dbenv, "db_open", dbinfo->flags, DB_INFO_FLAGS)) != 0) goto err; - if ((ret = __ram_open(dbp, type, dbinfo)) != 0) + if ((ret = __ram_open(dbp, dbinfo)) != 0) goto err; break; default: abort(); } - /* Call a local close routine. */ - dbp->close = db_close; - dbp->fd = db_fd; - *dbpp = dbp; return (0); einval: ret = EINVAL; err: /* Close the file descriptor. */ if (fd != -1) - (void)__db_close(fd); + (void)__os_close(fd); /* Discard the log file id. */ if (dbp->log_fileid != 0) @@ -688,90 +700,60 @@ err: /* Close the file descriptor. */ /* If we allocated a DB_ENV, discard it. 
*/ if (dbp->mp_dbenv != NULL) - FREE(dbp->mp_dbenv, sizeof(DB_ENV)); + __os_free(dbp->mp_dbenv, sizeof(DB_ENV)); if (real_name != NULL) - FREES(real_name); + __os_freestr(real_name); if (dbp != NULL) - FREE(dbp, sizeof(DB)); + __os_free(dbp, sizeof(DB)); return (ret); } -#ifdef _LIBC -# undef db_open -weak_alias (__nss_db_open, db_open) -#endif - /* - * db_close -- + * __db_close -- * Close a DB tree. + * + * PUBLIC: int __db_close __P((DB *, u_int32_t)); */ -static int -db_close(dbp, flags) +int +__db_close(dbp, flags) DB *dbp; u_int32_t flags; { DBC *dbc; - DB *tdbp; int ret, t_ret; + DB_PANIC_CHECK(dbp); + /* Validate arguments. */ - if ((ret = __db_fchk(dbp->dbenv, "db_close", flags, DB_NOSYNC)) != 0) + if ((ret = __db_closechk(dbp, flags)) != 0) return (ret); /* Sync the underlying file. */ - if (!LF_ISSET(DB_NOSYNC) && + if (flags != DB_NOSYNC && (t_ret = dbp->sync(dbp, 0)) != 0 && ret == 0) ret = t_ret; /* - * Call the underlying access method close routine for all the - * cursors and handles. + * Go through the active cursors and call the cursor recycle routine, + * which resolves pending operations and moves the cursors onto the + * free list. Then, walk the free list and call the cursor destroy + * routine. 
*/ - for (tdbp = LIST_FIRST(&dbp->handleq); - tdbp != NULL; tdbp = LIST_NEXT(tdbp, links)) { - while ((dbc = TAILQ_FIRST(&tdbp->curs_queue)) != NULL) - switch (tdbp->type) { - case DB_BTREE: - if ((t_ret = - __bam_c_iclose(tdbp, dbc)) != 0 && ret == 0) - ret = t_ret; - break; - case DB_HASH: - if ((t_ret = - __ham_c_iclose(tdbp, dbc)) != 0 && ret == 0) - ret = t_ret; - break; - case DB_RECNO: - if ((t_ret = - __ram_c_iclose(tdbp, dbc)) != 0 && ret == 0) - ret = t_ret; - break; - default: - abort(); - } - - switch (tdbp->type) { - case DB_BTREE: - if ((t_ret = __bam_close(tdbp)) != 0 && ret == 0) - ret = t_ret; - break; - case DB_HASH: - if ((t_ret = __ham_close(tdbp)) != 0 && ret == 0) - ret = t_ret; - break; - case DB_RECNO: - if ((t_ret = __ram_close(tdbp)) != 0 && ret == 0) - ret = t_ret; - break; - default: - abort(); - } - } + while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) + if ((t_ret = __db_c_destroy(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Call the access specific close function. */ + if ((t_ret = dbp->am_close(dbp)) != 0 && ret == 0) + ret = t_ret; /* Sync the memory pool. */ - if (!LF_ISSET(DB_NOSYNC) && (t_ret = memp_fsync(dbp->mpf)) != 0 && + if (flags != DB_NOSYNC && (t_ret = memp_fsync(dbp->mpf)) != 0 && t_ret != DB_INCOMPLETE && ret == 0) ret = t_ret; @@ -788,91 +770,12 @@ db_close(dbp, flags) if (F_ISSET(dbp, DB_AM_LOGGING)) (void)log_unregister(dbp->dbenv->lg_info, dbp->log_fileid); - /* Discard the lock cookie for all handles. */ - for (tdbp = LIST_FIRST(&dbp->handleq); - tdbp != NULL; tdbp = LIST_NEXT(tdbp, links)) - if (F_ISSET(tdbp, DB_AM_LOCKING)) { -#ifdef DEBUG - DB_LOCKREQ request; - - /* - * If we're running tests, display any locks currently - * held. 
It's possible that some applications may hold - * locks for long periods, e.g., conference room locks, - * but the DB tests should never close holding locks. - */ - request.op = DB_LOCK_DUMP; - if ((t_ret = lock_vec(tdbp->dbenv->lk_info, - tdbp->locker, 0, &request, 1, NULL)) != 0 && - ret == 0) - ret = EAGAIN; -#endif - } - /* If we allocated a DB_ENV, discard it. */ if (dbp->mp_dbenv != NULL) - FREE(dbp->mp_dbenv, sizeof(DB_ENV)); + __os_free(dbp->mp_dbenv, sizeof(DB_ENV)); - /* Free all of the DB's. */ - LIST_REMOVE(dbp, links); - while ((tdbp = LIST_FIRST(&dbp->handleq)) != NULL) { - LIST_REMOVE(tdbp, links); - FREE(tdbp, sizeof(*tdbp)); - } - FREE(dbp, sizeof(*dbp)); + /* Free the DB. */ + __os_free(dbp, sizeof(*dbp)); return (ret); } - -/* - * db_fd -- - * Return a file descriptor for flock'ing. - */ -static int -db_fd(dbp, fdp) - DB *dbp; - int *fdp; -{ - /* - * XXX - * Truly spectacular layering violation. - */ - return (__mp_xxx_fd(dbp->mpf, fdp)); -} - -/* - * __db_pgerr -- - * Error when unable to retrieve a specified page. - * - * PUBLIC: int __db_pgerr __P((DB *, db_pgno_t)); - */ -int -__db_pgerr(dbp, pgno) - DB *dbp; - db_pgno_t pgno; -{ - /* - * Three things are certain: - * Death, taxes, and lost data. - * Guess which has occurred. - */ - __db_err(dbp->dbenv, - "unable to create/retrieve page %lu", (u_long)pgno); - return (__db_panic(dbp)); -} - -/* - * __db_pgfmt -- - * Error when a page has the wrong format. - * - * PUBLIC: int __db_pgfmt __P((DB *, db_pgno_t)); - */ -int -__db_pgfmt(dbp, pgno) - DB *dbp; - db_pgno_t pgno; -{ - __db_err(dbp->dbenv, - "page %lu: illegal page type or format", (u_long)pgno); - return (__db_panic(dbp)); -} diff --git a/db2/db/db.src b/db2/db/db.src index 91d8b390a1..26557e10ac 100644 --- a/db2/db/db.src +++ b/db2/db/db.src @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. 
* - * @(#)db.src 10.6 (Sleepycat) 4/28/98 + * @(#)db.src 10.8 (Sleepycat) 9/20/98 */ PREFIX db @@ -98,6 +98,7 @@ END /* * relink -- Handles relinking around a page. * + * opcode: indicates if this is an addpage or delete page * pgno: the page being changed. * lsn the page's original lsn. * prev: the previous page. @@ -106,6 +107,7 @@ END * lsn_next: the previous page's original lsn. */ BEGIN relink +ARG opcode u_int32_t lu ARG fileid u_int32_t lu ARG pgno db_pgno_t lu POINTER lsn DB_LSN * lu @@ -148,12 +150,3 @@ DBT key DBT s DBT data DBT s ARG arg_flags u_int32_t lu END - -/* - * noop -- do nothing, but get an LSN. - */ -BEGIN noop -ARG fileid u_int32_t lu -ARG pgno db_pgno_t lu -POINTER prevlsn DB_LSN * lu -END diff --git a/db2/db/db_am.c b/db2/db/db_am.c new file mode 100644 index 0000000000..e02ad57f53 --- /dev/null +++ b/db2/db/db_am.c @@ -0,0 +1,430 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_am.c 10.15 (Sleepycat) 12/30/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "db_page.h" +#include "db_shash.h" +#include "mp.h" +#include "btree.h" +#include "hash.h" +#include "db_am.h" +#include "db_ext.h" + +static int __db_c_close __P((DBC *)); +static int __db_cursor __P((DB *, DB_TXN *, DBC **, u_int32_t)); +static int __db_fd __P((DB *, int *)); +static int __db_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); +static int __db_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); + +/* + * __db_init_wrapper -- + * Wrapper layer to implement generic DB functions. 
+ * + * PUBLIC: int __db_init_wrapper __P((DB *)); + */ +int +__db_init_wrapper(dbp) + DB *dbp; +{ + dbp->close = __db_close; + dbp->cursor = __db_cursor; + dbp->del = NULL; /* !!! Must be set by access method. */ + dbp->fd = __db_fd; + dbp->get = __db_get; + dbp->join = __db_join; + dbp->put = __db_put; + dbp->stat = NULL; /* !!! Must be set by access method. */ + dbp->sync = __db_sync; + + return (0); +} + +/* + * __db_cursor -- + * Allocate and return a cursor. + */ +static int +__db_cursor(dbp, txn, dbcp, flags) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; + u_int32_t flags; +{ + DBC *dbc, *adbc; + int ret; + db_lockmode_t mode; + u_int32_t op; + + DB_PANIC_CHECK(dbp); + + /* Take one from the free list if it's available. */ + DB_THREAD_LOCK(dbp); + if ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) + TAILQ_REMOVE(&dbp->free_queue, dbc, links); + else { + DB_THREAD_UNLOCK(dbp); + + if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0) + return (ret); + + dbc->dbp = dbp; + dbc->c_close = __db_c_close; + + /* Set up locking information. */ + if (F_ISSET(dbp, DB_AM_LOCKING | DB_AM_CDB)) { + /* + * If we are not threaded, then there is no need to + * create new locker ids. We know that no one else + * is running concurrently using this DB, so we can + * take a peek at any cursors on the active queue. 
+ */ + if (!F_ISSET(dbp, DB_AM_THREAD) && + (adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) + dbc->lid = adbc->lid; + else + if ((ret = lock_id(dbp->dbenv->lk_info, + &dbc->lid)) != 0) + goto err; + + memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN); + if (F_ISSET(dbp, DB_AM_CDB)) { + dbc->lock_dbt.size = DB_FILE_ID_LEN; + dbc->lock_dbt.data = dbc->lock.fileid; + } else { + dbc->lock_dbt.size = sizeof(dbc->lock); + dbc->lock_dbt.data = &dbc->lock; + } + } + + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + if ((ret = __bam_c_init(dbc)) != 0) + goto err; + break; + case DB_HASH: + if ((ret = __ham_c_init(dbc)) != 0) + goto err; + break; + default: + ret = EINVAL; + goto err; + } + + DB_THREAD_LOCK(dbp); + } + + if ((dbc->txn = txn) == NULL) + dbc->locker = dbc->lid; + else + dbc->locker = txn->txnid; + + TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* + * If this is the concurrent DB product, then we do all locking + * in the interface, which is right here. + */ + if (F_ISSET(dbp, DB_AM_CDB)) { + op = LF_ISSET(DB_OPFLAGS_MASK); + mode = (op == DB_WRITELOCK) ? DB_LOCK_WRITE : + (LF_ISSET(DB_RMW) ? DB_LOCK_IWRITE : DB_LOCK_READ); + if ((ret = lock_get(dbp->dbenv->lk_info, dbc->locker, 0, + &dbc->lock_dbt, mode, &dbc->mylock)) != 0) { + (void)__db_c_close(dbc); + return (EAGAIN); + } + if (LF_ISSET(DB_RMW)) + F_SET(dbc, DBC_RMW); + if (op == DB_WRITELOCK) + F_SET(dbc, DBC_WRITER); + } + + *dbcp = dbc; + return (0); + +err: __os_free(dbc, sizeof(*dbc)); + return (ret); +} + +/* + * __db_c_close -- + * Close the cursor (recycle for later use). + */ +static int +__db_c_close(dbc) + DBC *dbc; +{ + DB *dbp; + int ret, t_ret; + + dbp = dbc->dbp; + + DB_PANIC_CHECK(dbp); + + ret = 0; + + /* + * We cannot release the lock until after we've called the + * access method specific routine, since btrees may have pending + * deletes. + */ + + /* Remove the cursor from the active queue. 
*/ + DB_THREAD_LOCK(dbp); + TAILQ_REMOVE(&dbp->active_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* Call the access specific cursor close routine. */ + if ((t_ret = dbc->c_am_close(dbc)) != 0 && ret == 0) + t_ret = ret; + + /* Release the lock. */ + if (F_ISSET(dbc->dbp, DB_AM_CDB) && dbc->mylock != LOCK_INVALID) { + ret = lock_put(dbc->dbp->dbenv->lk_info, dbc->mylock); + dbc->mylock = LOCK_INVALID; + } + + /* Clean up the cursor. */ + dbc->flags = 0; + +#ifdef DEBUG + /* + * Check for leftover locks, unless we're running with transactions. + * + * If we're running tests, display any locks currently held. It's + * possible that some applications may hold locks for long periods, + * e.g., conference room locks, but the DB tests should never close + * holding locks. + */ + if (F_ISSET(dbp, DB_AM_LOCKING) && dbc->lid == dbc->locker) { + DB_LOCKREQ request; + + request.op = DB_LOCK_DUMP; + if ((t_ret = lock_vec(dbp->dbenv->lk_info, + dbc->locker, 0, &request, 1, NULL)) != 0 && ret == 0) + ret = EAGAIN; + } +#endif + /* Move the cursor to the free queue. */ + DB_THREAD_LOCK(dbp); + TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + return (ret); +} + +#ifdef DEBUG +/* + * __db_cprint -- + * Display the current cursor list. + * + * PUBLIC: int __db_cprint __P((DB *)); + */ +int +__db_cprint(dbp) + DB *dbp; +{ + static const FN fn[] = { + { DBC_RECOVER, "recover" }, + { DBC_RMW, "read-modify-write" }, + { 0 }, + }; + DBC *dbc; + + DB_THREAD_LOCK(dbp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + fprintf(stderr, + "%#0x: dbp: %#0x txn: %#0x lid: %lu locker: %lu", + (u_int)dbc, (u_int)dbc->dbp, (u_int)dbc->txn, + (u_long)dbc->lid, (u_long)dbc->locker); + __db_prflags(dbc->flags, fn, stderr); + fprintf(stderr, "\n"); + } + DB_THREAD_UNLOCK(dbp); + + return (0); +} +#endif /* DEBUG */ + +/* + * __db_c_destroy -- + * Destroy the cursor. 
+ * + * PUBLIC: int __db_c_destroy __P((DBC *)); + */ +int +__db_c_destroy(dbc) + DBC *dbc; +{ + DB *dbp; + int ret; + + dbp = dbc->dbp; + + /* Remove the cursor from the free queue. */ + DB_THREAD_LOCK(dbp); + TAILQ_REMOVE(&dbp->free_queue, dbc, links); + DB_THREAD_UNLOCK(dbp); + + /* Call the access specific cursor destroy routine. */ + ret = dbc->c_am_destroy == NULL ? 0 : dbc->c_am_destroy(dbc); + + /* Free up allocated memory. */ + if (dbc->rkey.data != NULL) + __os_free(dbc->rkey.data, dbc->rkey.ulen); + if (dbc->rdata.data != NULL) + __os_free(dbc->rdata.data, dbc->rdata.ulen); + __os_free(dbc, sizeof(*dbc)); + + return (0); +} + +/* + * db_fd -- + * Return a file descriptor for flock'ing. + */ +static int +__db_fd(dbp, fdp) + DB *dbp; + int *fdp; +{ + DB_PANIC_CHECK(dbp); + + /* + * XXX + * Truly spectacular layering violation. + */ + return (__mp_xxx_fd(dbp->mpf, fdp)); +} + +/* + * __db_get -- + * Return a key/data pair. + */ +static int +__db_get(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + int ret, t_ret; + + DB_PANIC_CHECK(dbp); + + if ((ret = __db_getchk(dbp, key, data, flags)) != 0) + return (ret); + + if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0) + return (ret); + + DEBUG_LREAD(dbc, txn, "__db_get", key, NULL, flags); + + ret = dbc->c_get(dbc, key, data, + flags == 0 || flags == DB_RMW ? flags | DB_SET : flags); + + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_put -- + * Store a key/data pair. 
+ */ +static int +__db_put(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + DBT tdata; + int ret, t_ret; + + DB_PANIC_CHECK(dbp); + + if ((ret = __db_putchk(dbp, key, data, + flags, F_ISSET(dbp, DB_AM_RDONLY), F_ISSET(dbp, DB_AM_DUP))) != 0) + return (ret); + + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags); + + if (flags == DB_NOOVERWRITE) { + /* + * Set DB_DBT_USERMEM, this might be a threaded application and + * the flags checking will catch us. We don't want the actual + * data, so request a partial of length 0. + */ + memset(&tdata, 0, sizeof(tdata)); + F_SET(&tdata, DB_DBT_USERMEM | DB_DBT_PARTIAL); + if ((ret = dbc->c_get(dbc, key, &tdata, DB_SET | DB_RMW)) == 0) + ret = DB_KEYEXIST; + else + ret = 0; + } + if (ret == 0) + ret = dbc->c_put(dbc, key, data, DB_KEYLAST); + + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_sync -- + * Flush the database cache. + * + * PUBLIC: int __db_sync __P((DB *, u_int32_t)); + */ +int +__db_sync(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + int ret; + + DB_PANIC_CHECK(dbp); + + if ((ret = __db_syncchk(dbp, flags)) != 0) + return (ret); + + /* If it wasn't possible to modify the file, we're done. */ + if (F_ISSET(dbp, DB_AM_INMEM | DB_AM_RDONLY)) + return (0); + + /* Flush any dirty pages from the cache to the backing file. */ + if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) + ret = 0; + + return (ret); +} diff --git a/db2/db/db_auto.c b/db2/db/db_auto.c index 5203e0a94c..e3dba23c8b 100644 --- a/db2/db/db_auto.c +++ b/db2/db/db_auto.c @@ -10,7 +10,6 @@ #endif #include "db_int.h" -#include "shqueue.h" #include "db_page.h" #include "db_dispatch.h" #include "db_am.h" @@ -46,8 +45,7 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_addrem; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -60,8 +58,8 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags, + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size) + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size) + sizeof(*pagelsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -112,7 +110,7 @@ int __db_addrem_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -174,7 +172,7 @@ __db_addrem_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tpagelsn: [%lu][%lu]\n", (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -188,11 +186,12 @@ __db_addrem_read(recbuf, argpp) { __db_addrem_args *argp; u_int8_t *bp; + int ret; - argp = (__db_addrem_args *)__db_malloc(sizeof(__db_addrem_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_addrem_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -253,8 +252,7 @@ int __db_split_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_split; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -264,8 +262,8 @@ int __db_split_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(u_int32_t) + (pageimage == NULL ? 
0 : pageimage->size) + sizeof(*pagelsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -302,7 +300,7 @@ int __db_split_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -353,7 +351,7 @@ __db_split_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tpagelsn: [%lu][%lu]\n", (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -367,11 +365,12 @@ __db_split_read(recbuf, argpp) { __db_split_args *argp; u_int8_t *bp; + int ret; - argp = (__db_split_args *)__db_malloc(sizeof(__db_split_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_split_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -430,8 +429,7 @@ int __db_big_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_big; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -445,8 +443,8 @@ int __db_big_log(logp, txnid, ret_lsnp, flags, + sizeof(*pagelsn) + sizeof(*prevlsn) + sizeof(*nextlsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -497,7 +495,7 @@ int __db_big_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -554,7 +552,7 @@ __db_big_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tnextlsn: [%lu][%lu]\n", (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -568,11 +566,12 @@ __db_big_read(recbuf, argpp) { __db_big_args *argp; u_int8_t *bp; + int ret; - argp = (__db_big_args *)__db_malloc(sizeof(__db_big_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_big_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -630,8 +629,7 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_ovref; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -640,8 +638,8 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(adjust) + sizeof(*lsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -668,7 +666,7 @@ int __db_ovref_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -710,7 +708,7 @@ __db_ovref_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tlsn: [%lu][%lu]\n", (u_long)argp->lsn.file, (u_long)argp->lsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -724,11 +722,12 @@ __db_ovref_read(recbuf, argpp) { __db_ovref_args *argp; u_int8_t *bp; + int ret; - argp = (__db_ovref_args *)__db_malloc(sizeof(__db_ovref_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_ovref_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -752,16 +751,17 @@ __db_ovref_read(recbuf, argpp) /* * PUBLIC: int __db_relink_log * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, - * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, - * PUBLIC: DB_LSN *, db_pgno_t, DB_LSN *)); + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, DB_LSN *, + * PUBLIC: db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *)); */ int __db_relink_log(logp, txnid, ret_lsnp, flags, - fileid, pgno, lsn, prev, lsn_prev, next, - lsn_next) + opcode, fileid, pgno, lsn, prev, lsn_prev, + next, lsn_next) DB_LOG *logp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; + u_int32_t 
opcode; u_int32_t fileid; db_pgno_t pgno; DB_LSN * lsn; @@ -779,12 +779,12 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_relink; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + sizeof(fileid) + sizeof(pgno) + sizeof(*lsn) @@ -792,8 +792,8 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags, + sizeof(*lsn_prev) + sizeof(next) + sizeof(*lsn_next); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -802,6 +802,8 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags, bp += sizeof(txn_num); memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); memcpy(bp, &fileid, sizeof(fileid)); bp += sizeof(fileid); memcpy(bp, &pgno, sizeof(pgno)); @@ -832,7 +834,7 @@ int __db_relink_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -868,6 +870,7 @@ __db_relink_print(notused1, dbtp, lsnp, notused2, notused3) (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); printf("\tfileid: %lu\n", (u_long)argp->fileid); printf("\tpgno: %lu\n", (u_long)argp->pgno); printf("\tlsn: [%lu][%lu]\n", @@ -879,7 +882,7 @@ __db_relink_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tlsn_next: [%lu][%lu]\n", (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -893,11 +896,12 @@ __db_relink_read(recbuf, 
argpp) { __db_relink_args *argp; u_int8_t *bp; + int ret; - argp = (__db_relink_args *)__db_malloc(sizeof(__db_relink_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_relink_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -906,6 +910,8 @@ __db_relink_read(recbuf, argpp) bp += sizeof(argp->txnid->txnid); memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); memcpy(&argp->fileid, bp, sizeof(argp->fileid)); bp += sizeof(argp->fileid); memcpy(&argp->pgno, bp, sizeof(argp->pgno)); @@ -951,8 +957,7 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_addpage; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -962,8 +967,8 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags, + sizeof(*lsn) + sizeof(nextpgno) + sizeof(*nextlsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -995,7 +1000,7 @@ int __db_addpage_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1039,7 +1044,7 @@ __db_addpage_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tnextlsn: [%lu][%lu]\n", (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1053,11 +1058,12 @@ __db_addpage_read(recbuf, argpp) { __db_addpage_args *argp; u_int8_t *bp; + int ret; - argp = (__db_addpage_args 
*)__db_malloc(sizeof(__db_addpage_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_addpage_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1108,8 +1114,7 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags, rectype = DB_db_debug; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1119,8 +1124,8 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags, + sizeof(u_int32_t) + (key == NULL ? 0 : key->size) + sizeof(u_int32_t) + (data == NULL ? 0 : data->size) + sizeof(arg_flags); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1170,7 +1175,7 @@ int __db_debug_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1236,7 +1241,7 @@ __db_debug_print(notused1, dbtp, lsnp, notused2, notused3) printf("\n"); printf("\targ_flags: %lu\n", (u_long)argp->arg_flags); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1250,11 +1255,12 @@ __db_debug_read(recbuf, argpp) { __db_debug_args *argp; u_int8_t *bp; + int ret; - argp = (__db_debug_args *)__db_malloc(sizeof(__db_debug_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__db_debug_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1284,143 +1290,6 @@ __db_debug_read(recbuf, argpp) } /* - * PUBLIC: int __db_noop_log - * PUBLIC: 
__P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, - * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *)); - */ -int __db_noop_log(logp, txnid, ret_lsnp, flags, - fileid, pgno, prevlsn) - DB_LOG *logp; - DB_TXN *txnid; - DB_LSN *ret_lsnp; - u_int32_t flags; - u_int32_t fileid; - db_pgno_t pgno; - DB_LSN * prevlsn; -{ - DBT logrec; - DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; - u_int8_t *bp; - - rectype = DB_db_noop; - txn_num = txnid == NULL ? 0 : txnid->txnid; - if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; - lsnp = &null_lsn; - } else - lsnp = &txnid->last_lsn; - logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) - + sizeof(*prevlsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); - - bp = logrec.data; - memcpy(bp, &rectype, sizeof(rectype)); - bp += sizeof(rectype); - memcpy(bp, &txn_num, sizeof(txn_num)); - bp += sizeof(txn_num); - memcpy(bp, lsnp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += sizeof(pgno); - if (prevlsn != NULL) - memcpy(bp, prevlsn, sizeof(*prevlsn)); - else - memset(bp, 0, sizeof(*prevlsn)); - bp += sizeof(*prevlsn); -#ifdef DIAGNOSTIC - if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) - fprintf(stderr, "Error in log record length"); -#endif - ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); - return (ret); -} - -/* - * PUBLIC: int __db_noop_print - * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); - */ -int -__db_noop_print(notused1, dbtp, lsnp, notused2, notused3) - DB_LOG *notused1; - DBT *dbtp; - DB_LSN *lsnp; - int notused2; - void *notused3; -{ - __db_noop_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused1 = NULL; - notused2 = 0; - notused3 = NULL; - - if ((ret = __db_noop_read(dbtp->data, 
&argp)) != 0) - return (ret); - printf("[%lu][%lu]db_noop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %lu\n", (u_long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tprevlsn: [%lu][%lu]\n", - (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset); - printf("\n"); - __db_free(argp); - return (0); -} - -/* - * PUBLIC: int __db_noop_read __P((void *, __db_noop_args **)); - */ -int -__db_noop_read(recbuf, argpp) - void *recbuf; - __db_noop_args **argpp; -{ - __db_noop_args *argp; - u_int8_t *bp; - - argp = (__db_noop_args *)__db_malloc(sizeof(__db_noop_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); - memcpy(&argp->prevlsn, bp, sizeof(argp->prevlsn)); - bp += sizeof(argp->prevlsn); - *argpp = argp; - return (0); -} - -/* * PUBLIC: int __db_init_print __P((DB_ENV *)); */ int @@ -1450,9 +1319,6 @@ __db_init_print(dbenv) if ((ret = __db_add_recovery(dbenv, __db_debug_print, DB_db_debug)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __db_noop_print, DB_db_noop)) != 0) - return (ret); return (0); } @@ -1486,9 +1352,6 @@ __db_init_recover(dbenv) if ((ret = __db_add_recovery(dbenv, __db_debug_recover, DB_db_debug)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __db_noop_recover, DB_db_noop)) != 0) - return (ret); return (0); } diff --git a/db2/db/db_dispatch.c 
b/db2/db/db_dispatch.c index 8645948614..616d08c3ff 100644 --- a/db2/db/db_dispatch.c +++ b/db2/db/db_dispatch.c @@ -43,13 +43,14 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_dispatch.c 10.14 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)db_dispatch.c 10.20 (Sleepycat) 10/10/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <shqueue.h> #include <stddef.h> #include <stdlib.h> #include <string.h> @@ -61,6 +62,7 @@ static const char sccsid[] = "@(#)db_dispatch.c 10.14 (Sleepycat) 5/3/98"; #include "db_am.h" #include "common_ext.h" #include "log_auto.h" +#include "txn.h" #include "txn_auto.h" /* @@ -148,27 +150,16 @@ __db_add_recovery(dbenv, func, ndx) u_int32_t ndx; { u_int32_t i; + int ret; - /* Check if function is already registered. */ - if (dispatch_table && ndx < dispatch_size && - dispatch_table[ndx] != 0 && dispatch_table[ndx] != func) - return (DB_REGISTERED); + COMPQUIET(dbenv, NULL); /* !!!: not currently used. */ /* Check if we have to grow the table. 
*/ if (ndx >= dispatch_size) { - if (dispatch_table == NULL) - dispatch_table = (int (**) - __P((DB_LOG *, DBT *, DB_LSN *, int, void *))) - __db_malloc(DB_user_BEGIN * sizeof(dispatch_table[0])); - else - dispatch_table = (int (**) - __P((DB_LOG *, DBT *, DB_LSN *, int, void *))) - __db_realloc(dispatch_table, (DB_user_BEGIN + - dispatch_size) * sizeof(dispatch_table[0])); - if (dispatch_table == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - return (ENOMEM); - } + if ((ret = __os_realloc(&dispatch_table, + (DB_user_BEGIN + dispatch_size) * + sizeof(dispatch_table[0]))) != 0) + return (ret); for (i = dispatch_size, dispatch_size += DB_user_BEGIN; i < dispatch_size; ++i) dispatch_table[i] = NULL; @@ -189,9 +180,10 @@ __db_txnlist_init(retp) void *retp; { DB_TXNHEAD *headp; + int ret; - if ((headp = (DB_TXNHEAD *)__db_malloc(sizeof(DB_TXNHEAD))) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(sizeof(DB_TXNHEAD), NULL, &headp)) != 0) + return (ret); LIST_INIT(&headp->head); headp->maxid = 0; @@ -214,9 +206,10 @@ __db_txnlist_add(listp, txnid) { DB_TXNHEAD *hp; DB_TXNLIST *elp; + int ret; - if ((elp = (DB_TXNLIST *)__db_malloc(sizeof(DB_TXNLIST))) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(sizeof(DB_TXNLIST), NULL, &elp)) != 0) + return (ret); elp->txnid = txnid; hp = (DB_TXNHEAD *)listp; @@ -269,9 +262,9 @@ __db_txnlist_end(listp) hp = (DB_TXNHEAD *)listp; while ((p = LIST_FIRST(&hp->head)) != LIST_END(&hp->head)) { LIST_REMOVE(p, links); - __db_free(p); + __os_free(p, 0); } - __db_free(listp); + __os_free(listp, sizeof(DB_TXNHEAD)); } /* diff --git a/db2/db/db_dup.c b/db2/db/db_dup.c index 6379fc1729..2673bbcd61 100644 --- a/db2/db/db_dup.c +++ b/db2/db/db_dup.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_dup.c 10.18 (Sleepycat) 5/31/98"; +static const char sccsid[] = "@(#)db_dup.c 10.35 (Sleepycat) 12/2/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -23,25 +23,25 @@ static const char sccsid[] 
= "@(#)db_dup.c 10.18 (Sleepycat) 5/31/98"; #include "btree.h" #include "db_am.h" -static int __db_addpage __P((DB *, - PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **))); -static int __db_dsplit __P((DB *, - PAGE **, db_indx_t *, u_int32_t, int (*)(DB *, u_int32_t, PAGE **))); +static int __db_addpage __P((DBC *, + PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **))); +static int __db_dsplit __P((DBC *, + PAGE **, db_indx_t *, u_int32_t, int (*)(DBC *, u_int32_t, PAGE **))); /* * __db_dput -- * Put a duplicate item onto a duplicate page at the given index. * - * PUBLIC: int __db_dput __P((DB *, - * PUBLIC: DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **))); + * PUBLIC: int __db_dput __P((DBC *, DBT *, + * PUBLIC: PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **))); */ int -__db_dput(dbp, dbt, pp, indxp, newfunc) - DB *dbp; +__db_dput(dbc, dbt, pp, indxp, newfunc) + DBC *dbc; DBT *dbt; PAGE **pp; db_indx_t *indxp; - int (*newfunc) __P((DB *, u_int32_t, PAGE **)); + int (*newfunc) __P((DBC *, u_int32_t, PAGE **)); { BOVERFLOW bo; DBT *data_dbtp, hdr_dbt, *hdr_dbtp; @@ -54,10 +54,12 @@ __db_dput(dbp, dbt, pp, indxp, newfunc) * We need some access method independent threshold for when we put * a duplicate item onto an overflow page. 
*/ - if (dbt->size > 0.25 * dbp->pgsize) { - if ((ret = __db_poff(dbp, dbt, &pgno, newfunc)) != 0) + if (dbt->size > 0.25 * dbc->dbp->pgsize) { + if ((ret = __db_poff(dbc, dbt, &pgno, newfunc)) != 0) return (ret); + UMRW(bo.unused1); B_TSET(bo.type, B_OVERFLOW, 0); + UMRW(bo.unused2); bo.tlen = dbt->size; bo.pgno = pgno; hdr_dbt.data = &bo; @@ -75,11 +77,14 @@ __db_dput(dbp, dbt, pp, indxp, newfunc) pagep = *pp; if (size > P_FREESPACE(pagep)) { if (*indxp == NUM_ENT(*pp) && NEXT_PGNO(*pp) == PGNO_INVALID) - ret = __db_addpage(dbp, pp, indxp, newfunc); + ret = __db_addpage(dbc, pp, indxp, newfunc); else - ret = __db_dsplit(dbp, pp, indxp, isize, newfunc); + ret = __db_dsplit(dbc, pp, indxp, isize, newfunc); if (ret != 0) - /* XXX: Pages not returned to free list. */ + /* + * XXX + * Pages not returned to free list. + */ return (ret); pagep = *pp; } @@ -88,11 +93,11 @@ __db_dput(dbp, dbt, pp, indxp, newfunc) * Now, pagep references the page on which to insert and indx is the * the location to insert. */ - if ((ret = __db_pitem(dbp, + if ((ret = __db_pitem(dbc, pagep, (u_int32_t)*indxp, isize, hdr_dbtp, data_dbtp)) != 0) return (ret); - (void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY); + (void)memp_fset(dbc->dbp->mpf, pagep, DB_MPOOL_DIRTY); return (0); } @@ -100,15 +105,15 @@ __db_dput(dbp, dbt, pp, indxp, newfunc) * __db_drem -- * Remove a duplicate at the given index on the given page. * - * PUBLIC: int __db_drem __P((DB *, - * PUBLIC: PAGE **, u_int32_t, int (*)(DB *, PAGE *))); + * PUBLIC: int __db_drem __P((DBC *, + * PUBLIC: PAGE **, u_int32_t, int (*)(DBC *, PAGE *))); */ int -__db_drem(dbp, pp, indx, freefunc) - DB *dbp; +__db_drem(dbc, pp, indx, freefunc) + DBC *dbc; PAGE **pp; u_int32_t indx; - int (*freefunc) __P((DB *, PAGE *)); + int (*freefunc) __P((DBC *, PAGE *)); { PAGE *pagep; int ret; @@ -117,12 +122,12 @@ __db_drem(dbp, pp, indx, freefunc) /* Check if we are freeing a big item. 
*/ if (B_TYPE(GET_BKEYDATA(pagep, indx)->type) == B_OVERFLOW) { - if ((ret = __db_doff(dbp, + if ((ret = __db_doff(dbc, GET_BOVERFLOW(pagep, indx)->pgno, freefunc)) != 0) return (ret); - ret = __db_ditem(dbp, pagep, indx, BOVERFLOW_SIZE); + ret = __db_ditem(dbc, pagep, indx, BOVERFLOW_SIZE); } else - ret = __db_ditem(dbp, pagep, indx, + ret = __db_ditem(dbc, pagep, indx, BKEYDATA_SIZE(GET_BKEYDATA(pagep, indx)->len)); if (ret != 0) return (ret); @@ -137,12 +142,12 @@ __db_drem(dbp, pp, indx, freefunc) * !!! * __db_relink will set the dirty bit for us. */ - if ((ret = __db_relink(dbp, pagep, pp, 0)) != 0) + if ((ret = __db_relink(dbc, DB_REM_PAGE, pagep, pp, 0)) != 0) return (ret); - if ((ret = freefunc(dbp, pagep)) != 0) + if ((ret = freefunc(dbc, pagep)) != 0) return (ret); } else - (void)memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY); + (void)memp_fset(dbc->dbp->mpf, pagep, DB_MPOOL_DIRTY); return (0); } @@ -151,32 +156,41 @@ __db_drem(dbp, pp, indx, freefunc) * __db_dend -- * Find the last page in a set of offpage duplicates. * - * PUBLIC: int __db_dend __P((DB *, db_pgno_t, PAGE **)); + * PUBLIC: int __db_dend __P((DBC *, db_pgno_t, PAGE **)); */ int -__db_dend(dbp, pgno, pagep) - DB *dbp; +__db_dend(dbc, pgno, pp) + DBC *dbc; db_pgno_t pgno; - PAGE **pagep; + PAGE **pp; { + DB *dbp; PAGE *h; int ret; + dbp = dbc->dbp; + /* * This implements DB_KEYLAST. The last page is returned in pp; pgno * should be the page number of the first page of the duplicate chain. + * + * *pp may be non-NULL -- if given a valid page use it. 
*/ + if (*pp != NULL) + goto started; for (;;) { - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) { (void)__db_pgerr(dbp, pgno); return (ret); } +started: h = *pp; + if ((pgno = NEXT_PGNO(h)) == PGNO_INVALID) break; - (void)memp_fput(dbp->mpf, h, 0); - } - *pagep = h; + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + } return (0); } @@ -191,41 +205,44 @@ __db_dend(dbp, pgno, pagep) * the page on which the insert should happen, not yet put. */ static int -__db_dsplit(dbp, hp, indxp, size, newfunc) - DB *dbp; +__db_dsplit(dbc, hp, indxp, size, newfunc) + DBC *dbc; PAGE **hp; db_indx_t *indxp; u_int32_t size; - int (*newfunc) __P((DB *, u_int32_t, PAGE **)); + int (*newfunc) __P((DBC *, u_int32_t, PAGE **)); { PAGE *h, *np, *tp; BKEYDATA *bk; DBT page_dbt; + DB *dbp; + size_t pgsize; db_indx_t halfbytes, i, indx, lastsum, nindex, oindex, s, sum; - int did_indx, ret; + int did_indx, ret, t_ret; h = *hp; indx = *indxp; + ret = 0; + dbp = dbc->dbp; + pgsize = dbp->pgsize; /* Create a temporary page to do compaction onto. */ - if ((tp = (PAGE *)__db_malloc(dbp->pgsize)) == NULL) - return (ENOMEM); -#ifdef DIAGNOSTIC - memset(tp, 0xff, dbp->pgsize); -#endif + if ((ret = __os_malloc(pgsize, NULL, &tp)) != 0) + return (ret); + /* Create new page for the split. 
*/ - if ((ret = newfunc(dbp, P_DUPLICATE, &np)) != 0) { - FREE(tp, dbp->pgsize); + if ((ret = newfunc(dbc, P_DUPLICATE, &np)) != 0) { + __os_free(tp, pgsize); return (ret); } - P_INIT(np, dbp->pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0, + P_INIT(np, pgsize, PGNO(np), PGNO(h), NEXT_PGNO(h), 0, P_DUPLICATE); - P_INIT(tp, dbp->pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0, + P_INIT(tp, pgsize, PGNO(h), PREV_PGNO(h), PGNO(np), 0, P_DUPLICATE); /* Figure out the split point */ - halfbytes = (dbp->pgsize - HOFFSET(h)) / 2; + halfbytes = (pgsize - HOFFSET(h)) / 2; did_indx = 0; for (sum = 0, lastsum = 0, i = 0; i < NUM_ENT(h); i++) { if (i == indx) { @@ -237,7 +254,6 @@ __db_dsplit(dbp, hp, indxp, size, newfunc) (db_indx_t)(sum - halfbytes)) { *hp = np; *indxp = 0; - i--; } else *indxp = i; break; @@ -252,29 +268,28 @@ __db_dsplit(dbp, hp, indxp, size, newfunc) if (lastsum < halfbytes && sum >= halfbytes) { /* We've crossed the halfway point. */ - if ((db_indx_t)(halfbytes - lastsum) < - (db_indx_t)(sum - halfbytes)) - i--; + if ((db_indx_t)(sum - halfbytes) < + (db_indx_t)(halfbytes - lastsum)) + i++; break; } } - /* * Check if we have set the return values of the index pointer and * page pointer. */ if (!did_indx) { *hp = np; - *indxp = indx - i - 1; + *indxp = indx - i; } - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { page_dbt.size = dbp->pgsize; page_dbt.data = h; if ((ret = __db_split_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid, + dbc->txn, &LSN(h), 0, DB_SPLITOLD, dbp->log_fileid, PGNO(h), &page_dbt, &LSN(h))) != 0) { - FREE(tp, dbp->pgsize); + __os_free(tp, pgsize); return (ret); } LSN(tp) = LSN(h); @@ -283,12 +298,12 @@ __db_dsplit(dbp, hp, indxp, size, newfunc) /* * If it's a btree, adjust the cursors. * - * i is the index of the last element to stay on the page. + * i is the index of the first element to move onto the new page. 
*/ - if (dbp->type == DB_BTREE || dbp->type == DB_RECNO) - __bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i + 1, 0); + if (dbp->type == DB_BTREE) + __bam_ca_split(dbp, PGNO(h), PGNO(h), PGNO(np), i, 0); - for (nindex = 0, oindex = i + 1; oindex < NUM_ENT(h); oindex++) { + for (nindex = 0, oindex = i; oindex < NUM_ENT(h); oindex++) { bk = GET_BKEYDATA(h, oindex); if (B_TYPE(bk->type) == B_KEYDATA) s = BKEYDATA_SIZE(bk->len); @@ -304,7 +319,7 @@ __db_dsplit(dbp, hp, indxp, size, newfunc) * Now do data compaction by copying the remaining stuff onto the * temporary page and then copying it back to the real page. */ - for (nindex = 0, oindex = 0; oindex <= i; oindex++) { + for (nindex = 0, oindex = 0; oindex < i; oindex++) { bk = GET_BKEYDATA(h, oindex); if (B_TYPE(bk->type) == B_KEYDATA) s = BKEYDATA_SIZE(bk->len); @@ -324,59 +339,73 @@ __db_dsplit(dbp, hp, indxp, size, newfunc) */ memcpy(h, tp, LOFFSET(tp)); memcpy((u_int8_t *)h + HOFFSET(tp), - (u_int8_t *)tp + HOFFSET(tp), dbp->pgsize - HOFFSET(tp)); - FREE(tp, dbp->pgsize); + (u_int8_t *)tp + HOFFSET(tp), pgsize - HOFFSET(tp)); + __os_free(tp, pgsize); - if (DB_LOGGING(dbp)) { - page_dbt.size = dbp->pgsize; + if (DB_LOGGING(dbc)) { + /* + * XXX + * If either of these fails, are we leaving pages pinned? + * Yes, but it seems like this happens in error case. + */ + page_dbt.size = pgsize; page_dbt.data = h; if ((ret = __db_split_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid, + dbc->txn, &LSN(h), 0, DB_SPLITNEW, dbp->log_fileid, PGNO(h), &page_dbt, &LSN(h))) != 0) return (ret); - page_dbt.size = dbp->pgsize; + page_dbt.size = pgsize; page_dbt.data = np; if ((ret = __db_split_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid, + dbc->txn, &LSN(np), 0, DB_SPLITNEW, dbp->log_fileid, PGNO(np), &page_dbt, &LSN(np))) != 0) return (ret); } /* + * Finally, if there was a next page after the page being + * split, fix its prev pointer. 
+ */ + if (np->next_pgno != PGNO_INVALID) + ret = __db_relink(dbc, DB_ADD_PAGE, np, NULL, 1); + + /* * Figure out if the location we're interested in is on the new * page, and if so, reset the callers' pointer. Push the other * page back to the store. */ if (*hp == h) - ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY); + t_ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY); else - ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + t_ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); - return (ret); + return (ret != 0 ? ret : t_ret); } /* * __db_ditem -- * Remove an item from a page. * - * PUBLIC: int __db_ditem __P((DB *, PAGE *, u_int32_t, u_int32_t)); + * PUBLIC: int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t)); */ int -__db_ditem(dbp, pagep, indx, nbytes) - DB *dbp; +__db_ditem(dbc, pagep, indx, nbytes) + DBC *dbc; PAGE *pagep; u_int32_t indx, nbytes; { + DB *dbp; DBT ldbt; db_indx_t cnt, offset; int ret; u_int8_t *from; - if (DB_LOGGING(dbp)) { + dbp = dbc->dbp; + if (DB_LOGGING(dbc)) { ldbt.data = P_ENTRY(pagep, indx); ldbt.size = nbytes; - if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbc->txn, &LSN(pagep), 0, DB_REM_DUP, dbp->log_fileid, PGNO(pagep), (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0) return (ret); @@ -413,7 +442,7 @@ __db_ditem(dbp, pagep, indx, nbytes) sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); /* If it's a btree, adjust the cursors. */ - if (dbp->type == DB_BTREE || dbp->type == DB_RECNO) + if (dbp->type == DB_BTREE) __bam_ca_di(dbp, PGNO(pagep), indx, -1); return (0); @@ -424,16 +453,17 @@ __db_ditem(dbp, pagep, indx, nbytes) * Put an item on a page. 
* * PUBLIC: int __db_pitem - * PUBLIC: __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); + * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); */ int -__db_pitem(dbp, pagep, indx, nbytes, hdr, data) - DB *dbp; +__db_pitem(dbc, pagep, indx, nbytes, hdr, data) + DBC *dbc; PAGE *pagep; u_int32_t indx; u_int32_t nbytes; DBT *hdr, *data; { + DB *dbp; BKEYDATA bk; DBT thdr; int ret; @@ -456,8 +486,9 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data) * the passed in header sizes must be adjusted for the structure's * placeholder for the trailing variable-length data field. */ - if (DB_LOGGING(dbp)) - if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbp->txn, + dbp = dbc->dbp; + if (DB_LOGGING(dbc)) + if ((ret = __db_addrem_log(dbp->dbenv->lg_info, dbc->txn, &LSN(pagep), 0, DB_ADD_DUP, dbp->log_fileid, PGNO(pagep), (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0) return (ret); @@ -485,7 +516,7 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data) memcpy(p + hdr->size, data->data, data->size); /* If it's a btree, adjust the cursors. */ - if (dbp->type == DB_BTREE || dbp->type == DB_RECNO) + if (dbp->type == DB_BTREE) __bam_ca_di(dbp, PGNO(pagep), indx, 1); return (0); @@ -495,14 +526,16 @@ __db_pitem(dbp, pagep, indx, nbytes, hdr, data) * __db_relink -- * Relink around a deleted page. * - * PUBLIC: int __db_relink __P((DB *, PAGE *, PAGE **, int)); + * PUBLIC: int __db_relink __P((DBC *, u_int32_t, PAGE *, PAGE **, int)); */ int -__db_relink(dbp, pagep, new_next, needlock) - DB *dbp; +__db_relink(dbc, add_rem, pagep, new_next, needlock) + DBC *dbc; + u_int32_t add_rem; PAGE *pagep, **new_next; int needlock; { + DB *dbp; PAGE *np, *pp; DB_LOCK npl, ppl; DB_LSN *nlsnp, *plsnp; @@ -512,10 +545,15 @@ __db_relink(dbp, pagep, new_next, needlock) np = pp = NULL; npl = ppl = LOCK_INVALID; nlsnp = plsnp = NULL; + dbp = dbc->dbp; - /* Retrieve and lock the two pages. */ + /* + * Retrieve and lock the one/two pages. 
For a remove, we may need + * two pages (the before and after). For an add, we only need one + * because, the split took care of the prev. + */ if (pagep->next_pgno != PGNO_INVALID) { - if (needlock && (ret = __bam_lget(dbp, + if (needlock && (ret = __bam_lget(dbc, 0, pagep->next_pgno, DB_LOCK_WRITE, &npl)) != 0) goto err; if ((ret = memp_fget(dbp->mpf, @@ -525,8 +563,8 @@ __db_relink(dbp, pagep, new_next, needlock) } nlsnp = &np->lsn; } - if (pagep->prev_pgno != PGNO_INVALID) { - if (needlock && (ret = __bam_lget(dbp, + if (add_rem == DB_REM_PAGE && pagep->prev_pgno != PGNO_INVALID) { + if (needlock && (ret = __bam_lget(dbc, 0, pagep->prev_pgno, DB_LOCK_WRITE, &ppl)) != 0) goto err; if ((ret = memp_fget(dbp->mpf, @@ -538,9 +576,10 @@ __db_relink(dbp, pagep, new_next, needlock) } /* Log the change. */ - if (DB_LOGGING(dbp)) { - if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbp->txn, - &pagep->lsn, 0, dbp->log_fileid, pagep->pgno, &pagep->lsn, + if (DB_LOGGING(dbc)) { + if ((ret = __db_relink_log(dbp->dbenv->lg_info, dbc->txn, + &pagep->lsn, 0, add_rem, dbp->log_fileid, + pagep->pgno, &pagep->lsn, pagep->prev_pgno, plsnp, pagep->next_pgno, nlsnp)) != 0) goto err; if (np != NULL) @@ -558,7 +597,10 @@ __db_relink(dbp, pagep, new_next, needlock) * set to NULL. 
*/ if (np != NULL) { - np->prev_pgno = pagep->prev_pgno; + if (add_rem == DB_ADD_PAGE) + np->prev_pgno = pagep->pgno; + else + np->prev_pgno = pagep->prev_pgno; if (new_next == NULL) ret = memp_fput(dbp->mpf, np, DB_MPOOL_DIRTY); else { @@ -568,7 +610,7 @@ __db_relink(dbp, pagep, new_next, needlock) if (ret != 0) goto err; if (needlock) - (void)__bam_lput(dbp, npl); + (void)__bam_lput(dbc, npl); } else if (new_next != NULL) *new_next = NULL; @@ -577,18 +619,18 @@ __db_relink(dbp, pagep, new_next, needlock) if ((ret = memp_fput(dbp->mpf, pp, DB_MPOOL_DIRTY)) != 0) goto err; if (needlock) - (void)__bam_lput(dbp, ppl); + (void)__bam_lput(dbc, ppl); } return (0); err: if (np != NULL) (void)memp_fput(dbp->mpf, np, 0); if (needlock && npl != LOCK_INVALID) - (void)__bam_lput(dbp, npl); + (void)__bam_lput(dbc, npl); if (pp != NULL) (void)memp_fput(dbp->mpf, pp, 0); if (needlock && ppl != LOCK_INVALID) - (void)__bam_lput(dbp, ppl); + (void)__bam_lput(dbc, ppl); return (ret); } @@ -596,34 +638,37 @@ err: if (np != NULL) * __db_ddup -- * Delete an offpage chain of duplicates. 
* - * PUBLIC: int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); + * PUBLIC: int __db_ddup __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *))); */ int -__db_ddup(dbp, pgno, freefunc) - DB *dbp; +__db_ddup(dbc, pgno, freefunc) + DBC *dbc; db_pgno_t pgno; - int (*freefunc) __P((DB *, PAGE *)); + int (*freefunc) __P((DBC *, PAGE *)); { + DB *dbp; PAGE *pagep; DBT tmp_dbt; int ret; + dbp = dbc->dbp; do { if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) { (void)__db_pgerr(dbp, pgno); return (ret); } - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { tmp_dbt.data = pagep; tmp_dbt.size = dbp->pgsize; - if ((ret = __db_split_log(dbp->dbenv->lg_info, dbp->txn, - &LSN(pagep), 0, DB_SPLITOLD, dbp->log_fileid, - PGNO(pagep), &tmp_dbt, &LSN(pagep))) != 0) + if ((ret = __db_split_log(dbp->dbenv->lg_info, + dbc->txn, &LSN(pagep), 0, DB_SPLITOLD, + dbp->log_fileid, PGNO(pagep), &tmp_dbt, + &LSN(pagep))) != 0) return (ret); } pgno = pagep->next_pgno; - if ((ret = freefunc(dbp, pagep)) != 0) + if ((ret = freefunc(dbc, pagep)) != 0) return (ret); } while (pgno != PGNO_INVALID); @@ -636,21 +681,23 @@ __db_ddup(dbp, pgno, freefunc) * current page. 
*/ static int -__db_addpage(dbp, hp, indxp, newfunc) - DB *dbp; +__db_addpage(dbc, hp, indxp, newfunc) + DBC *dbc; PAGE **hp; db_indx_t *indxp; - int (*newfunc) __P((DB *, u_int32_t, PAGE **)); + int (*newfunc) __P((DBC *, u_int32_t, PAGE **)); { + DB *dbp; PAGE *newpage; int ret; - if ((ret = newfunc(dbp, P_DUPLICATE, &newpage)) != 0) + dbp = dbc->dbp; + if ((ret = newfunc(dbc, P_DUPLICATE, &newpage)) != 0) return (ret); - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { if ((ret = __db_addpage_log(dbp->dbenv->lg_info, - dbp->txn, &LSN(*hp), 0, dbp->log_fileid, + dbc->txn, &LSN(*hp), 0, dbp->log_fileid, PGNO(*hp), &LSN(*hp), PGNO(newpage), &LSN(newpage))) != 0) { return (ret); } @@ -666,3 +713,235 @@ __db_addpage(dbp, hp, indxp, newfunc) *indxp = 0; return (0); } + +/* + * __db_dsearch -- + * Search a set of duplicates for the proper position for a new duplicate. + * + * + pgno is the page number of the page on which to begin searching. + * Since we can continue duplicate searches, it might not be the first + * page. + * + * + If we are continuing a search, then *pp may be non-NULL in which + * case we do not have to retrieve the page. + * + * + If we are continuing a search, then *indxp contains the first + * on pgno of where we should begin the search. + * + * NOTE: if there is no comparison function, then continuing is + * meaningless, and *pp should always be NULL and *indxp will be + * ignored. + * + * 3 return values:: + * + * + pp is the returned page pointer of where this element should go. + * + indxp is the returned index on that page + * + cmpp is the returned final comparison result. 
+ * + * PUBLIC: int __db_dsearch __P((DBC *, + * PUBLIC: int, DBT *, db_pgno_t, db_indx_t *, PAGE **, int *)); + */ +int +__db_dsearch(dbc, is_insert, dbt, pgno, indxp, pp, cmpp) + DBC *dbc; + int is_insert, *cmpp; + DBT *dbt; + db_pgno_t pgno; + db_indx_t *indxp; + PAGE **pp; +{ + DB *dbp; + PAGE *h; + db_indx_t base, indx, lim, save_indx; + db_pgno_t save_pgno; + int ret; + + dbp = dbc->dbp; + + if (dbp->dup_compare == NULL) { + /* + * We may have been given a valid page, but we may not be + * able to use it. The problem is that the application is + * doing a join and we're trying to continue the search, + * but since the items aren't sorted, we can't. Discard + * the page if it's not the one we're going to start with + * anyway. + */ + if (*pp != NULL && (*pp)->pgno != pgno) { + if ((ret = memp_fput(dbp->mpf, *pp, 0)) != 0) + return (ret); + *pp = NULL; + } + + /* + * If no duplicate function is specified, just go to the end + * of the duplicate set. + */ + if (is_insert) { + if ((ret = __db_dend(dbc, pgno, pp)) != 0) + return (ret); + *indxp = NUM_ENT(*pp); + return (0); + } + + /* + * We are looking for a specific duplicate, so do a linear + * search. + */ + if (*pp != NULL) + goto nocmp_started; + for (;;) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) + goto pg_err; +nocmp_started: h = *pp; + + for (*indxp = 0; *indxp < NUM_ENT(h); ++*indxp) { + if ((*cmpp = __bam_cmp(dbp, + dbt, h, *indxp, __bam_defcmp)) != 0) + continue; + /* + * The duplicate may have already been deleted, + * if it's a btree page, in which case we skip + * it. + */ + if (dbp->type == DB_BTREE && + B_DISSET(GET_BKEYDATA(h, *indxp)->type)) + continue; + + return (0); + } + + if ((pgno = h->next_pgno) == PGNO_INVALID) + break; + + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + } + *cmpp = 1; /* We didn't succeed... */ + return (0); + } + + /* + * We have a comparison routine, i.e., the duplicates are sorted. 
+ * Walk through the chain of duplicates, checking the last entry + * on each page to decide if it's the page we want to search. + * + * *pp may be non-NULL -- if we were given a valid page (e.g., are + * in mid-search), then use the provided page. + */ + if (*pp != NULL) + goto cmp_started; + for (;;) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) + goto pg_err; +cmp_started: h = *pp; + + if ((pgno = h->next_pgno) == PGNO_INVALID || __bam_cmp(dbp, + dbt, h, h->entries - 1, dbp->dup_compare) <= 0) + break; + /* + * Even when continuing a search, make sure we don't skip + * entries on a new page + */ + *indxp = 0; + + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + } + + /* Next, do a binary search on the page. */ + base = F_ISSET(dbc, DBC_CONTINUE) ? *indxp : 0; + for (lim = NUM_ENT(h) - base; lim != 0; lim >>= 1) { + indx = base + (lim >> 1); + if ((*cmpp = __bam_cmp(dbp, + dbt, h, indx, dbp->dup_compare)) == 0) { + *indxp = indx; + + if (dbp->type != DB_BTREE || + !B_DISSET(GET_BKEYDATA(h, *indxp)->type)) + return (0); + goto check_delete; + } + if (*cmpp > 0) { + base = indx + 1; + lim--; + } + } + + /* + * Base references the smallest index larger than the supplied DBT's + * data item, potentially both 0 and NUM_ENT. + */ + *indxp = base; + return (0); + +check_delete: + /* + * The duplicate may have already been deleted, if it's a btree page, + * in which case we wander around, hoping to find an entry that hasn't + * been deleted. First, wander in a forwardly direction. 
+ */ + save_pgno = (*pp)->pgno; + save_indx = *indxp; + for (++*indxp;;) { + for (; *indxp < NUM_ENT(h); ++*indxp) { + if ((*cmpp = __bam_cmp(dbp, + dbt, h, *indxp, dbp->dup_compare)) != 0) + goto check_delete_rev; + + if (!B_DISSET(GET_BKEYDATA(h, *indxp)->type)) + return (0); + } + if ((pgno = h->next_pgno) == PGNO_INVALID) + break; + + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + + if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) + goto pg_err; + h = *pp; + + *indxp = 0; + } + +check_delete_rev: + /* Go back to where we started, and wander in a backwardly direction. */ + if (h->pgno != save_pgno) { + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &save_pgno, 0, pp)) != 0) + goto pg_err; + h = *pp; + } + + for (;;) { + while (*indxp > 0) { + --*indxp; + if ((*cmpp = __bam_cmp(dbp, + dbt, h, *indxp, dbp->dup_compare)) != 0) + goto check_delete_fail; + + if (!B_DISSET(GET_BKEYDATA(h, *indxp)->type)) + return (0); + } + if ((pgno = h->prev_pgno) == PGNO_INVALID) + break; + + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + + if ((ret = memp_fget(dbp->mpf, &pgno, 0, pp)) != 0) + goto pg_err; + h = *pp; + + *indxp = NUM_ENT(h); + } + +check_delete_fail: + *cmpp = 1; /* We didn't succeed... */ + return (0); + +pg_err: __db_pgerr(dbp, pgno); + return (ret); +} diff --git a/db2/db/db_iface.c b/db2/db/db_iface.c new file mode 100644 index 0000000000..4ebf3ba019 --- /dev/null +++ b/db2/db/db_iface.c @@ -0,0 +1,488 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_iface.c 10.40 (Sleepycat) 12/19/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_auto.h" +#include "db_ext.h" +#include "common_ext.h" + +static int __db_keyempty __P((const DB_ENV *)); +static int __db_rdonly __P((const DB_ENV *, const char *)); +static int __dbt_ferr __P((const DB *, const char *, const DBT *, int)); + +/* + * __db_cdelchk -- + * Common cursor delete argument checking routine. + * + * PUBLIC: int __db_cdelchk __P((const DB *, u_int32_t, int, int)); + */ +int +__db_cdelchk(dbp, flags, isrdonly, isvalid) + const DB *dbp; + u_int32_t flags; + int isrdonly, isvalid; +{ + /* Check for changes to a read-only tree. */ + if (isrdonly) + return (__db_rdonly(dbp->dbenv, "c_del")); + + /* Check for invalid function flags. */ + switch (flags) { + case 0: + break; + default: + return (__db_ferr(dbp->dbenv, "DBcursor->c_del", 0)); + } + + /* + * The cursor must be initialized, return -1 for an invalid cursor, + * otherwise 0. + */ + return (isvalid ? 0 : EINVAL); +} + +/* + * __db_cgetchk -- + * Common cursor get argument checking routine. + * + * PUBLIC: int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int)); + */ +int +__db_cgetchk(dbp, key, data, flags, isvalid) + const DB *dbp; + DBT *key, *data; + u_int32_t flags; + int isvalid; +{ + int key_einval, key_flags, ret; + + key_einval = key_flags = 0; + + /* Check for invalid function flags. 
*/ + LF_CLR(DB_RMW); + switch (flags) { + case DB_NEXT_DUP: + if (dbp->type == DB_RECNO) + goto err; + /* FALLTHROUGH */ + case DB_CURRENT: + case DB_FIRST: + case DB_LAST: + case DB_NEXT: + case DB_PREV: + key_flags = 1; + break; + case DB_GET_BOTH: + case DB_SET_RANGE: + key_einval = key_flags = 1; + break; + case DB_SET: + key_einval = 1; + break; + case DB_GET_RECNO: + if (!F_ISSET(dbp, DB_BT_RECNUM)) + goto err; + break; + case DB_SET_RECNO: + if (!F_ISSET(dbp, DB_BT_RECNUM)) + goto err; + key_einval = key_flags = 1; + break; + default: +err: return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0)); + } + + /* Check for invalid key/data flags. */ + if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) + return (ret); + + /* Check for missing keys. */ + if (key_einval && (key->data == NULL || key->size == 0)) + return (__db_keyempty(dbp->dbenv)); + + /* + * The cursor must be initialized for DB_CURRENT, return -1 for an + * invalid cursor, otherwise 0. + */ + return (isvalid || flags != DB_CURRENT ? 0 : EINVAL); +} + +/* + * __db_cputchk -- + * Common cursor put argument checking routine. + * + * PUBLIC: int __db_cputchk __P((const DB *, + * PUBLIC: const DBT *, DBT *, u_int32_t, int, int)); + */ +int +__db_cputchk(dbp, key, data, flags, isrdonly, isvalid) + const DB *dbp; + const DBT *key; + DBT *data; + u_int32_t flags; + int isrdonly, isvalid; +{ + int key_einval, key_flags, ret; + + key_einval = key_flags = 0; + + /* Check for changes to a read-only tree. */ + if (isrdonly) + return (__db_rdonly(dbp->dbenv, "c_put")); + + /* Check for invalid function flags. 
*/ + switch (flags) { + case DB_AFTER: + case DB_BEFORE: + if (dbp->dup_compare != NULL) + goto err; + if (dbp->type == DB_RECNO && !F_ISSET(dbp, DB_RE_RENUMBER)) + goto err; + if (dbp->type != DB_RECNO && !F_ISSET(dbp, DB_AM_DUP)) + goto err; + break; + case DB_CURRENT: + /* + * If there is a comparison function, doing a DB_CURRENT + * must not change the part of the data item that is used + * for the comparison. + */ + break; + case DB_KEYFIRST: + case DB_KEYLAST: + if (dbp->type == DB_RECNO) + goto err; + key_einval = key_flags = 1; + break; + default: +err: return (__db_ferr(dbp->dbenv, "DBcursor->c_put", 0)); + } + + /* Check for invalid key/data flags. */ + if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) + return (ret); + + /* Check for missing keys. */ + if (key_einval && (key->data == NULL || key->size == 0)) + return (__db_keyempty(dbp->dbenv)); + + /* + * The cursor must be initialized for anything other than DB_KEYFIRST + * and DB_KEYLAST, return -1 for an invalid cursor, otherwise 0. + */ + return (isvalid || + flags == DB_KEYFIRST || flags == DB_KEYLAST ? 0 : EINVAL); +} + +/* + * __db_closechk -- + * DB->close flag check. + * + * PUBLIC: int __db_closechk __P((const DB *, u_int32_t)); + */ +int +__db_closechk(dbp, flags) + const DB *dbp; + u_int32_t flags; +{ + /* Check for invalid function flags. */ + if (flags != 0 && flags != DB_NOSYNC) + return (__db_ferr(dbp->dbenv, "DB->close", 0)); + + return (0); +} + +/* + * __db_delchk -- + * Common delete argument checking routine. + * + * PUBLIC: int __db_delchk __P((const DB *, DBT *, u_int32_t, int)); + */ +int +__db_delchk(dbp, key, flags, isrdonly) + const DB *dbp; + DBT *key; + u_int32_t flags; + int isrdonly; +{ + /* Check for changes to a read-only tree. */ + if (isrdonly) + return (__db_rdonly(dbp->dbenv, "delete")); + + /* Check for invalid function flags. 
*/ + switch (flags) { + case 0: + break; + default: + return (__db_ferr(dbp->dbenv, "DB->del", 0)); + } + + /* Check for missing keys. */ + if (key->data == NULL || key->size == 0) + return (__db_keyempty(dbp->dbenv)); + + return (0); +} + +/* + * __db_getchk -- + * Common get argument checking routine. + * + * PUBLIC: int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t)); + */ +int +__db_getchk(dbp, key, data, flags) + const DB *dbp; + const DBT *key; + DBT *data; + u_int32_t flags; +{ + int ret; + + /* Check for invalid function flags. */ + LF_CLR(DB_RMW); + switch (flags) { + case 0: + case DB_GET_BOTH: + break; + case DB_SET_RECNO: + if (!F_ISSET(dbp, DB_BT_RECNUM)) + goto err; + break; + default: +err: return (__db_ferr(dbp->dbenv, "DB->get", 0)); + } + + /* Check for invalid key/data flags. */ + if ((ret = __dbt_ferr(dbp, "key", key, flags == DB_SET_RECNO)) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0) + return (ret); + + /* Check for missing keys. */ + if (key->data == NULL || key->size == 0) + return (__db_keyempty(dbp->dbenv)); + + return (0); +} + +/* + * __db_joinchk -- + * Common join argument checking routine. + * + * PUBLIC: int __db_joinchk __P((const DB *, u_int32_t)); + */ +int +__db_joinchk(dbp, flags) + const DB *dbp; + u_int32_t flags; +{ + if (flags != 0) + return (__db_ferr(dbp->dbenv, "DB->join", 0)); + + return (0); +} + +/* + * __db_putchk -- + * Common put argument checking routine. + * + * PUBLIC: int __db_putchk + * PUBLIC: __P((const DB *, DBT *, const DBT *, u_int32_t, int, int)); + */ +int +__db_putchk(dbp, key, data, flags, isrdonly, isdup) + const DB *dbp; + DBT *key; + const DBT *data; + u_int32_t flags; + int isrdonly, isdup; +{ + int ret; + + /* Check for changes to a read-only tree. */ + if (isrdonly) + return (__db_rdonly(dbp->dbenv, "put")); + + /* Check for invalid function flags. 
*/ + switch (flags) { + case 0: + case DB_NOOVERWRITE: + break; + case DB_APPEND: + if (dbp->type != DB_RECNO) + goto err; + break; + default: +err: return (__db_ferr(dbp->dbenv, "DB->put", 0)); + } + + /* Check for invalid key/data flags. */ + if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) + return (ret); + + /* Check for missing keys. */ + if (key->data == NULL || key->size == 0) + return (__db_keyempty(dbp->dbenv)); + + /* Check for partial puts in the presence of duplicates. */ + if (isdup && F_ISSET(data, DB_DBT_PARTIAL)) { + __db_err(dbp->dbenv, +"a partial put in the presence of duplicates requires a cursor operation"); + return (EINVAL); + } + + return (0); +} + +/* + * __db_statchk -- + * Common stat argument checking routine. + * + * PUBLIC: int __db_statchk __P((const DB *, u_int32_t)); + */ +int +__db_statchk(dbp, flags) + const DB *dbp; + u_int32_t flags; +{ + /* Check for invalid function flags. */ + switch (flags) { + case 0: + break; + case DB_RECORDCOUNT: + if (dbp->type == DB_RECNO) + break; + if (dbp->type == DB_BTREE && F_ISSET(dbp, DB_BT_RECNUM)) + break; + goto err; + default: +err: return (__db_ferr(dbp->dbenv, "DB->stat", 0)); + } + + return (0); +} + +/* + * __db_syncchk -- + * Common sync argument checking routine. + * + * PUBLIC: int __db_syncchk __P((const DB *, u_int32_t)); + */ +int +__db_syncchk(dbp, flags) + const DB *dbp; + u_int32_t flags; +{ + /* Check for invalid function flags. */ + switch (flags) { + case 0: + break; + default: + return (__db_ferr(dbp->dbenv, "DB->sync", 0)); + } + + return (0); +} + +/* + * __dbt_ferr -- + * Check a DBT for flag errors. + */ +static int +__dbt_ferr(dbp, name, dbt, check_thread) + const DB *dbp; + const char *name; + const DBT *dbt; + int check_thread; +{ + int ret; + + /* + * Check for invalid DBT flags. 
We allow any of the flags to be + * specified to any DB or DBcursor call so that applications can + * set DB_DBT_MALLOC when retrieving a data item from a secondary + * database and then specify that same DBT as a key to a primary + * database, without having to clear flags. + */ + if ((ret = __db_fchk(dbp->dbenv, name, dbt->flags, + DB_DBT_MALLOC | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0) + return (ret); + if ((ret = __db_fcchk(dbp->dbenv, name, + dbt->flags, DB_DBT_MALLOC, DB_DBT_USERMEM)) != 0) + return (ret); + + if (check_thread && F_ISSET(dbp, DB_AM_THREAD) && + !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_USERMEM)) { + __db_err(dbp->dbenv, + "missing flag thread flag for %s DBT", name); + return (EINVAL); + } + return (0); +} + +/* + * __db_eopnotsup -- + * Common operation not supported message. + * + * PUBLIC: int __db_eopnotsup __P((const DB_ENV *)); + */ +int +__db_eopnotsup(dbenv) + const DB_ENV *dbenv; +{ + __db_err(dbenv, "operation not supported"); +#ifdef EOPNOTSUPP + return (EOPNOTSUPP); +#else + return (EINVAL); +#endif +} + +/* + * __db_keyempty -- + * Common missing or empty key value message. + */ +static int +__db_keyempty(dbenv) + const DB_ENV *dbenv; +{ + __db_err(dbenv, "missing or empty key value specified"); + return (EINVAL); +} + +/* + * __db_rdonly -- + * Common readonly message. + */ +static int +__db_rdonly(dbenv, name) + const DB_ENV *dbenv; + const char *name; +{ + __db_err(dbenv, "%s: attempt to modify a read-only tree", name); + return (EACCES); +} diff --git a/db2/db/db_join.c b/db2/db/db_join.c new file mode 100644 index 0000000000..a4051c20b0 --- /dev/null +++ b/db2/db/db_join.c @@ -0,0 +1,271 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)db_join.c 10.10 (Sleepycat) 10/9/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_join.h" +#include "db_am.h" +#include "common_ext.h" + +static int __db_join_close __P((DBC *)); +static int __db_join_del __P((DBC *, u_int32_t)); +static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t)); + +/* + * This is the duplicate-assisted join functionality. Right now we're + * going to write it such that we return one item at a time, although + * I think we may need to optimize it to return them all at once. + * It should be easier to get it working this way, and I believe that + * changing it should be fairly straightforward. + * + * XXX + * Right now we do not maintain the number of duplicates so we do + * not optimize the join. If the caller does, then best performance + * will be achieved by putting the cursor with the smallest cardinality + * first. + * + * The first cursor moves sequentially through the duplicate set while + * the others search explicitly for the duplicate in question. + * + */ + +/* + * __db_join -- + * This is the interface to the duplicate-assisted join functionality. + * In the same way that cursors mark a position in a database, a cursor + * can mark a position in a join. While most cursors are created by the + * cursor method of a DB, join cursors are created through an explicit + * call to DB->join. + * + * The curslist is an array of existing, initialized cursors and primary + * is the DB of the primary file. The data item that joins all the + * cursors in the curslist is used as the key into the primary and that + * key and data are returned. When no more items are left in the join + * set, the c_next operation off the join cursor will return DB_NOTFOUND. 
+ * + * PUBLIC: int __db_join __P((DB *, DBC **, u_int32_t, DBC **)); + */ +int +__db_join(primary, curslist, flags, dbcp) + DB *primary; + DBC **curslist, **dbcp; + u_int32_t flags; +{ + DBC *dbc; + JOIN_CURSOR *jc; + int i, ret; + + DB_PANIC_CHECK(primary); + + if ((ret = __db_joinchk(primary, flags)) != 0) + return (ret); + + if (curslist == NULL || curslist[0] == NULL) + return (EINVAL); + + dbc = NULL; + jc = NULL; + + if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0) + goto err; + + if ((ret = __os_calloc(1, sizeof(JOIN_CURSOR), &jc)) != 0) + goto err; + + if ((ret = __os_malloc(256, NULL, &jc->j_key.data)) != 0) + goto err; + jc->j_key.ulen = 256; + F_SET(&jc->j_key, DB_DBT_USERMEM); + + for (jc->j_curslist = curslist; + *jc->j_curslist != NULL; jc->j_curslist++) + ; + if ((ret = __os_calloc((jc->j_curslist - curslist + 1), + sizeof(DBC *), &jc->j_curslist)) != 0) + goto err; + for (i = 0; curslist[i] != NULL; i++) { + if (i != 0) + F_SET(curslist[i], DBC_KEYSET); + jc->j_curslist[i] = curslist[i]; + } + + dbc->c_close = __db_join_close; + dbc->c_del = __db_join_del; + dbc->c_get = __db_join_get; + dbc->c_put = __db_join_put; + dbc->internal = jc; + dbc->dbp = primary; + jc->j_init = 1; + jc->j_primary = primary; + + *dbcp = dbc; + + return (0); + +err: if (jc != NULL) { + if (jc->j_curslist != NULL) + __os_free(jc->j_curslist, + (jc->j_curslist - curslist + 1) * sizeof(DBC *)); + __os_free(jc, sizeof(JOIN_CURSOR)); + } + if (dbc != NULL) + __os_free(dbc, sizeof(DBC)); + return (ret); +} + +static int +__db_join_put(dbc, key, data, flags) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t flags; +{ + DB_PANIC_CHECK(dbc->dbp); + + COMPQUIET(key, NULL); + COMPQUIET(data, NULL); + COMPQUIET(flags, 0); + return (EINVAL); +} + +static int +__db_join_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + DB_PANIC_CHECK(dbc->dbp); + + COMPQUIET(flags, 0); + return (EINVAL); +} + +static int +__db_join_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + 
u_int32_t flags; +{ + DB *dbp; + DBC **cpp; + JOIN_CURSOR *jc; + int ret; + u_int32_t operation; + + dbp = dbc->dbp; + + DB_PANIC_CHECK(dbp); + + operation = LF_ISSET(DB_OPFLAGS_MASK); + if (operation != 0 && operation != DB_JOIN_ITEM) + return (__db_ferr(dbp->dbenv, "DBcursor->c_get", 0)); + + LF_CLR(DB_OPFLAGS_MASK); + if ((ret = + __db_fchk(dbp->dbenv, "DBcursor->c_get", flags, DB_RMW)) != 0) + return (ret); + + jc = (JOIN_CURSOR *)dbc->internal; +retry: + ret = jc->j_curslist[0]->c_get(jc->j_curslist[0], + &jc->j_key, key, jc->j_init ? DB_CURRENT : DB_NEXT_DUP); + + if (ret == ENOMEM) { + jc->j_key.ulen <<= 1; + if ((ret = __os_realloc(&jc->j_key.data, jc->j_key.ulen)) != 0) + return (ret); + goto retry; + } + if (ret != 0) + return (ret); + + jc->j_init = 0; + do { + /* + * We have the first element; now look for it in the + * other cursors. + */ + for (cpp = jc->j_curslist + 1; *cpp != NULL; cpp++) { +retry2: if ((ret = ((*cpp)->c_get)(*cpp, + &jc->j_key, key, DB_GET_BOTH)) == DB_NOTFOUND) + break; + if (ret == ENOMEM) { + jc->j_key.ulen <<= 1; + if ((ret = __os_realloc(&jc->j_key.data, + jc->j_key.ulen)) != 0) + return (ret); + goto retry2; + } + if (F_ISSET(*cpp, DBC_KEYSET)) { + F_CLR(*cpp, DBC_KEYSET); + F_SET(*cpp, DBC_CONTINUE); + } + } + + /* + * If we got out of here with ret != 0, then we failed to + * find the duplicate in one of the files, so we go on to + * the next item in the outermost relation. If ret was + * equal to 0, then we've got something to return. + */ + if (ret == 0) + break; + } while ((ret = jc->j_curslist[0]->c_get(jc->j_curslist[0], + &jc->j_key, key, DB_NEXT_DUP)) == 0); + + /* + * If ret != 0 here, we've exhausted the first file. Otherwise, + * key and data are set and we need to do the lookup on the + * primary. 
+ */ + if (ret != 0) + return (ret); + + if (operation == DB_JOIN_ITEM) + return (0); + else + return ((jc->j_primary->get)(jc->j_primary, + jc->j_curslist[0]->txn, key, data, 0)); +} + +static int +__db_join_close(dbc) + DBC *dbc; +{ + JOIN_CURSOR *jc; + int i; + + DB_PANIC_CHECK(dbc->dbp); + + jc = (JOIN_CURSOR *)dbc->internal; + + /* + * Clear the optimization flag in the cursors. + */ + for (i = 0; jc->j_curslist[i] != NULL; i++) + F_CLR(jc->j_curslist[i], DBC_CONTINUE | DBC_KEYSET); + + __os_free(jc->j_curslist, 0); + __os_free(jc->j_key.data, jc->j_key.ulen); + __os_free(jc, sizeof(JOIN_CURSOR)); + __os_free(dbc, sizeof(DBC)); + + return (0); +} diff --git a/db2/db/db_overflow.c b/db2/db/db_overflow.c index d28740dcbe..0efcc9de7f 100644 --- a/db2/db/db_overflow.c +++ b/db2/db/db_overflow.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_overflow.c 10.11 (Sleepycat) 5/7/98"; +static const char sccsid[] = "@(#)db_overflow.c 10.21 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -60,6 +60,7 @@ static const char sccsid[] = "@(#)db_overflow.c 10.11 (Sleepycat) 5/7/98"; #include "db_int.h" #include "db_page.h" #include "db_am.h" +#include "common_ext.h" /* * Big key/data code. @@ -106,29 +107,20 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) needed = tlen; } - /* - * Allocate any necessary memory. - * - * XXX: Never allocate 0 bytes; - */ + /* Allocate any necessary memory. */ if (F_ISSET(dbt, DB_DBT_USERMEM)) { if (needed > dbt->ulen) { dbt->size = needed; return (ENOMEM); } } else if (F_ISSET(dbt, DB_DBT_MALLOC)) { - dbt->data = dbp->db_malloc == NULL ? - (void *)__db_malloc(needed + 1) : - (void *)dbp->db_malloc(needed + 1); - if (dbt->data == NULL) - return (ENOMEM); + if ((ret = + __os_malloc(needed, dbp->db_malloc, &dbt->data)) != 0) + return (ret); } else if (*bpsz == 0 || *bpsz < needed) { - *bpp = (*bpp == NULL ? 
- (void *)__db_malloc(needed + 1) : - (void *)__db_realloc(*bpp, needed + 1)); - if (*bpp == NULL) - return (ENOMEM); - *bpsz = needed + 1; + if ((ret = __os_realloc(bpp, needed)) != 0) + return (ret); + *bpsz = needed; dbt->data = *bpp; } else dbt->data = *bpp; @@ -168,16 +160,17 @@ __db_goff(dbp, dbt, tlen, pgno, bpp, bpsz) * __db_poff -- * Put an offpage item. * - * PUBLIC: int __db_poff __P((DB *, const DBT *, db_pgno_t *, - * PUBLIC: int (*)(DB *, u_int32_t, PAGE **))); + * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *, + * PUBLIC: int (*)(DBC *, u_int32_t, PAGE **))); */ int -__db_poff(dbp, dbt, pgnop, newfunc) - DB *dbp; +__db_poff(dbc, dbt, pgnop, newfunc) + DBC *dbc; const DBT *dbt; db_pgno_t *pgnop; - int (*newfunc) __P((DB *, u_int32_t, PAGE **)); + int (*newfunc) __P((DBC *, u_int32_t, PAGE **)); { + DB *dbp; PAGE *pagep, *lastp; DB_LSN new_lsn, null_lsn; DBT tmp_dbt; @@ -191,6 +184,7 @@ __db_poff(dbp, dbt, pgnop, newfunc) * number of bytes we get for pages we fill completely with a single * item. */ + dbp = dbc->dbp; pagespace = P_MAXSPACE(dbp->pgsize); lastp = NULL; @@ -208,13 +202,13 @@ __db_poff(dbp, dbt, pgnop, newfunc) * the item onto the page. If sz is less than pagespace, we * have a partial record. */ - if ((ret = newfunc(dbp, P_OVERFLOW, &pagep)) != 0) + if ((ret = newfunc(dbc, P_OVERFLOW, &pagep)) != 0) return (ret); - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { tmp_dbt.data = p; tmp_dbt.size = pagespace; ZERO_LSN(null_lsn); - if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __db_big_log(dbp->dbenv->lg_info, dbc->txn, &new_lsn, 0, DB_ADD_BIG, dbp->log_fileid, PGNO(pagep), lastp ? PGNO(lastp) : PGNO_INVALID, PGNO_INVALID, &tmp_dbt, &LSN(pagep), @@ -256,24 +250,26 @@ __db_poff(dbp, dbt, pgnop, newfunc) * __db_ovref -- * Increment/decrement the reference count on an overflow page. 
* - * PUBLIC: int __db_ovref __P((DB *, db_pgno_t, int32_t)); + * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t, int32_t)); */ int -__db_ovref(dbp, pgno, adjust) - DB *dbp; +__db_ovref(dbc, pgno, adjust) + DBC *dbc; db_pgno_t pgno; int32_t adjust; { + DB *dbp; PAGE *h; int ret; + dbp = dbc->dbp; if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) { (void)__db_pgerr(dbp, pgno); return (ret); } - if (DB_LOGGING(dbp)) - if ((ret = __db_ovref_log(dbp->dbenv->lg_info, dbp->txn, + if (DB_LOGGING(dbc)) + if ((ret = __db_ovref_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 0, dbp->log_fileid, h->pgno, adjust, &LSN(h))) != 0) return (ret); @@ -287,19 +283,21 @@ __db_ovref(dbp, pgno, adjust) * __db_doff -- * Delete an offpage chain of overflow pages. * - * PUBLIC: int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); + * PUBLIC: int __db_doff __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *))); */ int -__db_doff(dbp, pgno, freefunc) - DB *dbp; +__db_doff(dbc, pgno, freefunc) + DBC *dbc; db_pgno_t pgno; - int (*freefunc) __P((DB *, PAGE *)); + int (*freefunc) __P((DBC *, PAGE *)); { + DB *dbp; PAGE *pagep; DB_LSN null_lsn; DBT tmp_dbt; int ret; + dbp = dbc->dbp; do { if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) { (void)__db_pgerr(dbp, pgno); @@ -312,21 +310,21 @@ __db_doff(dbp, pgno, freefunc) */ if (TYPE(pagep) == P_OVERFLOW && OV_REF(pagep) > 1) { (void)memp_fput(dbp->mpf, pagep, 0); - return (__db_ovref(dbp, pgno, -1)); + return (__db_ovref(dbc, pgno, -1)); } - if (DB_LOGGING(dbp)) { + if (DB_LOGGING(dbc)) { tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD; tmp_dbt.size = OV_LEN(pagep); ZERO_LSN(null_lsn); - if ((ret = __db_big_log(dbp->dbenv->lg_info, dbp->txn, + if ((ret = __db_big_log(dbp->dbenv->lg_info, dbc->txn, &LSN(pagep), 0, DB_REM_BIG, dbp->log_fileid, PGNO(pagep), PREV_PGNO(pagep), NEXT_PGNO(pagep), &tmp_dbt, &LSN(pagep), &null_lsn, &null_lsn)) != 0) return (ret); } pgno = pagep->next_pgno; - if ((ret = freefunc(dbp, pagep)) != 0) + if ((ret = 
freefunc(dbc, pagep)) != 0) return (ret); } while (pgno != PGNO_INVALID); @@ -339,44 +337,71 @@ __db_doff(dbp, pgno, freefunc) * * Given a starting page number and a key, return <0, 0, >0 to indicate if the * key on the page is less than, equal to or greater than the key specified. + * We optimize this by doing chunk at a time comparison unless the user has + * specified a comparison function. In this case, we need to materialize + * the entire object and call their comparison routine. * - * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t)); + * PUBLIC: int __db_moff __P((DB *, const DBT *, db_pgno_t, u_int32_t, + * PUBLIC: int (*)(const DBT *, const DBT *), int *)); */ int -__db_moff(dbp, dbt, pgno) +__db_moff(dbp, dbt, pgno, tlen, cmpfunc, cmpp) DB *dbp; const DBT *dbt; db_pgno_t pgno; + u_int32_t tlen; + int (*cmpfunc) __P((const DBT *, const DBT *)), *cmpp; { PAGE *pagep; - u_int32_t cmp_bytes, key_left; + DBT local_dbt; + void *buf; + u_int32_t bufsize, cmp_bytes, key_left; u_int8_t *p1, *p2; int ret; + /* + * If there is a user-specified comparison function, build a + * contiguous copy of the key, and call it. + */ + if (cmpfunc != NULL) { + memset(&local_dbt, 0, sizeof(local_dbt)); + buf = NULL; + bufsize = 0; + + if ((ret = __db_goff(dbp, + &local_dbt, tlen, pgno, &buf, &bufsize)) != 0) + return (ret); + *cmpp = cmpfunc(&local_dbt, dbt); + __os_free(buf, bufsize); + return (0); + } + /* While there are both keys to compare. */ - for (ret = 0, p1 = dbt->data, + for (*cmpp = 0, p1 = dbt->data, key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) { - if (memp_fget(dbp->mpf, &pgno, 0, &pagep) != 0) { - (void)__db_pgerr(dbp, pgno); - return (0); /* No system error return. */ - } + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &pagep)) != 0) + return (ret); cmp_bytes = OV_LEN(pagep) < key_left ? 
OV_LEN(pagep) : key_left; key_left -= cmp_bytes; for (p2 = (u_int8_t *)pagep + P_OVERHEAD; cmp_bytes-- > 0; ++p1, ++p2) if (*p1 != *p2) { - ret = (long)*p1 - (long)*p2; + *cmpp = (long)*p1 - (long)*p2; break; } pgno = NEXT_PGNO(pagep); - (void)memp_fput(dbp->mpf, pagep, 0); - if (ret != 0) + if ((ret = memp_fput(dbp->mpf, pagep, 0)) != 0) return (ret); + if (*cmpp != 0) + return (0); } if (key_left > 0) /* DBT is longer than page key. */ - return (-1); - if (pgno != PGNO_INVALID) /* DBT is shorter than page key. */ - return (1); + *cmpp = -1; + else if (pgno != PGNO_INVALID) /* DBT is shorter than page key. */ + *cmpp = 1; + else + *cmpp = 0; + return (0); } diff --git a/db2/db/db_pr.c b/db2/db/db_pr.c index a294cdd135..7f4364c6e1 100644 --- a/db2/db/db_pr.c +++ b/db2/db/db_pr.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_pr.c 10.29 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)db_pr.c 10.40 (Sleepycat) 11/22/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -126,11 +126,10 @@ __db_prdb(dbp) { DB_AM_MLOCAL, "local mpool" }, { DB_AM_PGDEF, "default page size" }, { DB_AM_RDONLY, "read-only" }, - { DB_AM_RECOVER, "recover" }, { DB_AM_SWAP, "needswap" }, { DB_AM_THREAD, "thread" }, - { DB_BT_RECNUM, "btree:records" }, - { DB_HS_DIRTYMETA, "hash:dirty-meta" }, + { DB_BT_RECNUM, "btree:recnum" }, + { DB_DBM_ERROR, "dbm/ndbm error" }, { DB_RE_DELIMITER, "recno:delimiter" }, { DB_RE_FIXEDLEN, "recno:fixed-length" }, { DB_RE_PAD, "recno:pad" }, @@ -178,42 +177,55 @@ __db_prbtree(dbp) static const FN mfn[] = { { BTM_DUP, "duplicates" }, { BTM_RECNO, "recno" }, - { BTM_RECNUM, "btree:records" }, + { BTM_RECNUM, "btree:recnum" }, { BTM_FIXEDLEN, "recno:fixed-length" }, { BTM_RENUMBER, "recno:renumber" }, { 0 }, }; + DBC *dbc; BTMETA *mp; BTREE *t; - EPG *epg; FILE *fp; PAGE *h; RECNO *rp; db_pgno_t i; - int ret; + int cnt, ret; + const char *sep; t = dbp->internal; fp = __db_prinit(NULL); + if ((ret = 
dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); (void)fprintf(fp, "%s\nOn-page metadata:\n", DB_LINE); i = PGNO_METADATA; - if ((ret = __bam_pget(dbp, (PAGE **)&mp, &i, 0)) != 0) + if ((ret = memp_fget(dbp->mpf, &i, 0, (PAGE **)&mp)) != 0) { + (void)dbc->c_close(dbc); return (ret); + } + fprintf(fp, "lsn.file: %lu lsn.offset: %lu\n", + (u_long)LSN(mp).file, (u_long)LSN(mp).offset); (void)fprintf(fp, "magic %#lx\n", (u_long)mp->magic); (void)fprintf(fp, "version %#lx\n", (u_long)mp->version); (void)fprintf(fp, "pagesize %lu\n", (u_long)mp->pagesize); (void)fprintf(fp, "maxkey: %lu minkey: %lu\n", (u_long)mp->maxkey, (u_long)mp->minkey); - (void)fprintf(fp, "free %lu", (u_long)mp->free); - for (i = mp->free; i != PGNO_INVALID;) { - if ((ret = __bam_pget(dbp, &h, &i, 0)) != 0) + (void)fprintf(fp, "free list: %lu", (u_long)mp->free); + for (i = mp->free, cnt = 0, sep = ", "; i != PGNO_INVALID;) { + if ((ret = memp_fget(dbp->mpf, &i, 0, &h)) != 0) return (ret); i = h->next_pgno; (void)memp_fput(dbp->mpf, h, 0); - (void)fprintf(fp, ", %lu", (u_long)i); + (void)fprintf(fp, "%s%lu", sep, (u_long)i); + if (++cnt % 10 == 0) { + (void)fprintf(fp, "\n"); + cnt = 0; + sep = ""; + } else + sep = ", "; } (void)fprintf(fp, "\n"); @@ -227,7 +239,7 @@ __db_prbtree(dbp) (u_long)t->bt_maxkey, (u_long)t->bt_minkey); (void)fprintf(fp, "bt_compare: %#lx bt_prefix: %#lx\n", (u_long)t->bt_compare, (u_long)t->bt_prefix); - if ((rp = t->bt_recno) != NULL) { + if ((rp = t->recno) != NULL) { (void)fprintf(fp, "re_delim: %#lx re_pad: %#lx re_len: %lu re_source: %s\n", (u_long)rp->re_delim, (u_long)rp->re_pad, @@ -238,13 +250,9 @@ __db_prbtree(dbp) (u_long)rp->re_cmap, (u_long)rp->re_smap, (u_long)rp->re_emap, (u_long)rp->re_msize); } - (void)fprintf(fp, "stack:"); - for (epg = t->bt_stack; epg < t->bt_sp; ++epg) - (void)fprintf(fp, " %lu", (u_long)epg->page->pgno); - (void)fprintf(fp, "\n"); (void)fprintf(fp, "ovflsize: %lu\n", (u_long)t->bt_ovflsize); (void)fflush(fp); - return (0); + 
return (dbc->c_close(dbc)); } /* @@ -258,51 +266,50 @@ __db_prhash(dbp) DB *dbp; { FILE *fp; - HTAB *t; + DBC *dbc; + HASH_CURSOR *hcp; int i, put_page, ret; db_pgno_t pgno; - t = dbp->internal; - fp = __db_prinit(NULL); + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + hcp = (HASH_CURSOR *)dbc->internal; - fprintf(fp, "\thash_accesses %lu\n", (u_long)t->hash_accesses); - fprintf(fp, "\thash_collisions %lu\n", (u_long)t->hash_collisions); - fprintf(fp, "\thash_expansions %lu\n", (u_long)t->hash_expansions); - fprintf(fp, "\thash_overflows %lu\n", (u_long)t->hash_overflows); - fprintf(fp, "\thash_bigpages %lu\n", (u_long)t->hash_bigpages); - fprintf(fp, "\n"); - - if (t->hdr == NULL) { + /* + * In this case, hcp->hdr will never be null, if we decide + * to pass dbc's to this routine instead, then it could be. + */ + if (hcp->hdr == NULL) { pgno = PGNO_METADATA; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &t->hdr)) != 0) + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &hcp->hdr)) != 0) return (ret); put_page = 1; } else put_page = 0; - fprintf(fp, "\tmagic %#lx\n", (u_long)t->hdr->magic); - fprintf(fp, "\tversion %lu\n", (u_long)t->hdr->version); - fprintf(fp, "\tpagesize %lu\n", (u_long)t->hdr->pagesize); - fprintf(fp, "\tovfl_point %lu\n", (u_long)t->hdr->ovfl_point); - fprintf(fp, "\tlast_freed %lu\n", (u_long)t->hdr->last_freed); - fprintf(fp, "\tmax_bucket %lu\n", (u_long)t->hdr->max_bucket); - fprintf(fp, "\thigh_mask %#lx\n", (u_long)t->hdr->high_mask); - fprintf(fp, "\tlow_mask %#lx\n", (u_long)t->hdr->low_mask); - fprintf(fp, "\tffactor %lu\n", (u_long)t->hdr->ffactor); - fprintf(fp, "\tnelem %lu\n", (u_long)t->hdr->nelem); - fprintf(fp, "\th_charkey %#lx\n", (u_long)t->hdr->h_charkey); + fprintf(fp, "\tmagic %#lx\n", (u_long)hcp->hdr->magic); + fprintf(fp, "\tversion %lu\n", (u_long)hcp->hdr->version); + fprintf(fp, "\tpagesize %lu\n", (u_long)hcp->hdr->pagesize); + fprintf(fp, "\tovfl_point %lu\n", (u_long)hcp->hdr->ovfl_point); + fprintf(fp, 
"\tlast_freed %lu\n", (u_long)hcp->hdr->last_freed); + fprintf(fp, "\tmax_bucket %lu\n", (u_long)hcp->hdr->max_bucket); + fprintf(fp, "\thigh_mask %#lx\n", (u_long)hcp->hdr->high_mask); + fprintf(fp, "\tlow_mask %#lx\n", (u_long)hcp->hdr->low_mask); + fprintf(fp, "\tffactor %lu\n", (u_long)hcp->hdr->ffactor); + fprintf(fp, "\tnelem %lu\n", (u_long)hcp->hdr->nelem); + fprintf(fp, "\th_charkey %#lx\n", (u_long)hcp->hdr->h_charkey); for (i = 0; i < NCACHED; i++) - fprintf(fp, "%lu ", (u_long)t->hdr->spares[i]); + fprintf(fp, "%lu ", (u_long)hcp->hdr->spares[i]); fprintf(fp, "\n"); (void)fflush(fp); if (put_page) { - (void)memp_fput(dbp->mpf, (PAGE *)t->hdr, 0); - t->hdr = NULL; + (void)memp_fput(dbp->mpf, (PAGE *)hcp->hdr, 0); + hcp->hdr = NULL; } - return (0); + return (dbc->c_close(dbc)); } /* @@ -318,22 +325,18 @@ __db_prtree(mpf, all) { PAGE *h; db_pgno_t i; - int ret, t_ret; if (set_psize == PSIZE_BOUNDARY) __db_psize(mpf); - ret = 0; for (i = PGNO_ROOT;; ++i) { - if ((ret = memp_fget(mpf, &i, 0, &h)) != 0) + if (memp_fget(mpf, &i, 0, &h) != 0) break; - if (TYPE(h) != P_INVALID) - if ((t_ret = __db_prpage(h, all)) != 0 && ret == 0) - ret = t_ret; + (void)__db_prpage(h, all); (void)memp_fput(mpf, h, 0); } (void)fflush(__db_prinit(NULL)); - return (ret); + return (0); } /* @@ -425,8 +428,7 @@ __db_prpage(h, all) (TYPE(h) == P_LRECNO && h->pgno == PGNO_ROOT)) fprintf(fp, " total records: %4lu", (u_long)RE_NREC(h)); fprintf(fp, "\n"); - if (TYPE(h) == P_LBTREE || TYPE(h) == P_LRECNO || - TYPE(h) == P_DUPLICATE || TYPE(h) == P_OVERFLOW) + if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) fprintf(fp, " prev: %4lu next: %4lu", (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h)); if (TYPE(h) == P_IBTREE || TYPE(h) == P_LBTREE) diff --git a/db2/db/db_rec.c b/db2/db/db_rec.c index 1ef6f18e61..7f577b5855 100644 --- a/db2/db/db_rec.c +++ b/db2/db/db_rec.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_rec.c 10.16 (Sleepycat) 4/28/98"; +static 
const char sccsid[] = "@(#)db_rec.c 10.19 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -40,7 +40,8 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info) void *info; { __db_addrem_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; u_int32_t change; @@ -57,9 +58,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. */ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) @@ -73,7 +72,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info) (cmp_n == 0 && !redo && argp->opcode == DB_REM_DUP)) { /* Need to redo an add, or undo a delete. */ - if ((ret = __db_pitem(file_dbp, pagep, argp->indx, argp->nbytes, + if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes, argp->hdr.size == 0 ? NULL : &argp->hdr, argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0) goto out; @@ -83,7 +82,7 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info) } else if ((cmp_n == 0 && !redo && argp->opcode == DB_ADD_DUP) || (cmp_p == 0 && redo && argp->opcode == DB_REM_DUP)) { /* Need to undo an add, or redo a delete. 
*/ - if ((ret = __db_ditem(file_dbp, + if ((ret = __db_ditem(dbc, pagep, argp->indx, argp->nbytes)) != 0) goto out; change = DB_MPOOL_DIRTY; @@ -96,8 +95,11 @@ __db_addrem_recover(logp, dbtp, lsnp, redo, info) LSN(pagep) = argp->pagelsn; } - if ((ret = memp_fput(mpf, pagep, change)) == 0) - *lsnp = argp->prev_lsn; + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; out: REC_CLOSE; } @@ -114,7 +116,8 @@ __db_split_recover(logp, dbtp, lsnp, redo, info) void *info; { __db_split_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int change, cmp_n, cmp_p, ret; @@ -130,9 +133,7 @@ __db_split_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. */ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) @@ -169,8 +170,11 @@ __db_split_recover(logp, dbtp, lsnp, redo, info) LSN(pagep) = argp->pagelsn; change = DB_MPOOL_DIRTY; } - if ((ret = memp_fput(mpf, pagep, change)) == 0) - *lsnp = argp->prev_lsn; + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; out: REC_CLOSE; } @@ -187,7 +191,8 @@ __db_big_recover(logp, dbtp, lsnp, redo, info) void *info; { __db_big_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; u_int32_t change; @@ -209,7 +214,7 @@ __db_big_recover(logp, dbtp, lsnp, redo, info) } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) - goto out; + goto out; } /* @@ -299,9 +304,7 @@ npage: if (argp->next_pgno != PGNO_INVALID) { * so we would not have to undo anything. In * this case, don't bother creating a page. 
*/ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->next_pgno, DB_MPOOL_CREATE, &pagep)) != 0) @@ -323,7 +326,8 @@ npage: if (argp->next_pgno != PGNO_INVALID) { goto out; } - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: REC_CLOSE; } @@ -343,7 +347,8 @@ __db_ovref_recover(logp, dbtp, lsnp, redo, info) void *info; { __db_ovref_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; int modified, ret; @@ -370,8 +375,11 @@ __db_ovref_recover(logp, dbtp, lsnp, redo, info) pagep->lsn = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) == 0) - *lsnp = argp->prev_lsn; + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; out: REC_CLOSE; } @@ -392,17 +400,20 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info) void *info; { __db_relink_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; - int modified, ret; + int cmp_n, cmp_p, modified, ret; REC_PRINT(__db_relink_print); REC_INTRO(__db_relink_read); /* - * There are three pages we need to check -- the page, and the - * previous and next pages, if they existed. + * There are up to three pages we need to check -- the page, and the + * previous and next pages, if they existed. For a page add operation, + * the current page is the result of a split and is being recovered + * elsewhere, so all we need do is recover the next page. */ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { if (redo) { @@ -411,6 +422,9 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info) } goto next; } + if (argp->opcode == DB_ADD_PAGE) + goto next; + modified = 0; if (log_compare(&LSN(pagep), &argp->lsn) == 0 && redo) { /* Redo the relink. 
*/ @@ -424,10 +438,8 @@ __db_relink_recover(logp, dbtp, lsnp, redo, info) pagep->lsn = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } next: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) { if (redo) { @@ -437,23 +449,27 @@ next: if ((ret = memp_fget(mpf, &argp->next, 0, &pagep)) != 0) { goto prev; } modified = 0; - if (log_compare(&LSN(pagep), &argp->lsn_next) == 0 && redo) { - /* Redo the relink. */ + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn_next); + if ((argp->opcode == DB_REM_PAGE && cmp_p == 0 && redo) || + (argp->opcode == DB_ADD_PAGE && cmp_n == 0 && !redo)) { + /* Redo the remove or undo the add. */ pagep->prev_pgno = argp->prev; pagep->lsn = *lsnp; modified = 1; - } else if (log_compare(lsnp, &LSN(pagep)) == 0 && !redo) { - /* Undo the relink. */ + } else if ((argp->opcode == DB_REM_PAGE && cmp_n == 0 && !redo) || + (argp->opcode == DB_ADD_PAGE && cmp_p == 0 && redo)) { + /* Undo the remove or redo the add. */ pagep->prev_pgno = argp->pgno; pagep->lsn = argp->lsn_next; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void)__db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; - } + if (argp->opcode == DB_ADD_PAGE) + goto done; prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) { if (redo) { @@ -476,10 +492,8 @@ prev: if ((ret = memp_fget(mpf, &argp->prev, 0, &pagep)) != 0) { pagep->lsn = argp->lsn_prev; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) { - (void) __db_panic(file_dbp); + if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; - } done: *lsnp = argp->prev_lsn; ret = 0; @@ -500,7 +514,8 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info) void *info; { __db_addpage_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; DB_MPOOLFILE *mpf; PAGE *pagep; u_int32_t change; @@ -541,8 +556,7 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. */ - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->nextpgno, DB_MPOOL_CREATE, &pagep)) != 0) @@ -563,11 +577,13 @@ __db_addpage_recover(logp, dbtp, lsnp, redo, info) LSN(pagep) = argp->nextlsn; change = DB_MPOOL_DIRTY; } - ret = memp_fput(mpf, pagep, change); + if ((ret = memp_fput(mpf, pagep, change)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; -out: if (ret == 0) - *lsnp = argp->prev_lsn; - REC_CLOSE; +out: REC_CLOSE; } /* @@ -598,46 +614,3 @@ __db_debug_recover(logp, dbtp, lsnp, redo, info) REC_NOOP_CLOSE; } - -/* - * __db_noop_recover -- - * Recovery function for noop. 
- * - * PUBLIC: int __db_noop_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); - */ -int -__db_noop_recover(logp, dbtp, lsnp, redo, info) - DB_LOG *logp; - DBT *dbtp; - DB_LSN *lsnp; - int redo; - void *info; -{ - __db_noop_args *argp; - DB *file_dbp, *mdbp; - DB_MPOOLFILE *mpf; - PAGE *pagep; - u_int32_t change; - int cmp_n, cmp_p, ret; - - REC_PRINT(__db_noop_print); - REC_INTRO(__db_noop_read); - - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) - goto out; - - cmp_n = log_compare(lsnp, &LSN(pagep)); - cmp_p = log_compare(&LSN(pagep), &argp->prevlsn); - change = 0; - if (cmp_p == 0 && redo) { - LSN(pagep) = *lsnp; - change = DB_MPOOL_DIRTY; - } else if (cmp_n == 0 && !redo) { - LSN(pagep) = argp->prevlsn; - change = DB_MPOOL_DIRTY; - } - *lsnp = argp->prev_lsn; - ret = memp_fput(mpf, pagep, change); - -out: REC_CLOSE; -} diff --git a/db2/db/db_ret.c b/db2/db/db_ret.c index 9d9b599ad6..9f0d0ecf8d 100644 --- a/db2/db/db_ret.c +++ b/db2/db/db_ret.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)db_ret.c 10.13 (Sleepycat) 5/7/98"; +static const char sccsid[] = "@(#)db_ret.c 10.16 (Sleepycat) 10/4/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -93,6 +93,8 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc) u_int32_t *memsize; void *(*db_malloc) __P((size_t)); { + int ret; + /* If returning a partial record, reset the length. */ if (F_ISSET(dbt, DB_DBT_PARTIAL)) { data = (u_int8_t *)data + dbt->doff; @@ -120,9 +122,6 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc) * guarantees consistency, i.e., the application can always free memory * without concern as to how many bytes of the record were requested. * - * XXX - * Never allocate 0 bytes, it's known to make malloc/realloc unhappy. - * * Use the memory specified by the application: DB_DBT_USERMEM. * * !!! @@ -130,11 +129,8 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc) * memory pointer is allowed to be NULL. 
*/ if (F_ISSET(dbt, DB_DBT_MALLOC)) { - dbt->data = db_malloc == NULL ? - (void *)__db_malloc(len) : - (void *)db_malloc(len + 1); - if (dbt->data == NULL) - return (ENOMEM); + if ((ret = __os_malloc(len, db_malloc, &dbt->data)) != 0) + return (ret); } else if (F_ISSET(dbt, DB_DBT_USERMEM)) { if (len != 0 && (dbt->data == NULL || dbt->ulen < len)) return (ENOMEM); @@ -142,12 +138,9 @@ __db_retcopy(dbt, data, len, memp, memsize, db_malloc) return (EINVAL); } else { if (len != 0 && (*memsize == 0 || *memsize < len)) { - *memp = *memp == NULL ? - (void *)__db_malloc(len) : - (void *)__db_realloc(*memp, len); - if (*memp == NULL) { + if ((ret = __os_realloc(memp, len)) != 0) { *memsize = 0; - return (ENOMEM); + return (ret); } *memsize = len; } diff --git a/db2/db/db_thread.c b/db2/db/db_thread.c deleted file mode 100644 index 73e2a51286..0000000000 --- a/db2/db/db_thread.c +++ /dev/null @@ -1,121 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998 - * Sleepycat Software. All rights reserved. - */ - -#include "config.h" - -#ifndef lint -static const char sccsid[] = "@(#)db_thread.c 8.15 (Sleepycat) 4/26/98"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <errno.h> -#include <string.h> -#endif - -#include "db_int.h" -#include "db_page.h" -#include "db_am.h" - -static int __db_getlockid __P((DB *, DB *)); - -/* - * __db_gethandle -- - * Called by db access method routines when the DB_THREAD flag is set. - * This routine returns a handle, either an existing handle from the - * chain of handles, or creating one if necessary. 
- * - * PUBLIC: int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **)); - */ -int -__db_gethandle(dbp, am_func, dbpp) - DB *dbp, **dbpp; - int (*am_func) __P((DB *, DB *)); -{ - DB *ret_dbp; - int ret, t_ret; - - if ((ret = __db_mutex_lock((db_mutex_t *)dbp->mutexp, -1)) != 0) - return (ret); - - if ((ret_dbp = LIST_FIRST(&dbp->handleq)) != NULL) - /* Simply take one off the list. */ - LIST_REMOVE(ret_dbp, links); - else { - /* Allocate a new handle. */ - if ((ret_dbp = (DB *)__db_malloc(sizeof(*dbp))) == NULL) { - ret = ENOMEM; - goto err; - } - memcpy(ret_dbp, dbp, sizeof(*dbp)); - ret_dbp->internal = NULL; - TAILQ_INIT(&ret_dbp->curs_queue); - - /* Set the locker, the lock structure and the lock DBT. */ - if ((ret = __db_getlockid(dbp, ret_dbp)) != 0) - goto err; - - /* Finally, call the access method specific dup function. */ - if ((ret = am_func(dbp, ret_dbp)) != 0) - goto err; - } - - *dbpp = ret_dbp; - - if (0) { -err: if (ret_dbp != NULL) - FREE(ret_dbp, sizeof(*ret_dbp)); - } - if ((t_ret = - __db_mutex_unlock((db_mutex_t *)dbp->mutexp, -1)) != 0 && ret == 0) - ret = t_ret; - return (ret); -} - -/* - * __db_puthandle -- - * Return a DB handle to the pool for later use. - * - * PUBLIC: int __db_puthandle __P((DB *)); - */ -int -__db_puthandle(dbp) - DB *dbp; -{ - DB *master; - int ret; - - master = dbp->master; - if ((ret = __db_mutex_lock((db_mutex_t *)master->mutexp, -1)) != 0) - return (ret); - - LIST_INSERT_HEAD(&master->handleq, dbp, links); - - return (__db_mutex_unlock((db_mutex_t *)master->mutexp, -1)); -} - -/* - * __db_getlockid -- - * Create a new locker ID and copy the file lock information from - * the old DB into the new one. 
- */ -static int -__db_getlockid(dbp, new_dbp) - DB *dbp, *new_dbp; -{ - int ret; - - if (F_ISSET(dbp, DB_AM_LOCKING)) { - if ((ret = lock_id(dbp->dbenv->lk_info, &new_dbp->locker)) != 0) - return (ret); - memcpy(new_dbp->lock.fileid, dbp->lock.fileid, DB_FILE_ID_LEN); - new_dbp->lock_dbt.size = sizeof(new_dbp->lock); - new_dbp->lock_dbt.data = &new_dbp->lock; - } - return (0); -} diff --git a/db2/db185/db185.c b/db2/db185/db185.c index 893dfa3c7f..739ada83d0 100644 --- a/db2/db185/db185.c +++ b/db2/db185/db185.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. All rights reserved.\n"; -static const char sccsid[] = "@(#)db185.c 8.17 (Sleepycat) 5/7/98"; +static const char sccsid[] = "@(#)db185.c 8.21 (Sleepycat) 11/22/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -28,6 +28,10 @@ static const char sccsid[] = "@(#)db185.c 8.17 (Sleepycat) 5/7/98"; #include "db185_int.h" #include "common_ext.h" +#ifndef STDERR_FILENO +#define STDERR_FILENO 2 +#endif + static int db185_close __P((DB185 *)); static int db185_del __P((const DB185 *, const DBT185 *, u_int)); static int db185_fd __P((const DB185 *)); @@ -37,7 +41,7 @@ static int db185_seq __P((const DB185 *, DBT185 *, DBT185 *, u_int)); static int db185_sync __P((const DB185 *, u_int)); DB185 * -__dbopen(file, oflags, mode, type, openinfo) +dbopen(file, oflags, mode, type, openinfo) const char *file; int oflags, mode; DBTYPE type; @@ -49,9 +53,10 @@ __dbopen(file, oflags, mode, type, openinfo) DB *dbp; DB185 *db185p; DB_INFO dbinfo, *dbinfop; - int s_errno; + ssize_t nw; + int fd, s_errno; - if ((db185p = (DB185 *)__db_calloc(1, sizeof(DB185))) == NULL) + if ((errno = __os_calloc(1, sizeof(DB185), &db185p)) != 0) return (NULL); dbinfop = NULL; memset(&dbinfo, 0, sizeof(dbinfo)); @@ -93,7 +98,8 @@ __dbopen(file, oflags, mode, type, openinfo) dbinfop->h_ffactor = hi->ffactor; dbinfop->h_nelem = hi->nelem; dbinfop->db_cachesize = hi->cachesize; - dbinfop->h_hash = 
hi->hash; + dbinfop->h_hash = (u_int32_t (*) + __P((const void *, u_int32_t)))hi->hash; dbinfop->db_lorder = hi->lorder; } @@ -127,14 +133,15 @@ __dbopen(file, oflags, mode, type, openinfo) * that in DB 2.0, so do that cast. */ if (file != NULL) { - if (oflags & O_CREAT && __db_exists(file, NULL) != 0) - (void)__os_close(__os_open(file, oflags, mode)); + if (oflags & O_CREAT && __os_exists(file, NULL) != 0) + if (__os_open(file, oflags, mode, &fd) == 0) + (void)__os_close(fd); dbinfop->re_source = (char *)file; - file = NULL; if (O_RDONLY) oflags &= ~O_RDONLY; oflags |= O_RDWR; + file = NULL; } if ((ri = openinfo) != NULL) { @@ -144,7 +151,8 @@ __dbopen(file, oflags, mode, type, openinfo) */ #define BFMSG "DB: DB 1.85's recno bfname field is not supported.\n" if (ri->bfname != NULL) { - (void)__os_write(2, BFMSG, sizeof(BFMSG) - 1); + (void)__os_write(STDERR_FILENO, + BFMSG, sizeof(BFMSG) - 1, &nw); goto einval; } @@ -196,27 +204,26 @@ __dbopen(file, oflags, mode, type, openinfo) */ if ((errno = db_open(file, type, __db_oflags(oflags), mode, NULL, dbinfop, &dbp)) != 0) { - __db_free(db185p); + __os_free(db185p, sizeof(DB185)); return (NULL); } /* Create the cursor used for sequential ops. */ - if ((errno = dbp->cursor(dbp, NULL, &((DB185 *)db185p)->dbc)) != 0) { + if ((errno = dbp->cursor(dbp, NULL, &((DB185 *)db185p)->dbc, 0)) != 0) { s_errno = errno; (void)dbp->close(dbp, 0); - __db_free(db185p); - __set_errno(s_errno); + __os_free(db185p, sizeof(DB185)); + errno = s_errno; return (NULL); } db185p->internal = dbp; return (db185p); -einval: __db_free(db185p); - __set_errno(EINVAL); +einval: __os_free(db185p, sizeof(DB185)); + errno = EINVAL; return (NULL); } -weak_alias (__dbopen, dbopen) static int db185_close(db185p) @@ -226,9 +233,9 @@ db185_close(db185p) dbp = (DB *)db185p->internal; - __set_errno(dbp->close(dbp, 0)); + errno = dbp->close(dbp, 0); - __db_free(db185p); + __os_free(db185p, sizeof(DB185)); return (errno == 0 ? 
0 : -1); } @@ -251,9 +258,9 @@ db185_del(db185p, key185, flags) if (flags & ~R_CURSOR) goto einval; if (flags & R_CURSOR) - __set_errno(db185p->dbc->c_del(db185p->dbc, 0)); + errno = db185p->dbc->c_del(db185p->dbc, 0); else - __set_errno(dbp->del(dbp, NULL, &key, 0)); + errno = dbp->del(dbp, NULL, &key, 0); switch (errno) { case 0: @@ -263,7 +270,7 @@ db185_del(db185p, key185, flags) } return (-1); -einval: __set_errno(EINVAL); +einval: errno = EINVAL; return (-1); } @@ -276,7 +283,7 @@ db185_fd(db185p) dbp = (DB *)db185p->internal; - return ((__set_errno(dbp->fd(dbp, &fd))) == 0 ? fd : -1); + return ((errno = dbp->fd(dbp, &fd)) == 0 ? fd : -1); } static int @@ -301,7 +308,7 @@ db185_get(db185p, key185, data185, flags) if (flags) goto einval; - switch (__set_errno(dbp->get(dbp, NULL, &key, &data, 0))) { + switch (errno = dbp->get(dbp, NULL, &key, &data, 0)) { case 0: data185->data = data.data; data185->size = data.size; @@ -311,7 +318,7 @@ db185_get(db185p, key185, data185, flags) } return (-1); -einval: __set_errno(EINVAL); +einval: errno = EINVAL; return (-1); } @@ -338,46 +345,46 @@ db185_put(db185p, key185, data185, flags) switch (flags) { case 0: - __set_errno(dbp->put(dbp, NULL, &key, &data, 0)); + errno = dbp->put(dbp, NULL, &key, &data, 0); break; case R_CURSOR: - __set_errno( - db185p->dbc->c_put(db185p->dbc, &key, &data, DB_CURRENT)); + errno = + db185p->dbc->c_put(db185p->dbc, &key, &data, DB_CURRENT); break; case R_IAFTER: case R_IBEFORE: if (dbp->type != DB_RECNO) goto einval; - if ((__set_errno(dbp->cursor(dbp, NULL, &dbcp_put))) != 0) + if ((errno = dbp->cursor(dbp, NULL, &dbcp_put, 0)) != 0) return (-1); - if ((__set_errno( - dbcp_put->c_get(dbcp_put, &key, &data, DB_SET))) != 0) { + if ((errno = + dbcp_put->c_get(dbcp_put, &key, &data, DB_SET)) != 0) { s_errno = errno; (void)dbcp_put->c_close(dbcp_put); - __set_errno(s_errno); + errno = s_errno; return (-1); } memset(&data, 0, sizeof(data)); data.data = data185->data; data.size = data185->size; - 
__set_errno(dbcp_put->c_put(dbcp_put, - &key, &data, flags == R_IAFTER ? DB_AFTER : DB_BEFORE)); + errno = dbcp_put->c_put(dbcp_put, + &key, &data, flags == R_IAFTER ? DB_AFTER : DB_BEFORE); s_errno = errno; (void)dbcp_put->c_close(dbcp_put); - __set_errno(s_errno); + errno = s_errno; break; case R_NOOVERWRITE: - __set_errno(dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE)); + errno = dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE); break; case R_SETCURSOR: if (dbp->type != DB_BTREE && dbp->type != DB_RECNO) goto einval; - if ((__set_errno(dbp->put(dbp, NULL, &key, &data, 0))) != 0) + if ((errno = dbp->put(dbp, NULL, &key, &data, 0)) != 0) break; - __set_errno(db185p->dbc->c_get(db185p->dbc, - &key, &data, DB_SET_RANGE)); + errno = + db185p->dbc->c_get(db185p->dbc, &key, &data, DB_SET_RANGE); break; default: goto einval; @@ -393,7 +400,7 @@ db185_put(db185p, key185, data185, flags) } return (-1); -einval: __set_errno(EINVAL); +einval: errno = EINVAL; return (-1); } @@ -438,8 +445,7 @@ db185_seq(db185p, key185, data185, flags) default: goto einval; } - switch (__set_errno(db185p->dbc->c_get(db185p->dbc, - &key, &data, flags))) { + switch (errno = db185p->dbc->c_get(db185p->dbc, &key, &data, flags)) { case 0: key185->data = key.data; key185->size = key.size; @@ -451,7 +457,7 @@ db185_seq(db185p, key185, data185, flags) } return (-1); -einval: __set_errno(EINVAL); +einval: errno = EINVAL; return (-1); } @@ -461,6 +467,7 @@ db185_sync(db185p, flags) u_int flags; { DB *dbp; + ssize_t nw; dbp = (DB *)db185p->internal; @@ -473,14 +480,14 @@ db185_sync(db185p, flags) * We can't support the R_RECNOSYNC flag. */ #define RSMSG "DB: DB 1.85's R_RECNOSYNC sync flag is not supported.\n" - (void)__os_write(2, RSMSG, sizeof(RSMSG) - 1); + (void)__os_write(STDERR_FILENO, RSMSG, sizeof(RSMSG) - 1, &nw); goto einval; default: goto einval; } - return ((__set_errno(dbp->sync(dbp, 0))) == 0 ? 0 : -1); + return ((errno = dbp->sync(dbp, 0)) == 0 ? 
0 : -1); -einval: __set_errno(EINVAL); +einval: errno = EINVAL; return (-1); } diff --git a/db2/db_185.h b/db2/db_185.h index 0be51f5074..a928ca8fd5 100644 --- a/db2/db_185.h +++ b/db2/db_185.h @@ -65,11 +65,11 @@ #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ - - - - - +@u_int8_decl@ +@int16_decl@ +@u_int16_decl@ +@int32_decl@ +@u_int32_decl@ #endif /* diff --git a/db2/db_int.h b/db2/db_int.h index 92a3817764..0016240e70 100644 --- a/db2/db_int.h +++ b/db2/db_int.h @@ -4,14 +4,15 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)db_int.h.src 10.62 (Sleepycat) 5/23/98 + * @(#)db_int.h 10.77 (Sleepycat) 1/3/99 */ #ifndef _DB_INTERNAL_H_ #define _DB_INTERNAL_H_ -#include <db.h> /* Standard DB include file. */ +#include "db.h" /* Standard DB include file. */ #include "queue.h" +#include "shqueue.h" /******************************************************* * General purpose constants and macros. @@ -75,27 +76,7 @@ #define R_ADDR(base, offset) ((void *)((u_int8_t *)((base)->addr) + offset)) #define R_OFFSET(base, p) ((u_int8_t *)(p) - (u_int8_t *)(base)->addr) -/* Free and free-string macros that overwrite memory. */ -#ifdef DIAGNOSTIC -#undef FREE -#define FREE(p, len) { \ - memset(p, 0xff, len); \ - __db_free(p); \ -} -#undef FREES -#define FREES(p) { \ - FREE(p, strlen(p)); \ -} -#else -#undef FREE -#define FREE(p, len) { \ - __db_free(p); \ -} -#undef FREES -#define FREES(p) { \ - __db_free(p); \ -} -#endif +#define DB_DEFAULT 0x000000 /* No flag was specified. */ /* Structure used to print flag values. */ typedef struct __fn { @@ -111,25 +92,29 @@ typedef struct __fn { #define LF_CLR(f) (flags &= ~(f)) #define LF_ISSET(f) (flags & (f)) +/* + * Panic check: + * All interfaces check the panic flag, if it's set, the tree is dead. + */ +#define DB_PANIC_CHECK(dbp) { \ + if ((dbp)->dbenv != NULL && (dbp)->dbenv->db_panic != 0) \ + return (DB_RUNRECOVERY); \ +} + /* Display separator string. 
*/ #undef DB_LINE #define DB_LINE "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" -/* Global variables. */ -typedef struct __db_globals { - int db_mutexlocks; /* DB_MUTEXLOCKS */ - int db_region_anon; /* DB_REGION_ANON, DB_REGION_NAME */ - int db_region_init; /* DB_REGION_INIT */ - int db_tsl_spins; /* DB_TSL_SPINS */ - int db_pageyield; /* DB_PAGEYIELD */ -} DB_GLOBALS; -extern DB_GLOBALS __db_global_values; -#define DB_GLOBAL(v) __db_global_values.v - /* Unused, or not-used-yet variable. "Shut that bloody compiler up!" */ #define COMPQUIET(n, v) (n) = (v) /* + * Purify and similar run-time tools complain about unitialized reads/writes + * for structure fields whose only purpose is padding. + */ +#define UMRW(v) (v) = 0 + +/* * Win16 needs specific syntax on callback functions. Nobody else cares. */ #ifndef DB_CALLBACK @@ -155,8 +140,6 @@ extern DB_GLOBALS __db_global_values; *******************************************************/ typedef unsigned char tsl_t; - - /* * !!! * Various systems require different alignments for mutexes (the worst we've @@ -204,21 +187,6 @@ typedef struct _db_mutex_t { if (F_ISSET(dbp, DB_AM_THREAD)) \ (void)__db_mutex_unlock((db_mutex_t *)(dbp)->mutexp, -1); -/* Btree/recno local statistics structure. */ -struct __db_bt_lstat; typedef struct __db_bt_lstat DB_BTREE_LSTAT; -struct __db_bt_lstat { - u_int32_t bt_freed; /* Pages freed for reuse. */ - u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ - u_int32_t bt_split; /* Total number of splits. */ - u_int32_t bt_rootsplit; /* Root page splits. */ - u_int32_t bt_fastsplit; /* Fast splits. */ - u_int32_t bt_added; /* Items added. */ - u_int32_t bt_deleted; /* Items deleted. */ - u_int32_t bt_get; /* Items retrieved. */ - u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ - u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ -}; - /******************************************************* * Environment. 
*******************************************************/ @@ -250,6 +218,7 @@ typedef struct _rlayout { int majver; /* Major version number. */ int minver; /* Minor version number. */ int patch; /* Patch version number. */ + int panic; /* Region is dead. */ #define INVALID_SEGID -1 int segid; /* shmget(2) ID, or Win16 segment ID. */ @@ -262,9 +231,9 @@ typedef struct _rlayout { * we don't make the underlying VM unhappy. */ #define DB_VMPAGESIZE (4 * 1024) -#define DB_ROUNDOFF(i) { \ - (i) += DB_VMPAGESIZE - 1; \ - (i) -= (i) % DB_VMPAGESIZE; \ +#define DB_ROUNDOFF(n, round) { \ + (n) += (round) - 1; \ + (n) -= (n) % (round); \ } /* @@ -292,6 +261,7 @@ struct __db_reginfo { and mmap(2) is being used to map it into our address space. */ int segid; /* shmget(2) ID, or Win16 segment ID. */ + void *wnt_handle; /* Win/NT HANDLE. */ /* Shared flags. */ /* 0x0001 COMMON MASK with RLAYOUT structure. */ @@ -334,8 +304,8 @@ typedef struct __dbpginfo { #define IS_ZERO_LSN(LSN) ((LSN).file == 0) /* Test if we need to log a change. 
*/ -#define DB_LOGGING(dbp) \ - (F_ISSET(dbp, DB_AM_LOGGING) && !F_ISSET(dbp, DB_AM_RECOVER)) +#define DB_LOGGING(dbc) \ + (F_ISSET((dbc)->dbp, DB_AM_LOGGING) && !F_ISSET(dbc, DBC_RECOVER)) #ifdef DIAGNOSTIC /* @@ -350,30 +320,30 @@ typedef struct __dbpginfo { * A data * F flags */ -#define LOG_OP(D, T, O, K, A, F) { \ +#define LOG_OP(C, T, O, K, A, F) { \ DB_LSN _lsn; \ DBT _op; \ - if (DB_LOGGING((D))) { \ + if (DB_LOGGING((C))) { \ memset(&_op, 0, sizeof(_op)); \ _op.data = O; \ _op.size = strlen(O) + 1; \ - (void)__db_debug_log((D)->dbenv->lg_info, \ - T, &_lsn, 0, &_op, (D)->log_fileid, K, A, F); \ + (void)__db_debug_log((C)->dbp->dbenv->lg_info, \ + T, &_lsn, 0, &_op, (C)->dbp->log_fileid, K, A, F); \ } \ } #ifdef DEBUG_ROP -#define DEBUG_LREAD(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) +#define DEBUG_LREAD(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F) #else -#define DEBUG_LREAD(D, T, O, K, A, F) +#define DEBUG_LREAD(C, T, O, K, A, F) #endif #ifdef DEBUG_WOP -#define DEBUG_LWRITE(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) +#define DEBUG_LWRITE(C, T, O, K, A, F) LOG_OP(C, T, O, K, A, F) #else -#define DEBUG_LWRITE(D, T, O, K, A, F) +#define DEBUG_LWRITE(C, T, O, K, A, F) #endif #else -#define DEBUG_LREAD(D, T, O, K, A, F) -#define DEBUG_LWRITE(D, T, O, K, A, F) +#define DEBUG_LREAD(C, T, O, K, A, F) +#define DEBUG_LWRITE(C, T, O, K, A, F) #endif /* DIAGNOSTIC */ /******************************************************* @@ -393,10 +363,45 @@ struct __db_txn { DB_LSN last_lsn; /* Lsn of last log write. */ u_int32_t txnid; /* Unique transaction id. */ size_t off; /* Detail structure within region. */ - TAILQ_ENTRY(__db_txn) links; + TAILQ_ENTRY(__db_txn) links; /* Links transactions off manager. */ + TAILQ_HEAD(__kids, __db_txn) kids; /* Child transactions. */ + TAILQ_ENTRY(__db_txn) klinks; /* Links child transactions. */ + +#define TXN_MALLOC 0x01 /* Structure allocated by TXN system. 
*/ + u_int32_t flags; +}; + +/******************************************************* + * Global variables. + *******************************************************/ +/* + * !!! + * Initialized in os/os_config.c, don't change this unless you change it + * as well. + */ + +struct __rmname { + char *dbhome; + int rmid; + TAILQ_ENTRY(__rmname) links; }; -#include "os_func.h" +typedef struct __db_globals { + int db_mutexlocks; /* DB_MUTEXLOCKS */ + int db_pageyield; /* DB_PAGEYIELD */ + int db_region_anon; /* DB_REGION_ANON, DB_REGION_NAME */ + int db_region_init; /* DB_REGION_INIT */ + int db_tsl_spins; /* DB_TSL_SPINS */ + /* XA: list of opened environments. */ + TAILQ_HEAD(__db_envq, __db_env) db_envq; + /* XA: list of id to dbhome mappings. */ + TAILQ_HEAD(__db_nameq, __rmname) db_nameq; +} DB_GLOBALS; + +extern DB_GLOBALS __db_global_values; +#define DB_GLOBAL(v) __db_global_values.v + +#include "os.h" #include "os_ext.h" #endif /* !_DB_INTERNAL_H_ */ diff --git a/db2/dbm/dbm.c b/db2/dbm/dbm.c index 261fe81ff2..5bcb53f023 100644 --- a/db2/dbm/dbm.c +++ b/db2/dbm/dbm.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)dbm.c 10.16 (Sleepycat) 5/7/98"; +static const char sccsid[] = "@(#)dbm.c 10.23 (Sleepycat) 11/22/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -89,6 +89,16 @@ __db_dbm_init(file) } weak_alias (__db_dbm_init, dbminit) +int +__db_dbm_close() +{ + if (__cur_db != NULL) { + dbm_close(__cur_db); + __cur_db = NULL; + } + return (0); +} + datum __db_dbm_fetch(key) datum key; @@ -140,16 +150,11 @@ int __db_dbm_delete(key) datum key; { - int ret; - if (__cur_db == NULL) { __db_no_open(); return (-1); } - ret = dbm_delete(__cur_db, key); - if (ret == 0) - ret = (((DB *)__cur_db)->sync)((DB *)__cur_db, 0); - return (ret); + return (dbm_delete(__cur_db, key)); } weak_alias (__db_dbm_delete, delete) @@ -157,16 +162,11 @@ int __db_dbm_store(key, dat) datum key, dat; { - int ret; - if (__cur_db == NULL) { 
__db_no_open(); return (-1); } - ret = dbm_store(__cur_db, key, dat, DBM_REPLACE); - if (ret == 0) - ret = (((DB *)__cur_db)->sync)((DB *)__cur_db, 0); - return (ret); + return (dbm_store(__cur_db, key, dat, DBM_REPLACE)); } weak_alias (__db_dbm_store, store) @@ -192,7 +192,9 @@ __db_ndbm_open(file, oflags, mode) int oflags, mode; { DB *dbp; + DBC *dbc; DB_INFO dbinfo; + int sv_errno; char path[MAXPATHLEN]; memset(&dbinfo, 0, sizeof(dbinfo)); @@ -215,7 +217,15 @@ __db_ndbm_open(file, oflags, mode) if ((errno = db_open(path, DB_HASH, __db_oflags(oflags), mode, NULL, &dbinfo, &dbp)) != 0) return (NULL); - return ((DBM *)dbp); + + if ((errno = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) { + sv_errno = errno; + (void)dbp->close(dbp, 0); + errno = sv_errno; + return (NULL); + } + + return ((DBM *)dbc); } weak_alias (__db_ndbm_open, dbm_open) @@ -224,10 +234,14 @@ weak_alias (__db_ndbm_open, dbm_open) * Nothing. */ void -__db_ndbm_close(db) - DBM *db; +__db_ndbm_close(dbm) + DBM *dbm; { - (void)db->close(db, 0); + DBC *dbc; + + dbc = (DBC *)dbm; + + (void)dbc->dbp->close(dbc->dbp, 0); } weak_alias (__db_ndbm_close, dbm_close) @@ -237,25 +251,39 @@ weak_alias (__db_ndbm_close, dbm_close) * NULL on failure */ datum -__db_ndbm_fetch(db, key) - DBM *db; +__db_ndbm_fetch(dbm, key) + DBM *dbm; datum key; { + DBC *dbc; DBT _key, _data; datum data; int ret; + dbc = (DBC *)dbm; + memset(&_key, 0, sizeof(DBT)); memset(&_data, 0, sizeof(DBT)); _key.size = key.dsize; _key.data = key.dptr; - if ((ret = db->get((DB *)db, NULL, &_key, &_data, 0)) == 0) { + + /* + * Note that we can't simply use the dbc we have to do a c_get/SET, + * because that cursor is the one used for sequential iteration and + * it has to remain stable in the face of intervening gets and puts. + */ + if ((ret = dbc->dbp->get(dbc->dbp, NULL, &_key, &_data, 0)) == 0) { data.dptr = _data.data; data.dsize = _data.size; } else { data.dptr = NULL; data.dsize = 0; - __set_errno (ret == DB_NOTFOUND ? 
ENOENT : ret); + if (ret == DB_NOTFOUND) + errno = ENOENT; + else { + errno = ret; + F_SET(dbc->dbp, DB_DBM_ERROR); + } } return (data); } @@ -267,30 +295,31 @@ weak_alias (__db_ndbm_fetch, dbm_fetch) * NULL on failure */ datum -__db_ndbm_firstkey(db) - DBM *db; +__db_ndbm_firstkey(dbm) + DBM *dbm; { + DBC *dbc; DBT _key, _data; datum key; int ret; - DBC *cp; - - if ((cp = TAILQ_FIRST(&db->curs_queue)) == NULL) - if ((errno = db->cursor(db, NULL, &cp)) != 0) { - memset(&key, 0, sizeof(key)); - return (key); - } + dbc = (DBC *)dbm; memset(&_key, 0, sizeof(DBT)); memset(&_data, 0, sizeof(DBT)); - if ((ret = (cp->c_get)(cp, &_key, &_data, DB_FIRST)) == 0) { + + if ((ret = dbc->c_get(dbc, &_key, &_data, DB_FIRST)) == 0) { key.dptr = _key.data; key.dsize = _key.size; } else { key.dptr = NULL; key.dsize = 0; - __set_errno (ret == DB_NOTFOUND ? ENOENT : ret); + if (ret == DB_NOTFOUND) + errno = ENOENT; + else { + errno = ret; + F_SET(dbc->dbp, DB_DBM_ERROR); + } } return (key); } @@ -302,29 +331,31 @@ weak_alias (__db_ndbm_firstkey, dbm_firstkey) * NULL on failure */ datum -__db_ndbm_nextkey(db) - DBM *db; +__db_ndbm_nextkey(dbm) + DBM *dbm; { - DBC *cp; + DBC *dbc; DBT _key, _data; datum key; int ret; - if ((cp = TAILQ_FIRST(&db->curs_queue)) == NULL) - if ((errno = db->cursor(db, NULL, &cp)) != 0) { - memset(&key, 0, sizeof(key)); - return (key); - } + dbc = (DBC *)dbm; memset(&_key, 0, sizeof(DBT)); memset(&_data, 0, sizeof(DBT)); - if ((ret = (cp->c_get)(cp, &_key, &_data, DB_NEXT)) == 0) { + + if ((ret = dbc->c_get(dbc, &_key, &_data, DB_NEXT)) == 0) { key.dptr = _key.data; key.dsize = _key.size; } else { key.dptr = NULL; key.dsize = 0; - __set_errno (ret == DB_NOTFOUND ? 
ENOENT : ret); + if (ret == DB_NOTFOUND) + errno = ENOENT; + else { + errno = ret; + F_SET(dbc->dbp, DB_DBM_ERROR); + } } return (key); } @@ -336,19 +367,29 @@ weak_alias (__db_ndbm_nextkey, dbm_nextkey) * <0 failure */ int -__db_ndbm_delete(db, key) - DBM *db; +__db_ndbm_delete(dbm, key) + DBM *dbm; datum key; { + DBC *dbc; DBT _key; int ret; + dbc = (DBC *)dbm; + memset(&_key, 0, sizeof(DBT)); _key.data = key.dptr; _key.size = key.dsize; - if ((ret = (((DB *)db)->del)((DB *)db, NULL, &_key, 0)) == 0) + + if ((ret = dbc->dbp->del(dbc->dbp, NULL, &_key, 0)) == 0) return (0); - errno = ret == DB_NOTFOUND ? ENOENT : ret; + + if (ret == DB_NOTFOUND) + errno = ENOENT; + else { + errno = ret; + F_SET(dbc->dbp, DB_DBM_ERROR); + } return (-1); } weak_alias (__db_ndbm_delete, dbm_delete) @@ -360,49 +401,59 @@ weak_alias (__db_ndbm_delete, dbm_delete) * 1 if DBM_INSERT and entry exists */ int -__db_ndbm_store(db, key, data, flags) - DBM *db; +__db_ndbm_store(dbm, key, data, flags) + DBM *dbm; datum key, data; int flags; { + DBC *dbc; DBT _key, _data; int ret; + dbc = (DBC *)dbm; + memset(&_key, 0, sizeof(DBT)); - memset(&_data, 0, sizeof(DBT)); _key.data = key.dptr; _key.size = key.dsize; + + memset(&_data, 0, sizeof(DBT)); _data.data = data.dptr; _data.size = data.dsize; - if ((ret = db->put((DB *)db, NULL, + + if ((ret = dbc->dbp->put(dbc->dbp, NULL, &_key, &_data, flags == DBM_INSERT ? 
DB_NOOVERWRITE : 0)) == 0) return (0); + if (ret == DB_KEYEXIST) return (1); + errno = ret; + F_SET(dbc->dbp, DB_DBM_ERROR); return (-1); } weak_alias (__db_ndbm_store, dbm_store) int -__db_ndbm_error(db) - DBM *db; +__db_ndbm_error(dbm) + DBM *dbm; { - HTAB *hp; + DBC *dbc; - hp = (HTAB *)db->internal; - return (hp->local_errno); + dbc = (DBC *)dbm; + + return (F_ISSET(dbc->dbp, DB_DBM_ERROR)); } weak_alias (__db_ndbm_error, dbm_error) int -__db_ndbm_clearerr(db) - DBM *db; +__db_ndbm_clearerr(dbm) + DBM *dbm; { - HTAB *hp; + DBC *dbc; + + dbc = (DBC *)dbm; - hp = (HTAB *)db->internal; - hp->local_errno = 0; + F_CLR(dbc->dbp, DB_DBM_ERROR); return (0); } weak_alias (__db_ndbm_clearerr, dbm_clearerr) @@ -413,10 +464,14 @@ weak_alias (__db_ndbm_clearerr, dbm_clearerr) * 0 if not read-only */ int -__db_ndbm_rdonly(db) - DBM *db; +__db_ndbm_rdonly(dbm) + DBM *dbm; { - return (F_ISSET((DB *)db, DB_AM_RDONLY) ? 1 : 0); + DBC *dbc; + + dbc = (DBC *)dbm; + + return (F_ISSET(dbc->dbp, DB_AM_RDONLY) ? 1 : 0); } /* @@ -426,23 +481,23 @@ __db_ndbm_rdonly(db) * and picked one to use at random. 
*/ int -__db_ndbm_dirfno(db) - DBM *db; +__db_ndbm_dirfno(dbm) + DBM *dbm; { - int fd; - - (void)db->fd(db, &fd); - return (fd); + return (dbm_pagfno(dbm)); } weak_alias (__db_ndbm_dirfno, dbm_dirfno) int -__db_ndbm_pagfno(db) - DBM *db; +__db_ndbm_pagfno(dbm) + DBM *dbm; { + DBC *dbc; int fd; - (void)db->fd(db, &fd); + dbc = (DBC *)dbm; + + (void)dbc->dbp->fd(dbc->dbp, &fd); return (fd); } weak_alias (__db_ndbm_pagfno, dbm_pagfno) diff --git a/db2/hash/hash.c b/db2/hash/hash.c index 0265f19659..0d202fce20 100644 --- a/db2/hash/hash.c +++ b/db2/hash/hash.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)hash.c 10.45 (Sleepycat) 5/11/98"; +static const char sccsid[] = "@(#)hash.c 10.63 (Sleepycat) 12/11/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -64,23 +64,23 @@ static const char sccsid[] = "@(#)hash.c 10.45 (Sleepycat) 5/11/98"; #include "db_am.h" #include "db_ext.h" #include "hash.h" +#include "btree.h" #include "log.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" static int __ham_c_close __P((DBC *)); static int __ham_c_del __P((DBC *, u_int32_t)); +static int __ham_c_destroy __P((DBC *)); static int __ham_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); static int __ham_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); -static int __ham_c_init __P((DB *, DB_TXN *, DBC **)); -static int __ham_cursor __P((DB *, DB_TXN *, DBC **)); static int __ham_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); -static int __ham_dup_return __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); -static int __ham_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -static void __ham_init_htab __P((HTAB *, u_int32_t, u_int32_t)); -static int __ham_lookup __P((HTAB *, - HASH_CURSOR *, const DBT *, u_int32_t, db_lockmode_t)); -static int __ham_overwrite __P((HTAB *, HASH_CURSOR *, DBT *)); -static int __ham_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -static int __ham_sync __P((DB *, u_int32_t)); +static int 
__ham_dup_return __P((DBC *, DBT *, u_int32_t)); +static int __ham_expand_table __P((DBC *)); +static void __ham_init_htab __P((DBC *, u_int32_t, u_int32_t)); +static int __ham_lookup __P((DBC *, const DBT *, u_int32_t, db_lockmode_t)); +static int __ham_overwrite __P((DBC *, DBT *)); /************************** INTERFACE ROUTINES ***************************/ /* OPEN/CLOSE */ @@ -96,65 +96,53 @@ __ham_open(dbp, dbinfo) DB_INFO *dbinfo; { DB_ENV *dbenv; - DBC *curs; - HTAB *hashp; + DBC *dbc; + HASH_CURSOR *hcp; int file_existed, ret; + dbc = NULL; dbenv = dbp->dbenv; - if ((hashp = (HTAB *)__db_calloc(1, sizeof(HTAB))) == NULL) - return (ENOMEM); - hashp->dbp = dbp; - /* Set the hash function if specified by the user. */ if (dbinfo != NULL && dbinfo->h_hash != NULL) - hashp->hash = dbinfo->h_hash; + dbp->h_hash = dbinfo->h_hash; /* - * Initialize the remaining fields of the dbp. The type, close and - * fd functions are all set in db_open. + * Initialize the remaining fields of the dbp. The only function + * that differs from the default set is __ham_stat(). */ - dbp->internal = hashp; - dbp->cursor = __ham_cursor; + dbp->internal = NULL; + dbp->am_close = __ham_close; dbp->del = __ham_delete; - dbp->get = __ham_get; - dbp->put = __ham_put; - dbp->sync = __ham_sync; - - /* If locking is turned on, lock the meta data page. */ - if (F_ISSET(dbp, DB_AM_LOCKING)) { - dbp->lock.pgno = BUCKET_INVALID; - if ((ret = lock_get(dbenv->lk_info, dbp->locker, - 0, &dbp->lock_dbt, DB_LOCK_READ, &hashp->hlock)) != 0) { - if (ret < 0) - ret = EAGAIN; - goto out; - } - } + dbp->stat = __ham_stat; + + /* Get a cursor we can use for the rest of this function. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + goto out; + + hcp = (HASH_CURSOR *)dbc->internal; + GET_META(dbp, hcp, ret); + if (ret != 0) + goto out; /* - * Now, we can try to read the meta-data page and figure out - * if we set up locking and get the meta-data page properly. 
* If this is a new file, initialize it, and put it back dirty. */ - if ((ret = __ham_get_page(hashp->dbp, 0, (PAGE **)&hashp->hdr)) != 0) - goto out; - /* Initialize the hashp structure */ - if (hashp->hdr->magic == DB_HASHMAGIC) { + /* Initialize the hdr structure */ + if (hcp->hdr->magic == DB_HASHMAGIC) { file_existed = 1; /* File exists, verify the data in the header. */ - if (hashp->hash == NULL) - hashp->hash = - hashp->hdr->version < 5 ? __ham_func4 : __ham_func5; - if (hashp->hash(CHARKEY, sizeof(CHARKEY)) != - hashp->hdr->h_charkey) { - __db_err(hashp->dbp->dbenv, - "hash: incompatible hash function"); + if (dbp->h_hash == NULL) + dbp->h_hash = + hcp->hdr->version < 5 ? __ham_func4 : __ham_func5; + if (dbp->h_hash(CHARKEY, sizeof(CHARKEY)) != + hcp->hdr->h_charkey) { + __db_err(dbp->dbenv, "hash: incompatible hash function"); ret = EINVAL; goto out; } - if (F_ISSET(hashp->hdr, DB_HASH_DUP)) + if (F_ISSET(hcp->hdr, DB_HASH_DUP)) F_SET(dbp, DB_AM_DUP); } else { /* @@ -163,59 +151,27 @@ __ham_open(dbp, dbinfo) */ file_existed = 0; if (F_ISSET(dbp, DB_AM_LOCKING) && - ((ret = lock_put(dbenv->lk_info, hashp->hlock)) != 0 || - (ret = lock_get(dbenv->lk_info, dbp->locker, 0, - &dbp->lock_dbt, DB_LOCK_WRITE, &hashp->hlock)) != 0)) { + ((ret = lock_put(dbenv->lk_info, hcp->hlock)) != 0 || + (ret = lock_get(dbenv->lk_info, dbc->locker, 0, + &dbc->lock_dbt, DB_LOCK_WRITE, &hcp->hlock)) != 0)) { if (ret < 0) ret = EAGAIN; goto out; } - __ham_init_htab(hashp, - dbinfo != NULL ? dbinfo->h_nelem : 0, + __ham_init_htab(dbc, dbinfo != NULL ? dbinfo->h_nelem : 0, dbinfo != NULL ? dbinfo->h_ffactor : 0); if (F_ISSET(dbp, DB_AM_DUP)) - F_SET(hashp->hdr, DB_HASH_DUP); - if ((ret = __ham_dirty_page(hashp, (PAGE *)hashp->hdr)) != 0) + F_SET(hcp->hdr, DB_HASH_DUP); + if ((ret = __ham_dirty_page(dbp, (PAGE *)hcp->hdr)) != 0) goto out; } - /* Initialize the default cursor. 
*/ - __ham_c_init(dbp, NULL, &curs); - TAILQ_INSERT_TAIL(&dbp->curs_queue, curs, links); - - /* Allocate memory for our split buffer. */ - if ((hashp->split_buf = (PAGE *)__db_malloc(dbp->pgsize)) == NULL) { - ret = ENOMEM; - goto out; - } - -#ifdef NO_STATISTICS_FOR_DB_ERR - __db_err(dbp->dbenv, - "%s%lx\n%s%ld\n%s%ld\n%s%ld\n%s%ld\n%s0x%lx\n%s0x%lx\n%s%ld\n%s%ld\n%s0x%lx", - "TABLE POINTER ", (long)hashp, - "BUCKET SIZE ", (long)hashp->hdr->pagesize, - "FILL FACTOR ", (long)hashp->hdr->ffactor, - "MAX BUCKET ", (long)hashp->hdr->max_bucket, - "OVFL POINT ", (long)hashp->hdr->ovfl_point, - "LAST FREED ", (long)hashp->hdr->last_freed, - "HIGH MASK ", (long)hashp->hdr->high_mask, - "LOW MASK ", (long)hashp->hdr->low_mask, - "NELEM ", (long)hashp->hdr->nelem, - "FLAGS ", (long)hashp->hdr->flags); -#endif - /* Release the meta data page */ - (void)__ham_put_page(hashp->dbp, (PAGE *)hashp->hdr, 0); - if (F_ISSET(dbp, DB_AM_LOCKING) && - (ret = lock_put(dbenv->lk_info, hashp->hlock)) != 0) { - if (ret < 0) - ret = EAGAIN; + RELEASE_META(dbp, hcp); + if ((ret = dbc->c_close(dbc)) != 0) goto out; - } - hashp->hlock = 0; - hashp->hdr = NULL; /* Sync the file so that we know that the meta data goes to disk. */ if (!file_existed && (ret = dbp->sync(dbp, 0)) != 0) goto out; @@ -232,27 +188,8 @@ int __ham_close(dbp) DB *dbp; { - HTAB *hashp; - int ret, t_ret; - - DEBUG_LWRITE(dbp, NULL, "ham_close", NULL, NULL, 0); - hashp = (HTAB *)dbp->internal; - ret = 0; - - /* Free the split page. 
*/ - if (hashp->split_buf) - FREE(hashp->split_buf, dbp->pgsize); - - if (hashp->hdr && (t_ret = __ham_put_page(hashp->dbp, - (PAGE *)hashp->hdr, 0)) != 0 && ret == 0) - ret = t_ret; - if (hashp->hlock && (t_ret = lock_put(hashp->dbp->dbenv->lk_info, - hashp->hlock)) != 0 && ret == 0) - ret = t_ret; - - FREE(hashp, sizeof(HTAB)); - dbp->internal = NULL; - return (ret); + COMPQUIET(dbp, NULL); + return (0); } /************************** LOCAL CREATION ROUTINES **********************/ @@ -260,408 +197,204 @@ __ham_close(dbp) * Returns 0 on No Error */ static void -__ham_init_htab(hashp, nelem, ffactor) - HTAB *hashp; +__ham_init_htab(dbc, nelem, ffactor) + DBC *dbc; u_int32_t nelem, ffactor; { + DB *dbp; + HASH_CURSOR *hcp; int32_t l2, nbuckets; - memset(hashp->hdr, 0, sizeof(HASHHDR)); - hashp->hdr->ffactor = ffactor; - hashp->hdr->pagesize = hashp->dbp->pgsize; - ZERO_LSN(hashp->hdr->lsn); - hashp->hdr->magic = DB_HASHMAGIC; - hashp->hdr->version = DB_HASHVERSION; - if (hashp->hash == NULL) - hashp->hash = - hashp->hdr->version < 5 ? __ham_func4 : __ham_func5; - hashp->hdr->h_charkey = hashp->hash(CHARKEY, sizeof(CHARKEY)); - if (nelem != 0 && hashp->hdr->ffactor != 0) { - nelem = (nelem - 1) / hashp->hdr->ffactor + 1; + hcp = (HASH_CURSOR *)dbc->internal; + dbp = dbc->dbp; + memset(hcp->hdr, 0, sizeof(HASHHDR)); + hcp->hdr->ffactor = ffactor; + hcp->hdr->pagesize = dbp->pgsize; + ZERO_LSN(hcp->hdr->lsn); + hcp->hdr->magic = DB_HASHMAGIC; + hcp->hdr->version = DB_HASHVERSION; + + if (dbp->h_hash == NULL) + dbp->h_hash = hcp->hdr->version < 5 ? __ham_func4 : __ham_func5; + hcp->hdr->h_charkey = dbp->h_hash(CHARKEY, sizeof(CHARKEY)); + if (nelem != 0 && hcp->hdr->ffactor != 0) { + nelem = (nelem - 1) / hcp->hdr->ffactor + 1; l2 = __db_log2(nelem > 2 ? 
nelem : 2); } else l2 = 2; nbuckets = 1 << l2; - hashp->hdr->ovfl_point = l2; - hashp->hdr->last_freed = PGNO_INVALID; + hcp->hdr->ovfl_point = l2; + hcp->hdr->last_freed = PGNO_INVALID; - hashp->hdr->max_bucket = hashp->hdr->high_mask = nbuckets - 1; - hashp->hdr->low_mask = (nbuckets >> 1) - 1; - memcpy(hashp->hdr->uid, hashp->dbp->lock.fileid, DB_FILE_ID_LEN); + hcp->hdr->max_bucket = hcp->hdr->high_mask = nbuckets - 1; + hcp->hdr->low_mask = (nbuckets >> 1) - 1; + memcpy(hcp->hdr->uid, dbp->fileid, DB_FILE_ID_LEN); } -/********************** DESTROY/CLOSE ROUTINES ************************/ - - -/* - * Write modified pages to disk - * - * Returns: - * 0 == OK - * -1 ERROR - */ static int -__ham_sync(dbp, flags) - DB *dbp; - u_int32_t flags; -{ - int ret; - - DEBUG_LWRITE(dbp, NULL, "ham_sync", NULL, NULL, flags); - if ((ret = __db_syncchk(dbp, flags)) != 0) - return (ret); - if (F_ISSET(dbp, DB_AM_RDONLY)) - return (0); - - if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) - ret = 0; - - return (ret); -} - -/*******************************SEARCH ROUTINES *****************************/ -/* - * All the access routines return - * - * Returns: - * 0 on SUCCESS - * 1 to indicate an external ERROR (i.e. key not found, etc) - * -1 to indicate an internal ERROR (i.e. 
out of memory, etc) - */ - -static int -__ham_get(dbp, txn, key, data, flags) +__ham_delete(dbp, txn, key, flags) DB *dbp; DB_TXN *txn; DBT *key; - DBT *data; u_int32_t flags; { - DB *ldbp; - HTAB *hashp; + DBC *dbc; HASH_CURSOR *hcp; - int ret, t_ret; + int ret, tret; - DEBUG_LREAD(dbp, txn, "ham_get", key, NULL, flags); - if ((ret = __db_getchk(dbp, key, data, flags)) != 0) - return (ret); + DB_PANIC_CHECK(dbp); - ldbp = dbp; - if (F_ISSET(dbp, DB_AM_THREAD) && - (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0) + if ((ret = + __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) return (ret); - hashp = (HTAB *)ldbp->internal; - SET_LOCKER(ldbp, txn); - GET_META(ldbp, hashp); - - hashp->hash_accesses++; - hcp = (HASH_CURSOR *)TAILQ_FIRST(&ldbp->curs_queue)->internal; - if ((ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_READ)) == 0) { - if (F_ISSET(hcp, H_OK)) - ret = __ham_dup_return(hashp, hcp, data, DB_FIRST); - else /* Key was not found */ - ret = DB_NOTFOUND; - } - - if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0) - ret = t_ret; - RELEASE_META(ldbp, hashp); - if (F_ISSET(dbp, DB_AM_THREAD)) - __db_puthandle(ldbp); - return (ret); -} - -static int -__ham_put(dbp, txn, key, data, flags) - DB *dbp; - DB_TXN *txn; - DBT *key; - DBT *data; - u_int32_t flags; -{ - DB *ldbp; - DBT tmp_val, *myval; - HASH_CURSOR *hcp; - HTAB *hashp; - u_int32_t nbytes; - int ret, t_ret; - - DEBUG_LWRITE(dbp, txn, "ham_put", key, data, flags); - if ((ret = __db_putchk(dbp, key, data, - flags, F_ISSET(dbp, DB_AM_RDONLY), F_ISSET(dbp, DB_AM_DUP))) != 0) + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) return (ret); - ldbp = dbp; - if (F_ISSET(dbp, DB_AM_THREAD) && - (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0) - return (ret); + DEBUG_LWRITE(dbc, txn, "ham_delete", key, NULL, flags); - hashp = (HTAB *)ldbp->internal; - SET_LOCKER(ldbp, txn); - GET_META(ldbp, hashp); - hcp = TAILQ_FIRST(&ldbp->curs_queue)->internal; - - nbytes = 
(ISBIG(hashp, key->size) ? HOFFPAGE_PSIZE : - HKEYDATA_PSIZE(key->size)) + - (ISBIG(hashp, data->size) ? HOFFPAGE_PSIZE : - HKEYDATA_PSIZE(data->size)); - - hashp->hash_accesses++; - ret = __ham_lookup(hashp, hcp, key, nbytes, DB_LOCK_WRITE); - - if (ret == DB_NOTFOUND) { - ret = 0; - if (hcp->seek_found_page != PGNO_INVALID && - hcp->seek_found_page != hcp->pgno) { - if ((ret = __ham_item_done(hashp, hcp, 0)) != 0) - goto out; - hcp->pgno = hcp->seek_found_page; - hcp->bndx = NDX_INVALID; - } + hcp = (HASH_CURSOR *)dbc->internal; + GET_META(dbp, hcp, ret); + if (ret != 0) + goto out; - if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) { - /* - * Doing a partial put, but the key does not exist - * and we are not beginning the write at 0. We - * must create a data item padded up to doff and - * then write the new bytes represented by val. - */ - ret = __ham_init_dbt(&tmp_val, data->size + data->doff, - &hcp->big_data, &hcp->big_datalen); - if (ret == 0) { - memset(tmp_val.data, 0, data->doff); - memcpy((u_int8_t *)tmp_val.data + data->doff, - data->data, data->size); - myval = &tmp_val; - } - } else - myval = (DBT *)data; - - if (ret == 0) - ret = __ham_add_el(hashp, hcp, key, myval, H_KEYDATA); - } else if (ret == 0 && F_ISSET(hcp, H_OK)) { - if (flags == DB_NOOVERWRITE) - ret = DB_KEYEXIST; - else if (F_ISSET(ldbp, DB_AM_DUP)) - ret = __ham_add_dup(hashp, hcp, data, DB_KEYLAST); + hcp->stats.hash_deleted++; + if ((ret = __ham_lookup(dbc, key, 0, DB_LOCK_WRITE)) == 0) { + if (F_ISSET(hcp, H_OK)) + ret = __ham_del_pair(dbc, 1); else - ret = __ham_overwrite(hashp, hcp, data); - } - - /* Free up all the cursor pages. */ - if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) - ret = t_ret; - /* Now check if we have to grow. 
*/ -out: if (ret == 0 && F_ISSET(hcp, H_EXPAND)) { - ret = __ham_expand_table(hashp); - F_CLR(hcp, H_EXPAND); + ret = DB_NOTFOUND; } - if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) - ret = t_ret; - RELEASE_META(ldbp, hashp); - if (F_ISSET(dbp, DB_AM_THREAD)) - __db_puthandle(ldbp); + RELEASE_META(dbp, hcp); +out: if ((tret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = tret; return (ret); } -static int -__ham_cursor(dbp, txnid, dbcp) - DB *dbp; - DB_TXN *txnid; - DBC **dbcp; -{ +/* ****************** CURSORS ********************************** */ +/* + * __ham_c_init -- + * Initialize the hash-specific portion of a cursor. + * + * PUBLIC: int __ham_c_init __P((DBC *)); + */ +int +__ham_c_init(dbc) + DBC *dbc; + { + HASH_CURSOR *new_curs; int ret; - DEBUG_LWRITE(dbp, txnid, "ham_cursor", NULL, NULL, 0); - if ((ret = __ham_c_init(dbp, txnid, dbcp)) != 0) + if ((ret = __os_calloc(1, sizeof(struct cursor_t), &new_curs)) != 0) + return (ret); + if ((ret = + __os_malloc(dbc->dbp->pgsize, NULL, &new_curs->split_buf)) != 0) { + __os_free(new_curs, sizeof(*new_curs)); return (ret); - - DB_THREAD_LOCK(dbp); - TAILQ_INSERT_TAIL(&dbp->curs_queue, *dbcp, links); - DB_THREAD_UNLOCK(dbp); - return (ret); -} - -static int -__ham_c_init(dbp, txnid, dbcp) - DB *dbp; - DB_TXN *txnid; - DBC **dbcp; -{ - DBC *db_curs; - HASH_CURSOR *new_curs; - - if ((db_curs = (DBC *)__db_calloc(sizeof(DBC), 1)) == NULL) - return (ENOMEM); - - if ((new_curs = - (HASH_CURSOR *)__db_calloc(sizeof(struct cursor_t), 1)) == NULL) { - FREE(db_curs, sizeof(DBC)); - return (ENOMEM); } - db_curs->internal = new_curs; - db_curs->c_close = __ham_c_close; - db_curs->c_del = __ham_c_del; - db_curs->c_get = __ham_c_get; - db_curs->c_put = __ham_c_put; - db_curs->txn = txnid; - db_curs->dbp = dbp; + new_curs->dbc = dbc; + + dbc->internal = new_curs; + dbc->c_am_close = __ham_c_close; + dbc->c_am_destroy = __ham_c_destroy; + dbc->c_del = __ham_c_del; + dbc->c_get = __ham_c_get; + dbc->c_put = 
__ham_c_put; - new_curs->db_cursor = db_curs; __ham_item_init(new_curs); - if (dbcp != NULL) - *dbcp = db_curs; return (0); } +/* + * __ham_c_close -- + * Close down the cursor from a single use. + */ static int -__ham_delete(dbp, txn, key, flags) - DB *dbp; - DB_TXN *txn; - DBT *key; - u_int32_t flags; -{ - DB *ldbp; - HTAB *hashp; - HASH_CURSOR *hcp; - int ret, t_ret; - - DEBUG_LWRITE(dbp, txn, "ham_delete", key, NULL, flags); - if ((ret = - __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) - return (ret); - - ldbp = dbp; - if (F_ISSET(dbp, DB_AM_THREAD) && - (ret = __db_gethandle(dbp, __ham_hdup, &ldbp)) != 0) - return (ret); - hashp = (HTAB *)ldbp->internal; - SET_LOCKER(ldbp, txn); - GET_META(ldbp, hashp); - hcp = TAILQ_FIRST(&ldbp->curs_queue)->internal; - - hashp->hash_accesses++; - if ((ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_WRITE)) == 0) { - if (F_ISSET(hcp, H_OK)) - ret = __ham_del_pair(hashp, hcp, 1); - else - ret = DB_NOTFOUND; - } - - if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) - ret = t_ret; - RELEASE_META(ldbp, hashp); - if (F_ISSET(dbp, DB_AM_THREAD)) - __db_puthandle(ldbp); - return (ret); -} - -/* ****************** CURSORS ********************************** */ -static int -__ham_c_close(cursor) - DBC *cursor; +__ham_c_close(dbc) + DBC *dbc; { - DB *ldbp; int ret; - DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_close", NULL, NULL, 0); - /* - * If the pagep, dpagep, and lock fields of the cursor are all NULL, - * then there really isn't a need to get a handle here. However, - * the normal case is that at least one of those fields is non-NULL, - * and putting those checks in here would couple the ham_item_done - * functionality with cursor close which would be pretty disgusting. - * Instead, we pay the overhead here of always getting the handle. 
- */ - ldbp = cursor->dbp; - if (F_ISSET(cursor->dbp, DB_AM_THREAD) && - (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) + if ((ret = __ham_item_done(dbc, 0)) != 0) return (ret); - ret = __ham_c_iclose(ldbp, cursor); - - if (F_ISSET(ldbp, DB_AM_THREAD)) - __db_puthandle(ldbp); - return (ret); + __ham_item_init((HASH_CURSOR *)dbc->internal); + return (0); } + /* - * __ham_c_iclose -- - * - * Internal cursor close routine; assumes it is being passed the correct - * handle, rather than getting and putting a handle. - * - * PUBLIC: int __ham_c_iclose __P((DB *, DBC *)); + * __ham_c_destroy -- + * Cleanup the access method private part of a cursor. */ -int -__ham_c_iclose(dbp, dbc) - DB *dbp; +static int +__ham_c_destroy(dbc) DBC *dbc; { HASH_CURSOR *hcp; - HTAB *hashp; - int ret; - hashp = (HTAB *)dbp->internal; hcp = (HASH_CURSOR *)dbc->internal; - ret = __ham_item_done(hashp, hcp, 0); - - if (hcp->big_key) - FREE(hcp->big_key, hcp->big_keylen); - if (hcp->big_data) - FREE(hcp->big_data, hcp->big_datalen); + if (hcp->split_buf != NULL) + __os_free(hcp->split_buf, dbc->dbp->pgsize); + __os_free(hcp, sizeof(HASH_CURSOR)); - /* - * All cursors (except the default ones) are linked off the master. - * Therefore, when we close the cursor, we have to remove it from - * the master, not the local one. - * XXX I am always removing from the master; what about local cursors? 
- */ - DB_THREAD_LOCK(dbc->dbp); - TAILQ_REMOVE(&dbc->dbp->curs_queue, dbc, links); - DB_THREAD_UNLOCK(dbc->dbp); - - FREE(hcp, sizeof(HASH_CURSOR)); - FREE(dbc, sizeof(DBC)); - - return (ret); + return (0); } static int -__ham_c_del(cursor, flags) - DBC *cursor; +__ham_c_del(dbc, flags) + DBC *dbc; u_int32_t flags; { - DB *ldbp; + DB *dbp; + DBT repldbt; HASH_CURSOR *hcp; HASH_CURSOR save_curs; - HTAB *hashp; db_pgno_t ppgno, chg_pgno; int ret, t_ret; - DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_del", NULL, NULL, flags); - ldbp = cursor->dbp; - if (F_ISSET(cursor->dbp, DB_AM_THREAD) && - (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) - return (ret); - hashp = (HTAB *)ldbp->internal; - hcp = (HASH_CURSOR *)cursor->internal; - save_curs = *hcp; - if ((ret = __db_cdelchk(ldbp, flags, - F_ISSET(ldbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0) + DEBUG_LWRITE(dbc, dbc->txn, "ham_c_del", NULL, NULL, flags); + dbp = dbc->dbp; + DB_PANIC_CHECK(dbp); + hcp = (HASH_CURSOR *)dbc->internal; + + if ((ret = __db_cdelchk(dbc->dbp, flags, + F_ISSET(dbc->dbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0) return (ret); + if (F_ISSET(hcp, H_DELETED)) return (DB_NOTFOUND); - SET_LOCKER(hashp->dbp, cursor->txn); - GET_META(hashp->dbp, hashp); - hashp->hash_accesses++; - if ((ret = __ham_get_cpage(hashp, hcp, DB_LOCK_WRITE)) != 0) + /* + * If we are in the concurrent DB product and this cursor + * is not a write cursor, then this request is invalid. + * If it is a simple write cursor, then we need to upgrade its + * lock. + */ + if (F_ISSET(dbp, DB_AM_CDB)) { + /* Make sure it's a valid update cursor. 
*/ + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } + + GET_META(dbp, hcp, ret); + if (ret != 0) + return (ret); + + SAVE_CURSOR(hcp, &save_curs); + hcp->stats.hash_deleted++; + + if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0) goto out; if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID) { /* @@ -695,20 +428,20 @@ __ham_c_del(cursor, flags) /* Remove item from duplicate page. */ chg_pgno = hcp->dpgno; - if ((ret = __db_drem(hashp->dbp, + if ((ret = __db_drem(dbc, &hcp->dpagep, hcp->dndx, __ham_del_page)) != 0) goto out; if (hcp->dpagep == NULL) { if (ppgno != PGNO_INVALID) { /* Case 3 */ hcp->dpgno = ppgno; - if ((ret = __ham_get_cpage(hashp, hcp, + if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0) goto out; hcp->dndx = NUM_ENT(hcp->dpagep); F_SET(hcp, H_DELETED); } else { /* Case 4 */ - ret = __ham_del_pair(hashp, hcp, 1); + ret = __ham_del_pair(dbc, 1); hcp->dpgno = PGNO_INVALID; /* * Delpair updated the cursor queue, so we @@ -723,6 +456,15 @@ __ham_c_del(cursor, flags) memcpy(HOFFDUP_PGNO(P_ENTRY(hcp->pagep, H_DATAINDEX(hcp->bndx))), &hcp->dpgno, sizeof(db_pgno_t)); + /* + * We need to put the master page here, because + * although we have a duplicate page, the master + * page is dirty, and ham_item_done assumes that + * if you have a duplicate page, it's the only one + * that can be dirty. 
+ */ + ret = __ham_put_page(dbp, hcp->pagep, 1); + hcp->pagep = NULL; F_SET(hcp, H_DELETED); } else /* Case 1 */ F_SET(hcp, H_DELETED); @@ -730,17 +472,17 @@ __ham_c_del(cursor, flags) __ham_c_update(hcp, chg_pgno, 0, 0, 1); } else if (F_ISSET(hcp, H_ISDUP)) { /* on page */ if (hcp->dup_off == 0 && DUP_SIZE(hcp->dup_len) == - LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx)) - ret = __ham_del_pair(hashp, hcp, 1); + LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx)) + ret = __ham_del_pair(dbc, 1); else { - DBT repldbt; - repldbt.flags = 0; F_SET(&repldbt, DB_DBT_PARTIAL); repldbt.doff = hcp->dup_off; repldbt.dlen = DUP_SIZE(hcp->dup_len); repldbt.size = 0; - ret = __ham_replpair(hashp, hcp, &repldbt, 0); + repldbt.data = + HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)); + ret = __ham_replpair(dbc, &repldbt, 0); hcp->dup_tlen -= DUP_SIZE(hcp->dup_len); F_SET(hcp, H_DELETED); __ham_c_update(hcp, hcp->pgno, @@ -749,48 +491,53 @@ __ham_c_del(cursor, flags) } else /* Not a duplicate */ -normal: ret = __ham_del_pair(hashp, hcp, 1); +normal: ret = __ham_del_pair(dbc, 1); -out: if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) +out: if ((t_ret = __ham_item_done(dbc, ret == 0)) != 0 && ret == 0) ret = t_ret; - if (ret != 0) - *hcp = save_curs; - RELEASE_META(hashp->dbp, hashp); - if (F_ISSET(cursor->dbp, DB_AM_THREAD)) - __db_puthandle(ldbp); + RELEASE_META(dbp, hcp); + RESTORE_CURSOR(dbp, hcp, &save_curs, ret); + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); return (ret); } static int -__ham_c_get(cursor, key, data, flags) - DBC *cursor; +__ham_c_get(dbc, key, data, flags) + DBC *dbc; DBT *key; DBT *data; u_int32_t flags; { - DB *ldbp; - HTAB *hashp; + DB *dbp; HASH_CURSOR *hcp, save_curs; + db_lockmode_t lock_type; int get_key, ret, t_ret; - DEBUG_LREAD(cursor->dbp, cursor->txn, "ham_c_get", + DEBUG_LREAD(dbc, dbc->txn, "ham_c_get", flags == DB_SET || 
flags == DB_SET_RANGE ? key : NULL, NULL, flags); - ldbp = cursor->dbp; - if (F_ISSET(cursor->dbp, DB_AM_THREAD) && - (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) - return (ret); - hashp = (HTAB *)(ldbp->internal); - hcp = (HASH_CURSOR *)cursor->internal; - save_curs = *hcp; + + hcp = (HASH_CURSOR *)dbc->internal; + dbp = dbc->dbp; + DB_PANIC_CHECK(dbp); + SAVE_CURSOR(hcp, &save_curs); if ((ret = - __db_cgetchk(hashp->dbp, key, data, flags, IS_VALID(hcp))) != 0) + __db_cgetchk(dbp, key, data, flags, IS_VALID(hcp))) != 0) return (ret); - SET_LOCKER(hashp->dbp, cursor->txn); - GET_META(hashp->dbp, hashp); - hashp->hash_accesses++; + /* Clear OR'd in additional bits so we can check for flag equality. */ + if (LF_ISSET(DB_RMW)) { + lock_type = DB_LOCK_WRITE; + LF_CLR(DB_RMW); + } else + lock_type = DB_LOCK_READ; + GET_META(dbp, hcp, ret); + if (ret != 0) + return (ret); + hcp->stats.hash_get++; hcp->seek_size = 0; ret = 0; @@ -798,24 +545,39 @@ __ham_c_get(cursor, key, data, flags) switch (flags) { case DB_PREV: if (hcp->bucket != BUCKET_INVALID) { - ret = __ham_item_prev(hashp, hcp, DB_LOCK_READ); + ret = __ham_item_prev(dbc, lock_type); break; } /* FALLTHROUGH */ case DB_LAST: - ret = __ham_item_last(hashp, hcp, DB_LOCK_READ); + ret = __ham_item_last(dbc, lock_type); break; case DB_FIRST: - ret = __ham_item_first(hashp, hcp, DB_LOCK_READ); + ret = __ham_item_first(dbc, lock_type); + break; + case DB_NEXT_DUP: + if (hcp->bucket == BUCKET_INVALID) + ret = EINVAL; + else { + F_SET(hcp, H_DUPONLY); + ret = __ham_item_next(dbc, lock_type); + } break; case DB_NEXT: if (hcp->bucket == BUCKET_INVALID) hcp->bucket = 0; - ret = __ham_item_next(hashp, hcp, DB_LOCK_READ); + ret = __ham_item_next(dbc, lock_type); break; case DB_SET: case DB_SET_RANGE: - ret = __ham_lookup(hashp, hcp, key, 0, DB_LOCK_READ); + case DB_GET_BOTH: + if (F_ISSET(dbc, DBC_CONTINUE)) { + F_SET(hcp, H_DUPONLY); + ret = __ham_item_next(dbc, lock_type); + } else if (F_ISSET(dbc, DBC_KEYSET)) 
+ ret = __ham_item(dbc, lock_type); + else + ret = __ham_lookup(dbc, key, 0, lock_type); get_key = 0; break; case DB_CURRENT: @@ -824,7 +586,7 @@ __ham_c_get(cursor, key, data, flags) goto out; } - ret = __ham_item(hashp, hcp, DB_LOCK_READ); + ret = __ham_item(dbc, lock_type); break; } @@ -837,12 +599,12 @@ __ham_c_get(cursor, key, data, flags) goto out1; else if (F_ISSET(hcp, H_OK)) { /* Get the key. */ - if (get_key && (ret = __db_ret(hashp->dbp, hcp->pagep, - H_KEYINDEX(hcp->bndx), key, &hcp->big_key, - &hcp->big_keylen)) != 0) + if (get_key && (ret = __db_ret(dbp, hcp->pagep, + H_KEYINDEX(hcp->bndx), key, &dbc->rkey.data, + &dbc->rkey.size)) != 0) goto out1; - ret = __ham_dup_return(hashp, hcp, data, flags); + ret = __ham_dup_return(dbc, data, flags); break; } else if (!F_ISSET(hcp, H_NOMORE)) { abort(); @@ -855,7 +617,7 @@ __ham_c_get(cursor, key, data, flags) switch (flags) { case DB_LAST: case DB_PREV: - ret = __ham_item_done(hashp, hcp, 0); + ret = __ham_item_done(dbc, 0); if (hcp->bucket == 0) { ret = DB_NOTFOUND; goto out1; @@ -863,24 +625,24 @@ __ham_c_get(cursor, key, data, flags) hcp->bucket--; hcp->bndx = NDX_INVALID; if (ret == 0) - ret = __ham_item_prev(hashp, - hcp, DB_LOCK_READ); + ret = __ham_item_prev(dbc, lock_type); break; case DB_FIRST: case DB_NEXT: - ret = __ham_item_done(hashp, hcp, 0); + ret = __ham_item_done(dbc, 0); hcp->bndx = NDX_INVALID; hcp->bucket++; hcp->pgno = PGNO_INVALID; hcp->pagep = NULL; - if (hcp->bucket > hashp->hdr->max_bucket) { + if (hcp->bucket > hcp->hdr->max_bucket) { ret = DB_NOTFOUND; goto out1; } if (ret == 0) - ret = __ham_item_next(hashp, - hcp, DB_LOCK_READ); + ret = __ham_item_next(dbc, lock_type); break; + case DB_GET_BOTH: + case DB_NEXT_DUP: case DB_SET: case DB_SET_RANGE: /* Key not found. 
*/ @@ -888,85 +650,137 @@ __ham_c_get(cursor, key, data, flags) goto out1; } } -out1: if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0) +out1: if ((t_ret = __ham_item_done(dbc, 0)) != 0 && ret == 0) ret = t_ret; -out: if (ret) - *hcp = save_curs; - RELEASE_META(hashp->dbp, hashp); - if (F_ISSET(cursor->dbp, DB_AM_THREAD)) - __db_puthandle(ldbp); +out: RELEASE_META(dbp, hcp); + RESTORE_CURSOR(dbp, hcp, &save_curs, ret); return (ret); } static int -__ham_c_put(cursor, key, data, flags) - DBC *cursor; +__ham_c_put(dbc, key, data, flags) + DBC *dbc; DBT *key; DBT *data; u_int32_t flags; { - DB *ldbp; + DB *dbp; + DBT tmp_val, *myval; HASH_CURSOR *hcp, save_curs; - HTAB *hashp; u_int32_t nbytes; int ret, t_ret; - DEBUG_LWRITE(cursor->dbp, cursor->txn, "ham_c_put", + dbp = dbc->dbp; + DB_PANIC_CHECK(dbp); + DEBUG_LWRITE(dbc, dbc->txn, "ham_c_put", flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, data, flags); - ldbp = cursor->dbp; - if (F_ISSET(cursor->dbp, DB_AM_THREAD) && - (ret = __db_gethandle(cursor->dbp, __ham_hdup, &ldbp)) != 0) - return (ret); - hashp = (HTAB *)(ldbp->internal); - hcp = (HASH_CURSOR *)cursor->internal; - save_curs = *hcp; + hcp = (HASH_CURSOR *)dbc->internal; - if ((ret = __db_cputchk(hashp->dbp, key, data, flags, - F_ISSET(ldbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0) + if ((ret = __db_cputchk(dbp, key, data, flags, + F_ISSET(dbp, DB_AM_RDONLY), IS_VALID(hcp))) != 0) return (ret); - if (F_ISSET(hcp, H_DELETED)) + + if (F_ISSET(hcp, H_DELETED) && + flags != DB_KEYFIRST && flags != DB_KEYLAST) return (DB_NOTFOUND); - SET_LOCKER(hashp->dbp, cursor->txn); - GET_META(hashp->dbp, hashp); - ret = 0; + /* + * If we are in the concurrent DB product and this cursor + * is not a write cursor, then this request is invalid. + * If it is a simple write cursor, then we need to upgrade its + * lock. + */ + if (F_ISSET(dbp, DB_AM_CDB)) { + /* Make sure it's a valid update cursor. 
*/ + if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) + return (EINVAL); + + if (F_ISSET(dbc, DBC_RMW) && + (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + return (EAGAIN); + } + + GET_META(dbp, hcp, ret); + if (ret != 0) + return (ret); + + SAVE_CURSOR(hcp, &save_curs); + hcp->stats.hash_put++; switch (flags) { case DB_KEYLAST: case DB_KEYFIRST: - nbytes = (ISBIG(hashp, key->size) ? HOFFPAGE_PSIZE : + nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE : HKEYDATA_PSIZE(key->size)) + - (ISBIG(hashp, data->size) ? HOFFPAGE_PSIZE : + (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE : HKEYDATA_PSIZE(data->size)); - ret = __ham_lookup(hashp, hcp, key, nbytes, DB_LOCK_WRITE); + if ((ret = __ham_lookup(dbc, + key, nbytes, DB_LOCK_WRITE)) == DB_NOTFOUND) { + ret = 0; + if (hcp->seek_found_page != PGNO_INVALID && + hcp->seek_found_page != hcp->pgno) { + if ((ret = __ham_item_done(dbc, 0)) != 0) + goto out; + hcp->pgno = hcp->seek_found_page; + hcp->bndx = NDX_INVALID; + } + + if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) { + /* + * A partial put, but the key does not exist + * and we are not beginning the write at 0. + * We must create a data item padded up to doff + * and then write the new bytes represented by + * val. 
+ */ + if ((ret = __ham_init_dbt(&tmp_val, + data->size + data->doff, + &dbc->rdata.data, &dbc->rdata.size)) == 0) { + memset(tmp_val.data, 0, data->doff); + memcpy((u_int8_t *)tmp_val.data + + data->doff, data->data, data->size); + myval = &tmp_val; + } + } else + myval = (DBT *)data; + + if (ret == 0) + ret = __ham_add_el(dbc, key, myval, H_KEYDATA); + goto done; + } break; case DB_BEFORE: case DB_AFTER: case DB_CURRENT: - ret = __ham_item(hashp, hcp, DB_LOCK_WRITE); + ret = __ham_item(dbc, DB_LOCK_WRITE); break; } if (ret == 0) { - if (flags == DB_CURRENT && !F_ISSET(ldbp, DB_AM_DUP)) - ret = __ham_overwrite(hashp, hcp, data); + if ((flags == DB_CURRENT && !F_ISSET(hcp, H_ISDUP)) || + ((flags == DB_KEYFIRST || flags == DB_KEYLAST) && + !F_ISSET(dbp, DB_AM_DUP))) + ret = __ham_overwrite(dbc, data); else - ret = __ham_add_dup(hashp, hcp, data, flags); + ret = __ham_add_dup(dbc, data, flags); } - if (ret == 0 && F_ISSET(hcp, H_EXPAND)) { - ret = __ham_expand_table(hashp); +done: if (ret == 0 && F_ISSET(hcp, H_EXPAND)) { + ret = __ham_expand_table(dbc); F_CLR(hcp, H_EXPAND); } - if ((t_ret = __ham_item_done(hashp, hcp, ret == 0)) != 0 && ret == 0) + if ((t_ret = __ham_item_done(dbc, ret == 0)) != 0 && ret == 0) ret = t_ret; - if (ret != 0) - *hcp = save_curs; - RELEASE_META(hashp->dbp, hashp); - if (F_ISSET(cursor->dbp, DB_AM_THREAD)) - __db_puthandle(ldbp); + +out: RELEASE_META(dbp, hcp); + RESTORE_CURSOR(dbp, hcp, &save_curs, ret); + if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) + (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, + DB_LOCK_IWRITE, 0); return (ret); } @@ -974,19 +788,21 @@ __ham_c_put(cursor, key, data, flags) /* * __ham_expand_table -- - * - * PUBLIC: int __ham_expand_table __P((HTAB *)); */ -int -__ham_expand_table(hashp) - HTAB *hashp; +static int +__ham_expand_table(dbc) + DBC *dbc; { + DB *dbp; + HASH_CURSOR *hcp; DB_LSN new_lsn; u_int32_t old_bucket, new_bucket, spare_ndx; int ret; + dbp = dbc->dbp; + hcp = (HASH_CURSOR 
*)dbc->internal; ret = 0; - DIRTY_META(hashp, ret); + DIRTY_META(dbp, hcp, ret); if (ret) return (ret); @@ -999,78 +815,78 @@ __ham_expand_table(hashp) * see what the log of one greater than that is; here we have to * look at the log of max + 2. VERY NASTY STUFF. */ - if (__db_log2(hashp->hdr->max_bucket + 2) > hashp->hdr->ovfl_point) { + if (__db_log2(hcp->hdr->max_bucket + 2) > hcp->hdr->ovfl_point) { /* * We are about to shift the split point. Make sure that * if the next doubling is going to be big (more than 8 * pages), we have some extra pages around. */ - if (hashp->hdr->max_bucket + 1 >= 8 && - hashp->hdr->spares[hashp->hdr->ovfl_point] < - hashp->hdr->spares[hashp->hdr->ovfl_point - 1] + - hashp->hdr->ovfl_point + 1) - __ham_init_ovflpages(hashp); + if (hcp->hdr->max_bucket + 1 >= 8 && + hcp->hdr->spares[hcp->hdr->ovfl_point] < + hcp->hdr->spares[hcp->hdr->ovfl_point - 1] + + hcp->hdr->ovfl_point + 1) + __ham_init_ovflpages(dbc); } /* Now we can log the meta-data split. */ - if (DB_LOGGING(hashp->dbp)) { - if ((ret = __ham_splitmeta_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, - hashp->hdr->max_bucket, hashp->hdr->ovfl_point, - hashp->hdr->spares[hashp->hdr->ovfl_point], - &hashp->hdr->lsn)) != 0) + if (DB_LOGGING(dbc)) { + if ((ret = __ham_splitmeta_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, dbp->log_fileid, + hcp->hdr->max_bucket, hcp->hdr->ovfl_point, + hcp->hdr->spares[hcp->hdr->ovfl_point], + &hcp->hdr->lsn)) != 0) return (ret); - hashp->hdr->lsn = new_lsn; + hcp->hdr->lsn = new_lsn; } - hashp->hash_expansions++; - new_bucket = ++hashp->hdr->max_bucket; - old_bucket = (hashp->hdr->max_bucket & hashp->hdr->low_mask); + hcp->stats.hash_expansions++; + new_bucket = ++hcp->hdr->max_bucket; + old_bucket = (hcp->hdr->max_bucket & hcp->hdr->low_mask); /* * If the split point is increasing, copy the current contents * of the spare split bucket to the next bucket. 
*/ - spare_ndx = __db_log2(hashp->hdr->max_bucket + 1); - if (spare_ndx > hashp->hdr->ovfl_point) { - hashp->hdr->spares[spare_ndx] = - hashp->hdr->spares[hashp->hdr->ovfl_point]; - hashp->hdr->ovfl_point = spare_ndx; + spare_ndx = __db_log2(hcp->hdr->max_bucket + 1); + if (spare_ndx > hcp->hdr->ovfl_point) { + hcp->hdr->spares[spare_ndx] = + hcp->hdr->spares[hcp->hdr->ovfl_point]; + hcp->hdr->ovfl_point = spare_ndx; } - if (new_bucket > hashp->hdr->high_mask) { + if (new_bucket > hcp->hdr->high_mask) { /* Starting a new doubling */ - hashp->hdr->low_mask = hashp->hdr->high_mask; - hashp->hdr->high_mask = new_bucket | hashp->hdr->low_mask; + hcp->hdr->low_mask = hcp->hdr->high_mask; + hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask; } - if (BUCKET_TO_PAGE(hashp, new_bucket) > MAX_PAGES(hashp)) { - __db_err(hashp->dbp->dbenv, + if (BUCKET_TO_PAGE(hcp, new_bucket) > MAX_PAGES(hcp)) { + __db_err(dbp->dbenv, "hash: Cannot allocate new bucket. Pages exhausted."); return (ENOSPC); } /* Relocate records to the new bucket */ - return (__ham_split_page(hashp, old_bucket, new_bucket)); + return (__ham_split_page(dbc, old_bucket, new_bucket)); } /* - * PUBLIC: u_int32_t __ham_call_hash __P((HTAB *, u_int8_t *, int32_t)); + * PUBLIC: u_int32_t __ham_call_hash __P((HASH_CURSOR *, u_int8_t *, int32_t)); */ u_int32_t -__ham_call_hash(hashp, k, len) - HTAB *hashp; +__ham_call_hash(hcp, k, len) + HASH_CURSOR *hcp; u_int8_t *k; int32_t len; { u_int32_t n, bucket; - n = (u_int32_t)hashp->hash(k, len); - bucket = n & hashp->hdr->high_mask; - if (bucket > hashp->hdr->max_bucket) - bucket = bucket & hashp->hdr->low_mask; + n = (u_int32_t)(hcp->dbc->dbp->h_hash(k, len)); + + bucket = n & hcp->hdr->high_mask; + if (bucket > hcp->hdr->max_bucket) + bucket = bucket & hcp->hdr->low_mask; return (bucket); } @@ -1079,31 +895,36 @@ __ham_call_hash(hashp, k, len) * everything held by the cursor. 
*/ static int -__ham_dup_return(hashp, hcp, val, flags) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_dup_return(dbc, val, flags) + DBC *dbc; DBT *val; u_int32_t flags; { + DB *dbp; + HASH_CURSOR *hcp; PAGE *pp; DBT *myval, tmp_val; db_indx_t ndx; db_pgno_t pgno; + u_int32_t off, tlen; u_int8_t *hk, type; - int ret; + int cmp, ret; db_indx_t len; /* Check for duplicate and return the first one. */ + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; ndx = H_DATAINDEX(hcp->bndx); type = HPAGE_TYPE(hcp->pagep, ndx); pp = hcp->pagep; myval = val; /* - * There are 3 cases: + * There are 4 cases: * 1. We are not in duplicate, simply call db_ret. * 2. We are looking at keys and stumbled onto a duplicate. * 3. We are in the middle of a duplicate set. (ISDUP set) + * 4. This is a duplicate and we need to return a specific item. */ /* @@ -1115,7 +936,7 @@ __ham_dup_return(hashp, hcp, val, flags) if (type == H_DUPLICATE) { F_SET(hcp, H_ISDUP); hcp->dup_tlen = LEN_HDATA(hcp->pagep, - hashp->hdr->pagesize, hcp->bndx); + hcp->hdr->pagesize, hcp->bndx); hk = H_PAIRDATA(hcp->pagep, hcp->bndx); if (flags == DB_LAST || flags == DB_PREV) { hcp->dndx = 0; @@ -1141,18 +962,63 @@ __ham_dup_return(hashp, hcp, val, flags) memcpy(&pgno, HOFFDUP_PGNO(P_ENTRY(hcp->pagep, ndx)), sizeof(db_pgno_t)); if (flags == DB_LAST || flags == DB_PREV) { - if ((ret = __db_dend(hashp->dbp, + if ((ret = __db_dend(dbc, pgno, &hcp->dpagep)) != 0) return (ret); hcp->dpgno = PGNO(hcp->dpagep); hcp->dndx = NUM_ENT(hcp->dpagep) - 1; - } else if ((ret = __ham_next_cpage(hashp, - hcp, pgno, 0, H_ISDUP)) != 0) + } else if ((ret = __ham_next_cpage(dbc, + pgno, 0, H_ISDUP)) != 0) return (ret); } } /* + * If we are retrieving a specific key/data pair, then we + * may need to adjust the cursor before returning data. 
+ */ + if (flags == DB_GET_BOTH) { + if (F_ISSET(hcp, H_ISDUP)) { + if (hcp->dpgno != PGNO_INVALID) { + if ((ret = __db_dsearch(dbc, 0, val, + hcp->dpgno, &hcp->dndx, &hcp->dpagep, &cmp)) + != 0) + return (ret); + if (cmp == 0) + hcp->dpgno = PGNO(hcp->dpagep); + } else { + __ham_dsearch(dbc, val, &off, &cmp); + hcp->dup_off = off; + } + } else { + hk = H_PAIRDATA(hcp->pagep, hcp->bndx); + if (((HKEYDATA *)hk)->type == H_OFFPAGE) { + memcpy(&tlen, + HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); + memcpy(&pgno, + HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); + if ((ret = __db_moff(dbp, val, + pgno, tlen, dbp->dup_compare, &cmp)) != 0) + return (ret); + } else { + /* + * We do not zero tmp_val since the comparison + * routines may only look at data and size. + */ + tmp_val.data = HKEYDATA_DATA(hk); + tmp_val.size = LEN_HDATA(hcp->pagep, + dbp->pgsize, hcp->bndx); + cmp = dbp->dup_compare == NULL ? + __bam_defcmp(&tmp_val, val) : + dbp->dup_compare(&tmp_val, val); + } + } + + if (cmp != 0) + return (DB_NOTFOUND); + } + + /* * Now, everything is initialized, grab a duplicate if * necessary. */ @@ -1162,14 +1028,34 @@ __ham_dup_return(hashp, hcp, val, flags) ndx = hcp->dndx; } else { /* - * Copy the DBT in case we are retrieving into - * user memory and we need the parameters for - * it. + * Copy the DBT in case we are retrieving into user + * memory and we need the parameters for it. If the + * user requested a partial, then we need to adjust + * the user's parameters to get the partial of the + * duplicate which is itself a partial. */ memcpy(&tmp_val, val, sizeof(*val)); - F_SET(&tmp_val, DB_DBT_PARTIAL); - tmp_val.dlen = hcp->dup_len; - tmp_val.doff = hcp->dup_off + sizeof(db_indx_t); + if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) { + /* + * Take the user's length unless it would go + * beyond the end of the duplicate. 
+ */ + if (tmp_val.doff + hcp->dup_off > hcp->dup_len) + tmp_val.dlen = 0; + else if (tmp_val.dlen + tmp_val.doff > + hcp->dup_len) + tmp_val.dlen = + hcp->dup_len - tmp_val.doff; + + /* + * Calculate the new offset. + */ + tmp_val.doff += hcp->dup_off; + } else { + F_SET(&tmp_val, DB_DBT_PARTIAL); + tmp_val.dlen = hcp->dup_len; + tmp_val.doff = hcp->dup_off + sizeof(db_indx_t); + } myval = &tmp_val; } } @@ -1178,8 +1064,8 @@ __ham_dup_return(hashp, hcp, val, flags) * Finally, if we had a duplicate, pp, ndx, and myval should be * set appropriately. */ - if ((ret = __db_ret(hashp->dbp, pp, ndx, myval, &hcp->big_data, - &hcp->big_datalen)) != 0) + if ((ret = __db_ret(dbp, pp, ndx, myval, &dbc->rdata.data, + &dbc->rdata.size)) != 0) return (ret); /* @@ -1193,16 +1079,17 @@ __ham_dup_return(hashp, hcp, val, flags) } static int -__ham_overwrite(hashp, hcp, nval) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_overwrite(dbc, nval) + DBC *dbc; DBT *nval; { + HASH_CURSOR *hcp; DBT *myval, tmp_val; u_int8_t *hk; - if (F_ISSET(hashp->dbp, DB_AM_DUP)) - return (__ham_add_dup(hashp, hcp, nval, DB_KEYLAST)); + hcp = (HASH_CURSOR *)dbc->internal; + if (F_ISSET(dbc->dbp, DB_AM_DUP)) + return (__ham_add_dup(dbc, nval, DB_KEYLAST)); else if (!F_ISSET(nval, DB_DBT_PARTIAL)) { /* Put/overwrite */ memcpy(&tmp_val, nval, sizeof(*nval)); @@ -1214,12 +1101,12 @@ __ham_overwrite(hashp, hcp, nval) HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); else tmp_val.dlen = LEN_HDATA(hcp->pagep, - hashp->hdr->pagesize,hcp->bndx); + hcp->hdr->pagesize,hcp->bndx); myval = &tmp_val; } else /* Regular partial put */ myval = nval; - return (__ham_replpair(hashp, hcp, myval, 0)); + return (__ham_replpair(dbc, myval, 0)); } /* @@ -1232,29 +1119,32 @@ __ham_overwrite(hashp, hcp, nval) * non of the cursor pointer field are valid. 
*/ static int -__ham_lookup(hashp, hcp, key, sought, mode) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_lookup(dbc, key, sought, mode) + DBC *dbc; const DBT *key; u_int32_t sought; db_lockmode_t mode; { + DB *dbp; + HASH_CURSOR *hcp; db_pgno_t pgno; u_int32_t tlen; int match, ret, t_ret; u_int8_t *hk; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; /* * Set up cursor so that we're looking for space to add an item * as we cycle through the pages looking for the key. */ - if ((ret = __ham_item_reset(hashp, hcp)) != 0) + if ((ret = __ham_item_reset(dbc)) != 0) return (ret); hcp->seek_size = sought; - hcp->bucket = __ham_call_hash(hashp, (u_int8_t *)key->data, key->size); + hcp->bucket = __ham_call_hash(hcp, (u_int8_t *)key->data, key->size); while (1) { - if ((ret = __ham_item_next(hashp, hcp, mode)) != 0) + if ((ret = __ham_item_next(dbc, mode)) != 0) return (ret); if (F_ISSET(hcp, H_NOMORE)) @@ -1267,7 +1157,9 @@ __ham_lookup(hashp, hcp, key, sought, mode) if (tlen == key->size) { memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); - match = __db_moff(hashp->dbp, key, pgno); + if ((ret = __db_moff(dbp, + key, pgno, tlen, NULL, &match)) != 0) + return (ret); if (match == 0) { F_SET(hcp, H_OK); return (0); @@ -1276,7 +1168,7 @@ __ham_lookup(hashp, hcp, key, sought, mode) break; case H_KEYDATA: if (key->size == LEN_HKEY(hcp->pagep, - hashp->hdr->pagesize, hcp->bndx) && + hcp->hdr->pagesize, hcp->bndx) && memcmp(key->data, HKEYDATA_DATA(hk), key->size) == 0) { F_SET(hcp, H_OK); @@ -1289,9 +1181,9 @@ __ham_lookup(hashp, hcp, key, sought, mode) * These are errors because keys are never * duplicated, only data items are. 
*/ - return (__db_pgfmt(hashp->dbp, PGNO(hcp->pagep))); + return (__db_pgfmt(dbp, PGNO(hcp->pagep))); } - hashp->hash_collisions++; + hcp->stats.hash_collisions++; } /* @@ -1301,7 +1193,7 @@ __ham_lookup(hashp, hcp, key, sought, mode) if (sought != 0) return (ret); - if ((t_ret = __ham_item_done(hashp, hcp, 0)) != 0 && ret == 0) + if ((t_ret = __ham_item_done(dbc, 0)) != 0 && ret == 0) ret = t_ret; return (ret); } @@ -1318,12 +1210,13 @@ __ham_init_dbt(dbt, size, bufp, sizep) void **bufp; u_int32_t *sizep; { + int ret; + memset(dbt, 0, sizeof(*dbt)); if (*sizep < size) { - if ((*bufp = (void *)(*bufp == NULL ? - __db_malloc(size) : __db_realloc(*bufp, size))) == NULL) { + if ((ret = __os_realloc(bufp, size)) != 0) { *sizep = 0; - return (ENOMEM); + return (ret); } *sizep = size; } @@ -1352,8 +1245,8 @@ __ham_c_update(hcp, chg_pgno, len, add, is_dup) u_int32_t len; int add, is_dup; { + DB *dbp; DBC *cp; - HTAB *hp; HASH_CURSOR *lcp; int page_deleted; @@ -1379,10 +1272,10 @@ __ham_c_update(hcp, chg_pgno, len, add, is_dup) page_deleted = chg_pgno != PGNO_INVALID && chg_pgno != hcp->dpgno; - hp = hcp->db_cursor->dbp->master->internal; - DB_THREAD_LOCK(hp->dbp); + dbp = hcp->dbc->dbp; + DB_THREAD_LOCK(dbp); - for (cp = TAILQ_FIRST(&hp->dbp->curs_queue); cp != NULL; + for (cp = TAILQ_FIRST(&dbp->active_queue); cp != NULL; cp = TAILQ_NEXT(cp, links)) { if (cp->internal == hcp) continue; @@ -1440,43 +1333,5 @@ __ham_c_update(hcp, chg_pgno, len, add, is_dup) } } } - DB_THREAD_UNLOCK(hp->dbp); -} - -/* - * __ham_hdup -- - * This function gets called when we create a duplicate handle for a - * threaded DB. It should create the private part of the DB structure. 
- * - * PUBLIC: int __ham_hdup __P((DB *, DB *)); - */ -int -__ham_hdup(orig, new) - DB *orig, *new; -{ - DBC *curs; - HTAB *hashp; - int ret; - - if ((hashp = (HTAB *)__db_malloc(sizeof(HTAB))) == NULL) - return (ENOMEM); - - new->internal = hashp; - - hashp->dbp = new; - hashp->hlock = 0; - hashp->hdr = NULL; - hashp->hash = ((HTAB *)orig->internal)->hash; - if ((hashp->split_buf = (PAGE *)__db_malloc(orig->pgsize)) == NULL) - return (ENOMEM); - hashp->local_errno = 0; - hashp->hash_accesses = 0; - hashp->hash_collisions = 0; - hashp->hash_expansions = 0; - hashp->hash_overflows = 0; - hashp->hash_bigpages = 0; - /* Initialize the cursor queue. */ - ret = __ham_c_init(new, NULL, &curs); - TAILQ_INSERT_TAIL(&new->curs_queue, curs, links); - return (ret); + DB_THREAD_UNLOCK(dbp); } diff --git a/db2/hash/hash_auto.c b/db2/hash/hash_auto.c index 41b1ebed01..94a1dff6ed 100644 --- a/db2/hash/hash_auto.c +++ b/db2/hash/hash_auto.c @@ -10,7 +10,6 @@ #endif #include "db_int.h" -#include "shqueue.h" #include "db_page.h" #include "db_dispatch.h" #include "hash.h" @@ -46,8 +45,7 @@ int __ham_insdel_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_insdel; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -59,8 +57,8 @@ int __ham_insdel_log(logp, txnid, ret_lsnp, flags, + sizeof(*pagelsn) + sizeof(u_int32_t) + (key == NULL ? 0 : key->size) + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -109,7 +107,7 @@ int __ham_insdel_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -170,7 +168,7 @@ __ham_insdel_print(notused1, dbtp, lsnp, notused2, notused3) } printf("\n"); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -184,11 +182,12 @@ __ham_insdel_read(recbuf, argpp) { __ham_insdel_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_insdel_args *)__db_malloc(sizeof(__ham_insdel_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_insdel_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -250,8 +249,7 @@ int __ham_newpage_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_newpage; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -264,8 +262,8 @@ int __ham_newpage_log(logp, txnid, ret_lsnp, flags, + sizeof(*pagelsn) + sizeof(next_pgno) + sizeof(*nextlsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -306,7 +304,7 @@ int __ham_newpage_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -354,7 +352,7 @@ __ham_newpage_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tnextlsn: [%lu][%lu]\n", (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -368,11 +366,12 @@ __ham_newpage_read(recbuf, argpp) { __ham_newpage_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_newpage_args *)__db_malloc(sizeof(__ham_newpage_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_newpage_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -428,8 +427,7 @@ int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_splitmeta; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -439,8 +437,8 @@ int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags, + sizeof(ovflpoint) + sizeof(spares) + sizeof(*metalsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -469,7 +467,7 @@ int __ham_splitmeta_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -512,7 +510,7 @@ __ham_splitmeta_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tmetalsn: [%lu][%lu]\n", (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -526,11 +524,12 @@ __ham_splitmeta_read(recbuf, argpp) { __ham_splitmeta_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_splitmeta_args *)__db_malloc(sizeof(__ham_splitmeta_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_splitmeta_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -581,8 +580,7 @@ int __ham_splitdata_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_splitdata; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -592,8 +590,8 @@ int __ham_splitdata_log(logp, txnid, ret_lsnp, flags, + sizeof(pgno) + sizeof(u_int32_t) + (pageimage == NULL ? 
0 : pageimage->size) + sizeof(*pagelsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -630,7 +628,7 @@ int __ham_splitdata_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -681,7 +679,7 @@ __ham_splitdata_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tpagelsn: [%lu][%lu]\n", (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -695,11 +693,12 @@ __ham_splitdata_read(recbuf, argpp) { __ham_splitdata_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_splitdata_args *)__db_malloc(sizeof(__ham_splitdata_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_splitdata_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -756,8 +755,7 @@ int __ham_replace_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_replace; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -770,8 +768,8 @@ int __ham_replace_log(logp, txnid, ret_lsnp, flags, + sizeof(u_int32_t) + (olditem == NULL ? 0 : olditem->size) + sizeof(u_int32_t) + (newitem == NULL ? 
0 : newitem->size) + sizeof(makedup); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -822,7 +820,7 @@ int __ham_replace_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -884,7 +882,7 @@ __ham_replace_print(notused1, dbtp, lsnp, notused2, notused3) printf("\n"); printf("\tmakedup: %lu\n", (u_long)argp->makedup); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -898,11 +896,12 @@ __ham_replace_read(recbuf, argpp) { __ham_replace_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_replace_args *)__db_malloc(sizeof(__ham_replace_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_replace_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -968,8 +967,7 @@ int __ham_newpgno_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_newpgno; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -983,8 +981,8 @@ int __ham_newpgno_log(logp, txnid, ret_lsnp, flags, + sizeof(new_type) + sizeof(*pagelsn) + sizeof(*metalsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1024,7 +1022,7 @@ int __ham_newpgno_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1072,7 +1070,7 @@ __ham_newpgno_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tmetalsn: [%lu][%lu]\n", (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1086,11 +1084,12 @@ __ham_newpgno_read(recbuf, argpp) { __ham_newpgno_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_newpgno_args *)__db_malloc(sizeof(__ham_newpgno_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_newpgno_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1149,8 +1148,7 @@ int __ham_ovfl_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_ovfl; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1161,8 +1159,8 @@ int __ham_ovfl_log(logp, txnid, ret_lsnp, flags, + sizeof(free_pgno) + sizeof(ovflpoint) + sizeof(*metalsn); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1193,7 +1191,7 @@ int __ham_ovfl_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1237,7 +1235,7 @@ __ham_ovfl_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tmetalsn: [%lu][%lu]\n", (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1251,11 +1249,12 @@ __ham_ovfl_read(recbuf, argpp) { __ham_ovfl_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_ovfl_args *)__db_malloc(sizeof(__ham_ovfl_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_ovfl_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -1312,8 +1311,7 @@ int __ham_copypage_log(logp, txnid, ret_lsnp, flags, rectype = DB_ham_copypage; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -1326,8 +1324,8 @@ int __ham_copypage_log(logp, txnid, ret_lsnp, flags, + sizeof(nnext_pgno) + sizeof(*nnextlsn) + sizeof(u_int32_t) + (page == NULL ? 
0 : page->size); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -1376,7 +1374,7 @@ int __ham_copypage_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -1432,7 +1430,7 @@ __ham_copypage_print(notused1, dbtp, lsnp, notused2, notused3) } printf("\n"); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -1446,11 +1444,12 @@ __ham_copypage_read(recbuf, argpp) { __ham_copypage_args *argp; u_int8_t *bp; + int ret; - argp = (__ham_copypage_args *)__db_malloc(sizeof(__ham_copypage_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__ham_copypage_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); diff --git a/db2/hash/hash_debug.c b/db2/hash/hash_debug.c deleted file mode 100644 index 232906ae34..0000000000 --- a/db2/hash/hash_debug.c +++ /dev/null @@ -1,92 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998 - * Sleepycat Software. All rights reserved. - */ -/* - * Copyright (c) 1995 - * The President and Fellows of Harvard University. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Jeremy Rassen. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "config.h" - -#ifndef lint -static const char sccsid[] = "@(#)hash_debug.c 10.6 (Sleepycat) 5/7/98"; -#endif /* not lint */ - -#ifdef DEBUG -/* - * PACKAGE: hashing - * - * DESCRIPTION: - * Debug routines. 
- * - * ROUTINES: - * - * External - * __dump_bucket - */ -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> -#endif - -#include "db_int.h" -#include "db_page.h" -#include "hash.h" - -/* - * __ham_dump_bucket -- - * - * PUBLIC: #ifdef DEBUG - * PUBLIC: void __ham_dump_bucket __P((HTAB *, u_int32_t)); - * PUBLIC: #endif - */ -void -__ham_dump_bucket(hashp, bucket) - HTAB *hashp; - u_int32_t bucket; -{ - PAGE *p; - db_pgno_t pgno; - - for (pgno = BUCKET_TO_PAGE(hashp, bucket); pgno != PGNO_INVALID;) { - if (memp_fget(hashp->dbp->mpf, &pgno, 0, &p) != 0) - break; - (void)__db_prpage(p, 1); - pgno = p->next_pgno; - (void)memp_fput(hashp->dbp->mpf, p, 0); - } -} -#endif /* DEBUG */ diff --git a/db2/hash/hash_dup.c b/db2/hash/hash_dup.c index ba248ddb17..bb3466428d 100644 --- a/db2/hash/hash_dup.c +++ b/db2/hash/hash_dup.c @@ -42,7 +42,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)hash_dup.c 10.14 (Sleepycat) 5/7/98"; +static const char sccsid[] = "@(#)hash_dup.c 10.27 (Sleepycat) 12/6/98"; #endif /* not lint */ /* @@ -61,15 +61,17 @@ static const char sccsid[] = "@(#)hash_dup.c 10.14 (Sleepycat) 5/7/98"; #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#include <errno.h> #include <string.h> #endif #include "db_int.h" #include "db_page.h" #include "hash.h" +#include "btree.h" -static int __ham_check_move __P((HTAB *, HASH_CURSOR *, int32_t)); -static int __ham_dup_convert __P((HTAB *, HASH_CURSOR *)); +static int __ham_check_move __P((DBC *, int32_t)); +static int __ham_dup_convert __P((DBC *)); static int __ham_make_dup __P((const DBT *, DBT *d, void **, u_int32_t *)); /* @@ -85,26 +87,29 @@ static int __ham_make_dup __P((const DBT *, DBT *d, void **, u_int32_t *)); * Case 4: The element is large enough to push the duplicate set onto a * separate page. 
* - * PUBLIC: int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); + * PUBLIC: int __ham_add_dup __P((DBC *, DBT *, u_int32_t)); */ int -__ham_add_dup(hashp, hcp, nval, flags) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_add_dup(dbc, nval, flags) + DBC *dbc; DBT *nval; u_int32_t flags; { - DBT pval, tmp_val; + DB *dbp; + HASH_CURSOR *hcp; + DBT dbt, pval, tmp_val; u_int32_t del_len, new_size; - int ret; + int cmp, ret; u_int8_t *hk; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; if (flags == DB_CURRENT && hcp->dpgno == PGNO_INVALID) del_len = hcp->dup_len; else del_len = 0; - if ((ret = __ham_check_move(hashp, hcp, + if ((ret = __ham_check_move(dbc, (int32_t)DUP_SIZE(nval->size) - (int32_t)del_len)) != 0) return (ret); @@ -117,7 +122,7 @@ __ham_add_dup(hashp, hcp, nval, flags) */ hk = H_PAIRDATA(hcp->pagep, hcp->bndx); new_size = DUP_SIZE(nval->size) - del_len + LEN_HKEYDATA(hcp->pagep, - hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx)); + hcp->hdr->pagesize, H_DATAINDEX(hcp->bndx)); /* * We convert to off-page duplicates if the item is a big item, @@ -125,10 +130,10 @@ __ham_add_dup(hashp, hcp, nval, flags) * if there isn't enough room on this page to add the next item. 
*/ if (HPAGE_PTYPE(hk) != H_OFFDUP && - (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hashp, new_size) || + (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hcp, new_size) || DUP_SIZE(nval->size) - del_len > P_FREESPACE(hcp->pagep))) { - if ((ret = __ham_dup_convert(hashp, hcp)) != 0) + if ((ret = __ham_dup_convert(dbc)) != 0) return (ret); else hk = H_PAIRDATA(hcp->pagep, hcp->bndx); @@ -140,30 +145,44 @@ __ham_add_dup(hashp, hcp, nval, flags) HPAGE_PTYPE(hk) = H_DUPLICATE; pval.flags = 0; pval.data = HKEYDATA_DATA(hk); - pval.size = LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, + pval.size = LEN_HDATA(hcp->pagep, dbp->pgsize, hcp->bndx); if ((ret = - __ham_make_dup(&pval, &tmp_val, &hcp->big_data, - &hcp->big_datalen)) != 0 || (ret = - __ham_replpair(hashp, hcp, &tmp_val, 1)) != 0) + __ham_make_dup(&pval, &tmp_val, &dbc->rdata.data, + &dbc->rdata.size)) != 0 || (ret = + __ham_replpair(dbc, &tmp_val, 1)) != 0) return (ret); } /* Now make the new entry a duplicate. */ if ((ret = __ham_make_dup(nval, - &tmp_val, &hcp->big_data, &hcp->big_datalen)) != 0) + &tmp_val, &dbc->rdata.data, &dbc->rdata.size)) != 0) return (ret); tmp_val.dlen = 0; switch (flags) { /* On page. */ case DB_KEYFIRST: - tmp_val.doff = 0; - break; case DB_KEYLAST: - tmp_val.doff = LEN_HDATA(hcp->pagep, - hashp->hdr->pagesize, hcp->bndx); + if (dbp->dup_compare != NULL) + __ham_dsearch(dbc, nval, &tmp_val.doff, &cmp); + else if (flags == DB_KEYFIRST) + tmp_val.doff = 0; + else + tmp_val.doff = LEN_HDATA(hcp->pagep, + hcp->hdr->pagesize, hcp->bndx); break; case DB_CURRENT: + /* + * If we have a sort function, we need to verify that + * the new item sorts identically to the old item. 
+ */ + if (dbp->dup_compare != NULL) { + dbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, + hcp->bndx)) + hcp->dup_off; + dbt.size = DUP_SIZE(hcp->dup_len); + if (dbp->dup_compare(nval, &dbt) != 0) + return (EINVAL); + } tmp_val.doff = hcp->dup_off; tmp_val.dlen = DUP_SIZE(hcp->dup_len); break; @@ -175,9 +194,9 @@ __ham_add_dup(hashp, hcp, nval, flags) break; } /* Add the duplicate. */ - ret = __ham_replpair(hashp, hcp, &tmp_val, 0); + ret = __ham_replpair(dbc, &tmp_val, 0); if (ret == 0) - ret = __ham_dirty_page(hashp, hcp->pagep); + ret = __ham_dirty_page(dbp, hcp->pagep); __ham_c_update(hcp, hcp->pgno, tmp_val.size, 1, 1); return (ret); } @@ -190,27 +209,48 @@ __ham_add_dup(hashp, hcp, nval, flags) switch (flags) { case DB_KEYFIRST: + if (dbp->dup_compare != NULL) + goto sorted_dups; /* * The only way that we are already on a dup page is * if we just converted the on-page representation. * In that case, we've only got one page of duplicates. */ if (hcp->dpagep == NULL && (ret = - __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0) + __db_dend(dbc, hcp->dpgno, &hcp->dpagep)) != 0) return (ret); hcp->dndx = 0; break; case DB_KEYLAST: - if (hcp->dpagep == NULL && (ret = - __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0) - return (ret); - hcp->dpgno = PGNO(hcp->dpagep); - hcp->dndx = NUM_ENT(hcp->dpagep); + if (dbp->dup_compare != NULL) { +sorted_dups: if ((ret = __db_dsearch(dbc, 1, nval, + hcp->dpgno, &hcp->dndx, &hcp->dpagep, &cmp)) != 0) + return (ret); + if (cmp == 0) + hcp->dpgno = PGNO(hcp->dpagep); + } else { + if (hcp->dpagep == NULL && (ret = + __db_dend(dbc, hcp->dpgno, &hcp->dpagep)) != 0) + return (ret); + hcp->dpgno = PGNO(hcp->dpagep); + hcp->dndx = NUM_ENT(hcp->dpagep); + } break; case DB_CURRENT: - if ((ret = __db_ditem(hashp->dbp, hcp->dpagep, hcp->dndx, - BKEYDATA_SIZE(GET_BKEYDATA(hcp->dpagep, hcp->dndx)->len))) - != 0) + if (dbp->dup_compare != NULL && __bam_cmp(dbp, + nval, hcp->dpagep, hcp->dndx, dbp->dup_compare) != 0) + return 
(EINVAL); + switch (GET_BKEYDATA(hcp->dpagep, hcp->dndx)->type) { + case B_KEYDATA: + del_len = BKEYDATA_SIZE(GET_BKEYDATA(hcp->dpagep, + hcp->dndx)->len); + break; + case B_OVERFLOW: + del_len = BOVERFLOW_SIZE; + break; + } + if ((ret = + __db_ditem(dbc, hcp->dpagep, hcp->dndx, del_len)) != 0) return (ret); break; case DB_BEFORE: /* The default behavior is correct. */ @@ -220,7 +260,7 @@ __ham_add_dup(hashp, hcp, nval, flags) break; } - ret = __db_dput(hashp->dbp, + ret = __db_dput(dbc, nval, &hcp->dpagep, &hcp->dndx, __ham_overflow_page); hcp->pgno = PGNO(hcp->pagep); __ham_c_update(hcp, hcp->pgno, nval->size, 1, 1); @@ -231,22 +271,25 @@ __ham_add_dup(hashp, hcp, nval, flags) * Convert an on-page set of duplicates to an offpage set of duplicates. */ static int -__ham_dup_convert(hashp, hcp) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_dup_convert(dbc) + DBC *dbc; { + DB *dbp; + HASH_CURSOR *hcp; BOVERFLOW bo; DBT dbt; HOFFPAGE ho; - db_indx_t dndx, len; + db_indx_t dndx, i, len, off; int ret; u_int8_t *p, *pend; /* * Create a new page for the duplicates. */ + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; if ((ret = - __ham_overflow_page(hashp->dbp, P_DUPLICATE, &hcp->dpagep)) != 0) + __ham_overflow_page(dbc, P_DUPLICATE, &hcp->dpagep)) != 0) return (ret); hcp->dpagep->type = P_DUPLICATE; hcp->dpgno = PGNO(hcp->dpagep); @@ -254,67 +297,80 @@ __ham_dup_convert(hashp, hcp) /* * Now put the duplicates onto the new page. */ + dndx = 0; dbt.flags = 0; switch (HPAGE_PTYPE(H_PAIRDATA(hcp->pagep, hcp->bndx))) { case H_KEYDATA: /* Simple case, one key on page; move it to dup page. 
*/ - dndx = 0; dbt.size = - LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx); + LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx); dbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)); - ret = __db_pitem(hashp->dbp, hcp->dpagep, + ret = __db_pitem(dbc, hcp->dpagep, (u_int32_t)dndx, BKEYDATA_SIZE(dbt.size), NULL, &dbt); if (ret == 0) - __ham_dirty_page(hashp, hcp->dpagep); + __ham_dirty_page(dbp, hcp->dpagep); break; case H_OFFPAGE: /* Simple case, one key on page; move it to dup page. */ - dndx = 0; memcpy(&ho, P_ENTRY(hcp->pagep, H_DATAINDEX(hcp->bndx)), HOFFPAGE_SIZE); + UMRW(bo.unused1); B_TSET(bo.type, ho.type, 0); + UMRW(bo.unused2); bo.pgno = ho.pgno; bo.tlen = ho.tlen; dbt.size = BOVERFLOW_SIZE; dbt.data = &bo; - ret = __db_pitem(hashp->dbp, hcp->dpagep, + ret = __db_pitem(dbc, hcp->dpagep, (u_int32_t)dndx, dbt.size, &dbt, NULL); if (ret == 0) - __ham_dirty_page(hashp, hcp->dpagep); + __ham_dirty_page(dbp, hcp->dpagep); break; case H_DUPLICATE: p = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)); pend = p + - LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx); + LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx); - for (dndx = 0; p < pend; dndx++) { + /* + * We need to maintain the duplicate cursor position. + * Keep track of where we are in the duplicate set via + * the offset, and when it matches the one in the cursor, + * set the off-page duplicate cursor index to the current + * index. 
+ */ + for (off = 0, i = 0; p < pend; i++) { + if (off == hcp->dup_off) + dndx = i; memcpy(&len, p, sizeof(db_indx_t)); dbt.size = len; p += sizeof(db_indx_t); dbt.data = p; p += len + sizeof(db_indx_t); - ret = __db_dput(hashp->dbp, &dbt, - &hcp->dpagep, &dndx, __ham_overflow_page); + off += len + 2 * sizeof(db_indx_t); + ret = __db_dput(dbc, &dbt, + &hcp->dpagep, &i, __ham_overflow_page); if (ret != 0) break; } break; default: - ret = __db_pgfmt(hashp->dbp, (u_long)hcp->pgno); + ret = __db_pgfmt(dbp, (u_long)hcp->pgno); + break; } if (ret == 0) { /* * Now attach this to the source page in place of * the old duplicate item. */ - __ham_move_offpage(hashp, hcp->pagep, + __ham_move_offpage(dbc, hcp->pagep, (u_int32_t)H_DATAINDEX(hcp->bndx), hcp->dpgno); /* Can probably just do a "put" here. */ - ret = __ham_dirty_page(hashp, hcp->pagep); + ret = __ham_dirty_page(dbp, hcp->pagep); + hcp->dndx = dndx; } else { - (void)__ham_del_page(hashp->dbp, hcp->dpagep); + (void)__ham_del_page(dbc, hcp->dpagep); hcp->dpagep = NULL; } return (ret); @@ -354,11 +410,12 @@ __ham_make_dup(notdup, duplicate, bufp, sizep) } static int -__ham_check_move(hashp, hcp, add_len) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_check_move(dbc, add_len) + DBC *dbc; int32_t add_len; { + DB *dbp; + HASH_CURSOR *hcp; DBT k, d; DB_LSN new_lsn; PAGE *next_pagep; @@ -367,6 +424,8 @@ __ham_check_move(hashp, hcp, add_len) u_int8_t *hk; int ret; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; /* * Check if we can do whatever we need to on this page. If not, * then we'll have to move the current element to a new page. @@ -381,7 +440,7 @@ __ham_check_move(hashp, hcp, add_len) return (0); old_len = - LEN_HITEM(hcp->pagep, hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx)); + LEN_HITEM(hcp->pagep, hcp->hdr->pagesize, H_DATAINDEX(hcp->bndx)); new_datalen = old_len - HKEYDATA_SIZE(0) + add_len; /* @@ -392,11 +451,11 @@ __ham_check_move(hashp, hcp, add_len) * threshold, but the new data won't fit on the page. 
* If neither of these is true, then we can return. */ - if (ISBIG(hashp, new_datalen) && (old_len > HOFFDUP_SIZE || + if (ISBIG(hcp, new_datalen) && (old_len > HOFFDUP_SIZE || HOFFDUP_SIZE - old_len <= P_FREESPACE(hcp->pagep))) return (0); - if (!ISBIG(hashp, new_datalen) && + if (!ISBIG(hcp, new_datalen) && add_len <= (int32_t)P_FREESPACE(hcp->pagep)) return (0); @@ -405,18 +464,18 @@ __ham_check_move(hashp, hcp, add_len) * Check if there are more pages in the chain. */ - new_datalen = ISBIG(hashp, new_datalen) ? + new_datalen = ISBIG(hcp, new_datalen) ? HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen); next_pagep = NULL; for (next_pgno = NEXT_PGNO(hcp->pagep); next_pgno != PGNO_INVALID; next_pgno = NEXT_PGNO(next_pagep)) { if (next_pagep != NULL && - (ret = __ham_put_page(hashp->dbp, next_pagep, 0)) != 0) + (ret = __ham_put_page(dbp, next_pagep, 0)) != 0) return (ret); if ((ret = - __ham_get_page(hashp->dbp, next_pgno, &next_pagep)) != 0) + __ham_get_page(dbp, next_pgno, &next_pagep)) != 0) return (ret); if (P_FREESPACE(next_pagep) >= new_datalen) @@ -424,17 +483,17 @@ __ham_check_move(hashp, hcp, add_len) } /* No more pages, add one. */ - if (next_pagep == NULL && - (ret = __ham_add_ovflpage(hashp, hcp->pagep, 0, &next_pagep)) != 0) + if (next_pagep == NULL && (ret = __ham_add_ovflpage(dbc, + hcp->pagep, 0, &next_pagep)) != 0) return (ret); /* Add new page at the end of the chain. */ - if (P_FREESPACE(next_pagep) < new_datalen && - (ret = __ham_add_ovflpage(hashp, next_pagep, 1, &next_pagep)) != 0) + if (P_FREESPACE(next_pagep) < new_datalen && (ret = + __ham_add_ovflpage(dbc, next_pagep, 1, &next_pagep)) != 0) return (ret); /* Copy the item to the new page. 
*/ - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(hcp->dbc)) { rectype = PUTPAIR; k.flags = 0; d.flags = 0; @@ -447,7 +506,7 @@ __ham_check_move(hashp, hcp, add_len) k.data = HKEYDATA_DATA(H_PAIRKEY(hcp->pagep, hcp->bndx)); k.size = LEN_HKEY(hcp->pagep, - hashp->hdr->pagesize, hcp->bndx); + hcp->hdr->pagesize, hcp->bndx); } if (HPAGE_PTYPE(hk) == H_OFFPAGE) { @@ -458,13 +517,13 @@ __ham_check_move(hashp, hcp, add_len) d.data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)); d.size = LEN_HDATA(hcp->pagep, - hashp->hdr->pagesize, hcp->bndx); + hcp->hdr->pagesize, hcp->bndx); } - if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype, - hashp->dbp->log_fileid, PGNO(next_pagep), + if ((ret = __ham_insdel_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, rectype, + dbp->log_fileid, PGNO(next_pagep), (u_int32_t)H_NUMPAIRS(next_pagep), &LSN(next_pagep), &k, &d)) != 0) return (ret); @@ -473,13 +532,15 @@ __ham_check_move(hashp, hcp, add_len) LSN(next_pagep) = new_lsn; /* Structure assignment. */ } - __ham_copy_item(hashp, hcp->pagep, H_KEYINDEX(hcp->bndx), next_pagep); - __ham_copy_item(hashp, hcp->pagep, H_DATAINDEX(hcp->bndx), next_pagep); + __ham_copy_item(dbp->pgsize, + hcp->pagep, H_KEYINDEX(hcp->bndx), next_pagep); + __ham_copy_item(dbp->pgsize, + hcp->pagep, H_DATAINDEX(hcp->bndx), next_pagep); /* Now delete the pair from the current page. */ - ret = __ham_del_pair(hashp, hcp, 0); + ret = __ham_del_pair(dbc, 0); - (void)__ham_put_page(hashp->dbp, hcp->pagep, 1); + (void)__ham_put_page(dbp, hcp->pagep, 1); hcp->pagep = next_pagep; hcp->pgno = PGNO(hcp->pagep); hcp->bndx = H_NUMPAIRS(hcp->pagep) - 1; @@ -488,19 +549,25 @@ __ham_check_move(hashp, hcp, add_len) } /* - * Replace an onpage set of duplicates with the OFFDUP structure that - * references the duplicate page. 
- * XXX This is really just a special case of __onpage_replace; we should + * __ham_move_offpage -- + * Replace an onpage set of duplicates with the OFFDUP structure + * that references the duplicate page. + * + * XXX + * This is really just a special case of __onpage_replace; we should * probably combine them. - * PUBLIC: void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t)); + * + * PUBLIC: void __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t)); */ void -__ham_move_offpage(hashp, pagep, ndx, pgno) - HTAB *hashp; +__ham_move_offpage(dbc, pagep, ndx, pgno) + DBC *dbc; PAGE *pagep; u_int32_t ndx; db_pgno_t pgno; { + DB *dbp; + HASH_CURSOR *hcp; DBT new_dbt; DBT old_dbt; HOFFDUP od; @@ -508,22 +575,27 @@ __ham_move_offpage(hashp, pagep, ndx, pgno) int32_t shrink; u_int8_t *src; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; od.type = H_OFFDUP; + UMRW(od.unused[0]); + UMRW(od.unused[1]); + UMRW(od.unused[2]); od.pgno = pgno; - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(dbc)) { new_dbt.data = &od; new_dbt.size = HOFFDUP_SIZE; old_dbt.data = P_ENTRY(pagep, ndx); - old_dbt.size = LEN_HITEM(pagep, hashp->hdr->pagesize, ndx); - (void)__ham_replace_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &LSN(pagep), 0, - hashp->dbp->log_fileid, PGNO(pagep), (u_int32_t)ndx, - &LSN(pagep), -1, &old_dbt, &new_dbt, 0); + old_dbt.size = LEN_HITEM(pagep, hcp->hdr->pagesize, ndx); + (void)__ham_replace_log(dbp->dbenv->lg_info, + dbc->txn, &LSN(pagep), 0, dbp->log_fileid, + PGNO(pagep), (u_int32_t)ndx, &LSN(pagep), -1, + &old_dbt, &new_dbt, 0); } shrink = - LEN_HITEM(pagep, hashp->hdr->pagesize, ndx) - HOFFDUP_SIZE; + LEN_HITEM(pagep, hcp->hdr->pagesize, ndx) - HOFFDUP_SIZE; if (shrink != 0) { /* Copy data. */ @@ -539,3 +611,46 @@ __ham_move_offpage(hashp, pagep, ndx, pgno) /* Now copy the offdup entry onto the page. 
*/ memcpy(P_ENTRY(pagep, ndx), &od, HOFFDUP_SIZE); } + +/* + * __ham_dsearch: + * Locate a particular duplicate in a duplicate set. + * + * PUBLIC: void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *)); + */ +void +__ham_dsearch(dbc, dbt, offp, cmpp) + DBC *dbc; + DBT *dbt; + u_int32_t *offp; + int *cmpp; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT cur; + db_indx_t i, len; + int (*func) __P((const DBT *, const DBT *)); + u_int8_t *data; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + if (dbp->dup_compare == NULL) + func = __bam_defcmp; + else + func = dbp->dup_compare; + + i = F_ISSET(dbc, DBC_CONTINUE) ? hcp->dup_off: 0; + data = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)) + i; + while (i < LEN_HDATA(hcp->pagep, hcp->hdr->pagesize, hcp->bndx)) { + memcpy(&len, data, sizeof(db_indx_t)); + data += sizeof(db_indx_t); + cur.data = data; + cur.size = (u_int32_t)len; + *cmpp = func(dbt, &cur); + if (*cmpp == 0 || (*cmpp < 0 && dbp->dup_compare != NULL)) + break; + i += len + 2 * sizeof(db_indx_t); + data += len + sizeof(db_indx_t); + } + *offp = i; +} diff --git a/db2/hash/hash_page.c b/db2/hash/hash_page.c index 5b3463947b..3419c1215c 100644 --- a/db2/hash/hash_page.c +++ b/db2/hash/hash_page.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)hash_page.c 10.40 (Sleepycat) 6/2/98"; +static const char sccsid[] = "@(#)hash_page.c 10.55 (Sleepycat) 1/3/99"; #endif /* not lint */ /* @@ -77,107 +77,118 @@ static const char sccsid[] = "@(#)hash_page.c 10.40 (Sleepycat) 6/2/98"; #include "db_page.h" #include "hash.h" -static int __ham_lock_bucket __P((DB *, HASH_CURSOR *, db_lockmode_t)); +static int __ham_lock_bucket __P((DBC *, db_lockmode_t)); #ifdef DEBUG_SLOW -static void __account_page(HTAB *, db_pgno_t, int); +static void __account_page(DB *, db_pgno_t, int); #endif /* - * PUBLIC: int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t)); */ int 
-__ham_item(hashp, cursorp, mode) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + DB *dbp; + HASH_CURSOR *hcp; db_pgno_t next_pgno; int ret; - if (F_ISSET(cursorp, H_DELETED)) + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + if (F_ISSET(hcp, H_DELETED)) return (EINVAL); - F_CLR(cursorp, H_OK | H_NOMORE); + F_CLR(hcp, H_OK | H_NOMORE); /* Check if we need to get a page for this cursor. */ - if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + if ((ret = __ham_get_cpage(dbc, mode)) != 0) return (ret); /* Check if we are looking for space in which to insert an item. */ - if (cursorp->seek_size && cursorp->seek_found_page == PGNO_INVALID - && cursorp->seek_size < P_FREESPACE(cursorp->pagep)) - cursorp->seek_found_page = cursorp->pgno; + if (hcp->seek_size && hcp->seek_found_page == PGNO_INVALID + && hcp->seek_size < P_FREESPACE(hcp->pagep)) + hcp->seek_found_page = hcp->pgno; /* Check if we need to go on to the next page. */ - if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno == PGNO_INVALID) + if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno == PGNO_INVALID) /* * ISDUP is set, and offset is at the beginning of the datum. * We need to grab the length of the datum, then set the datum * pointer to be the beginning of the datum. */ - memcpy(&cursorp->dup_len, - HKEYDATA_DATA(H_PAIRDATA(cursorp->pagep, cursorp->bndx)) + - cursorp->dup_off, sizeof(db_indx_t)); - else if (F_ISSET(cursorp, H_ISDUP)) { + memcpy(&hcp->dup_len, + HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)) + + hcp->dup_off, sizeof(db_indx_t)); + else if (F_ISSET(hcp, H_ISDUP)) { /* Make sure we're not about to run off the page. 
*/ - if (cursorp->dpagep == NULL && (ret = __ham_get_page(hashp->dbp, - cursorp->dpgno, &cursorp->dpagep)) != 0) + if (hcp->dpagep == NULL && (ret = __ham_get_page(dbp, + hcp->dpgno, &hcp->dpagep)) != 0) return (ret); - if (cursorp->dndx >= NUM_ENT(cursorp->dpagep)) { - if (NEXT_PGNO(cursorp->dpagep) == PGNO_INVALID) { - if ((ret = __ham_put_page(hashp->dbp, - cursorp->dpagep, 0)) != 0) + if (hcp->dndx >= NUM_ENT(hcp->dpagep)) { + if (NEXT_PGNO(hcp->dpagep) == PGNO_INVALID) { + if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } + if ((ret = __ham_put_page(dbp, + hcp->dpagep, 0)) != 0) return (ret); - F_CLR(cursorp, H_ISDUP); - cursorp->dpagep = NULL; - cursorp->dpgno = PGNO_INVALID; - cursorp->dndx = NDX_INVALID; - cursorp->bndx++; - } else if ((ret = __ham_next_cpage(hashp, cursorp, - NEXT_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0) + F_CLR(hcp, H_ISDUP); + hcp->dpagep = NULL; + hcp->dpgno = PGNO_INVALID; + hcp->dndx = NDX_INVALID; + hcp->bndx++; + } else if ((ret = __ham_next_cpage(dbc, + NEXT_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0) return (ret); } } - if (cursorp->bndx >= (db_indx_t)H_NUMPAIRS(cursorp->pagep)) { + if (hcp->bndx >= (db_indx_t)H_NUMPAIRS(hcp->pagep)) { /* Fetch next page. 
*/ - if (NEXT_PGNO(cursorp->pagep) == PGNO_INVALID) { - F_SET(cursorp, H_NOMORE); - if (cursorp->dpagep != NULL && - (ret = __ham_put_page(hashp->dbp, - cursorp->dpagep, 0)) != 0) + if (NEXT_PGNO(hcp->pagep) == PGNO_INVALID) { + F_SET(hcp, H_NOMORE); + if (hcp->dpagep != NULL && + (ret = __ham_put_page(dbp, hcp->dpagep, 0)) != 0) return (ret); - cursorp->dpgno = PGNO_INVALID; + hcp->dpgno = PGNO_INVALID; return (DB_NOTFOUND); } - next_pgno = NEXT_PGNO(cursorp->pagep); - cursorp->bndx = 0; - if ((ret = __ham_next_cpage(hashp, - cursorp, next_pgno, 0, 0)) != 0) + next_pgno = NEXT_PGNO(hcp->pagep); + hcp->bndx = 0; + if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0) return (ret); } - F_SET(cursorp, H_OK); + F_SET(hcp, H_OK); return (0); } /* - * PUBLIC: int __ham_item_reset __P((HTAB *, HASH_CURSOR *)); + * PUBLIC: int __ham_item_reset __P((DBC *)); */ int -__ham_item_reset(hashp, cursorp) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item_reset(dbc) + DBC *dbc; { + HASH_CURSOR *hcp; + DB *dbp; int ret; - if (cursorp->pagep) - ret = __ham_put_page(hashp->dbp, cursorp->pagep, 0); - else - ret = 0; - - __ham_item_init(cursorp); + ret = 0; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + if (hcp->pagep != NULL) + ret = __ham_put_page(dbp, hcp->pagep, 0); + if (ret == 0 && hcp->dpagep != NULL) + ret = __ham_put_page(dbp, hcp->dpagep, 0); + + __ham_item_init(hcp); return (ret); } @@ -185,57 +196,67 @@ __ham_item_reset(hashp, cursorp) * PUBLIC: void __ham_item_init __P((HASH_CURSOR *)); */ void -__ham_item_init(cursorp) - HASH_CURSOR *cursorp; +__ham_item_init(hcp) + HASH_CURSOR *hcp; { - cursorp->pagep = NULL; - cursorp->bucket = BUCKET_INVALID; - cursorp->lock = 0; - cursorp->bndx = NDX_INVALID; - cursorp->pgno = PGNO_INVALID; - cursorp->dpgno = PGNO_INVALID; - cursorp->dndx = NDX_INVALID; - cursorp->dpagep = NULL; - cursorp->flags = 0; - cursorp->seek_size = 0; - cursorp->seek_found_page = PGNO_INVALID; + /* + * If this cursor still holds any locks, we 
must + * release them if we are not running with transactions. + */ + if (hcp->lock && hcp->dbc->txn == NULL) + (void)lock_put(hcp->dbc->dbp->dbenv->lk_info, hcp->lock); + + /* + * The following fields must *not* be initialized here + * because they may have meaning across inits. + * hlock, hdr, split_buf, stats + */ + hcp->bucket = BUCKET_INVALID; + hcp->lbucket = BUCKET_INVALID; + hcp->lock = 0; + hcp->pagep = NULL; + hcp->pgno = PGNO_INVALID; + hcp->bndx = NDX_INVALID; + hcp->dpagep = NULL; + hcp->dpgno = PGNO_INVALID; + hcp->dndx = NDX_INVALID; + hcp->dup_off = 0; + hcp->dup_len = 0; + hcp->dup_tlen = 0; + hcp->seek_size = 0; + hcp->seek_found_page = PGNO_INVALID; + hcp->flags = 0; } /* - * PUBLIC: int __ham_item_done __P((HTAB *, HASH_CURSOR *, int)); + * PUBLIC: int __ham_item_done __P((DBC *, int)); */ int -__ham_item_done(hashp, cursorp, dirty) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item_done(dbc, dirty) + DBC *dbc; int dirty; { + DB *dbp; + HASH_CURSOR *hcp; int ret, t_ret; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; t_ret = ret = 0; - if (cursorp->pagep) - ret = __ham_put_page(hashp->dbp, cursorp->pagep, - dirty && cursorp->dpagep == NULL); - cursorp->pagep = NULL; + if (hcp->pagep) + ret = __ham_put_page(dbp, hcp->pagep, + dirty && hcp->dpagep == NULL); + hcp->pagep = NULL; - if (cursorp->dpagep) - t_ret = __ham_put_page(hashp->dbp, cursorp->dpagep, dirty); - cursorp->dpagep = NULL; + if (hcp->dpagep) + t_ret = __ham_put_page(dbp, hcp->dpagep, dirty); + hcp->dpagep = NULL; if (ret == 0 && t_ret != 0) ret = t_ret; /* - * If we are running with transactions, then we must - * not relinquish locks explicitly. - */ - if (cursorp->lock && hashp->dbp->txn == NULL) - t_ret = lock_put(hashp->dbp->dbenv->lk_info, cursorp->lock); - cursorp->lock = 0; - - - /* * We don't throw out the page number since we might want to * continue getting on this page. 
*/ @@ -245,40 +266,42 @@ __ham_item_done(hashp, cursorp, dirty) /* * Returns the last item in a bucket. * - * PUBLIC: int __ham_item_last __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t)); */ int -__ham_item_last(hashp, cursorp, mode) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item_last(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + HASH_CURSOR *hcp; int ret; - if ((ret = __ham_item_reset(hashp, cursorp)) != 0) + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_item_reset(dbc)) != 0) return (ret); - cursorp->bucket = hashp->hdr->max_bucket; - F_SET(cursorp, H_OK); - return (__ham_item_prev(hashp, cursorp, mode)); + hcp->bucket = hcp->hdr->max_bucket; + F_SET(hcp, H_OK); + return (__ham_item_prev(dbc, mode)); } /* - * PUBLIC: int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t)); */ int -__ham_item_first(hashp, cursorp, mode) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item_first(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + HASH_CURSOR *hcp; int ret; - if ((ret = __ham_item_reset(hashp, cursorp)) != 0) + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_item_reset(dbc)) != 0) return (ret); - F_SET(cursorp, H_OK); - cursorp->bucket = 0; - return (__ham_item_next(hashp, cursorp, mode)); + F_SET(hcp, H_OK); + hcp->bucket = 0; + return (__ham_item_next(dbc, mode)); } /* @@ -287,17 +310,20 @@ __ham_item_first(hashp, cursorp, mode) * bigkeys, just returns the page number and index of the bigkey * pointer pair. 
* - * PUBLIC: int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t)); */ int -__ham_item_prev(hashp, cursorp, mode) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item_prev(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + DB *dbp; + HASH_CURSOR *hcp; db_pgno_t next_pgno; int ret; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; /* * There are N cases for backing up in a hash file. * Case 1: In the middle of a page, no duplicates, just dec the index. @@ -307,52 +333,56 @@ __ham_item_prev(hashp, cursorp, mode) * Case 4: At the beginning of a page; go to previous page. * Case 5: At the beginning of a bucket; go to prev bucket. */ - F_CLR(cursorp, H_OK | H_NOMORE | H_DELETED); + F_CLR(hcp, H_OK | H_NOMORE | H_DELETED); /* * First handle the duplicates. Either you'll get the key here * or you'll exit the duplicate set and drop into the code below * to handle backing up through keys. */ - if (F_ISSET(cursorp, H_ISDUP)) { - if (cursorp->dpgno == PGNO_INVALID) { + if (F_ISSET(hcp, H_ISDUP)) { + if (hcp->dpgno == PGNO_INVALID) { /* Duplicates are on-page. */ - if (cursorp->dup_off != 0) { - if ((ret = __ham_get_cpage(hashp, - cursorp, mode)) != 0) + if (hcp->dup_off != 0) { + if ((ret = __ham_get_cpage(dbc, mode)) != 0) return (ret); else { HASH_CURSOR *h; - h = cursorp; + h = hcp; memcpy(&h->dup_len, HKEYDATA_DATA( H_PAIRDATA(h->pagep, h->bndx)) + h->dup_off - sizeof(db_indx_t), sizeof(db_indx_t)); - cursorp->dup_off -= - DUP_SIZE(cursorp->dup_len); - cursorp->dndx--; - return (__ham_item(hashp, - cursorp, mode)); + hcp->dup_off -= + DUP_SIZE(hcp->dup_len); + hcp->dndx--; + return (__ham_item(dbc, mode)); } } - } else if (cursorp->dndx > 0) { /* Duplicates are off-page. */ - cursorp->dndx--; - return (__ham_item(hashp, cursorp, mode)); - } else if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + } else if (hcp->dndx > 0) { /* Duplicates are off-page. 
*/ + hcp->dndx--; + return (__ham_item(dbc, mode)); + } else if ((ret = __ham_get_cpage(dbc, mode)) != 0) return (ret); - else if (PREV_PGNO(cursorp->dpagep) == PGNO_INVALID) { - F_CLR(cursorp, H_ISDUP); /* End of dups */ - cursorp->dpgno = PGNO_INVALID; - if (cursorp->dpagep != NULL) - (void)__ham_put_page(hashp->dbp, - cursorp->dpagep, 0); - cursorp->dpagep = NULL; - } else if ((ret = __ham_next_cpage(hashp, cursorp, - PREV_PGNO(cursorp->dpagep), 0, H_ISDUP)) != 0) + else if (PREV_PGNO(hcp->dpagep) == PGNO_INVALID) { + if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } else { + F_CLR(hcp, H_ISDUP); /* End of dups */ + hcp->dpgno = PGNO_INVALID; + if (hcp->dpagep != NULL) + (void)__ham_put_page(dbp, + hcp->dpagep, 0); + hcp->dpagep = NULL; + } + } else if ((ret = __ham_next_cpage(dbc, + PREV_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0) return (ret); else { - cursorp->dndx = NUM_ENT(cursorp->pagep) - 1; - return (__ham_item(hashp, cursorp, mode)); + hcp->dndx = NUM_ENT(hcp->pagep) - 1; + return (__ham_item(dbc, mode)); } } @@ -362,95 +392,123 @@ __ham_item_prev(hashp, cursorp, mode) * midpage, beginning of page, beginning of bucket. */ - if (cursorp->bndx == 0) { /* Beginning of page. */ - if ((ret = __ham_get_cpage(hashp, cursorp, mode)) != 0) + if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } + + if (hcp->bndx == 0) { /* Beginning of page. */ + if ((ret = __ham_get_cpage(dbc, mode)) != 0) return (ret); - cursorp->pgno = PREV_PGNO(cursorp->pagep); - if (cursorp->pgno == PGNO_INVALID) { + hcp->pgno = PREV_PGNO(hcp->pagep); + if (hcp->pgno == PGNO_INVALID) { /* Beginning of bucket. 
*/ - F_SET(cursorp, H_NOMORE); + F_SET(hcp, H_NOMORE); return (DB_NOTFOUND); - } else if ((ret = __ham_next_cpage(hashp, - cursorp, cursorp->pgno, 0, 0)) != 0) + } else if ((ret = + __ham_next_cpage(dbc, hcp->pgno, 0, 0)) != 0) return (ret); else - cursorp->bndx = H_NUMPAIRS(cursorp->pagep); + hcp->bndx = H_NUMPAIRS(hcp->pagep); } /* * Either we've got the cursor set up to be decremented, or we * have to find the end of a bucket. */ - if (cursorp->bndx == NDX_INVALID) { - if (cursorp->pagep == NULL) - next_pgno = BUCKET_TO_PAGE(hashp, cursorp->bucket); + if (hcp->bndx == NDX_INVALID) { + if (hcp->pagep == NULL) + next_pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); else goto got_page; do { - if ((ret = __ham_next_cpage(hashp, - cursorp, next_pgno, 0, 0)) != 0) + if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0) return (ret); -got_page: next_pgno = NEXT_PGNO(cursorp->pagep); - cursorp->bndx = H_NUMPAIRS(cursorp->pagep); +got_page: next_pgno = NEXT_PGNO(hcp->pagep); + hcp->bndx = H_NUMPAIRS(hcp->pagep); } while (next_pgno != PGNO_INVALID); - if (cursorp->bndx == 0) { + if (hcp->bndx == 0) { /* Bucket was empty. */ - F_SET(cursorp, H_NOMORE); + F_SET(hcp, H_NOMORE); return (DB_NOTFOUND); } } - cursorp->bndx--; + hcp->bndx--; - return (__ham_item(hashp, cursorp, mode)); + return (__ham_item(dbc, mode)); } /* * Sets the cursor to the next key/data pair on a page. * - * PUBLIC: int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t)); */ int -__ham_item_next(hashp, cursorp, mode) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_item_next(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + HASH_CURSOR *hcp; + + hcp = (HASH_CURSOR *)dbc->internal; /* * Deleted on-page duplicates are a weird case. If we delete the last * one, then our cursor is at the very end of a duplicate set and * we actually need to go on to the next key. 
*/ - if (F_ISSET(cursorp, H_DELETED)) { - if (cursorp->bndx != NDX_INVALID && - F_ISSET(cursorp, H_ISDUP) && - cursorp->dpgno == PGNO_INVALID && - cursorp->dup_tlen == cursorp->dup_off) { - F_CLR(cursorp, H_ISDUP); - cursorp->dpgno = PGNO_INVALID; - cursorp->bndx++; + if (F_ISSET(hcp, H_DELETED)) { + if (hcp->bndx != NDX_INVALID && + F_ISSET(hcp, H_ISDUP) && + hcp->dpgno == PGNO_INVALID && + hcp->dup_tlen == hcp->dup_off) { + if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } else { + F_CLR(hcp, H_ISDUP); + hcp->dpgno = PGNO_INVALID; + hcp->bndx++; + } + } else if (!F_ISSET(hcp, H_ISDUP) && + F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); } - F_CLR(cursorp, H_DELETED); - } else if (cursorp->bndx == NDX_INVALID) { - cursorp->bndx = 0; - cursorp->dpgno = PGNO_INVALID; - F_CLR(cursorp, H_ISDUP); - } else if (F_ISSET(cursorp, H_ISDUP) && cursorp->dpgno != PGNO_INVALID) - cursorp->dndx++; - else if (F_ISSET(cursorp, H_ISDUP)) { - cursorp->dndx++; - cursorp->dup_off += DUP_SIZE(cursorp->dup_len); - if (cursorp->dup_off >= cursorp->dup_tlen) { - F_CLR(cursorp, H_ISDUP); - cursorp->dpgno = PGNO_INVALID; - cursorp->bndx++; + F_CLR(hcp, H_DELETED); + } else if (hcp->bndx == NDX_INVALID) { + hcp->bndx = 0; + hcp->dpgno = PGNO_INVALID; + F_CLR(hcp, H_ISDUP); + } else if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID) + hcp->dndx++; + else if (F_ISSET(hcp, H_ISDUP)) { + if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >= + hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } + hcp->dndx++; + hcp->dup_off += DUP_SIZE(hcp->dup_len); + if (hcp->dup_off >= hcp->dup_tlen) { + F_CLR(hcp, H_ISDUP); + hcp->dpgno = PGNO_INVALID; + hcp->bndx++; } + } else if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); } else - cursorp->bndx++; + hcp->bndx++; - return (__ham_item(hashp, cursorp, mode)); + return (__ham_item(dbc, 
mode)); } /* @@ -537,18 +595,15 @@ __ham_reputpair(p, psize, ndx, key, data) /* - * PUBLIC: int __ham_del_pair __P((HTAB *, HASH_CURSOR *, int)); - * - * XXX - * TODO: if the item is an offdup, delete the other pages and then remove - * the pair. If the offpage page is 0, then you can just remove the pair. + * PUBLIC: int __ham_del_pair __P((DBC *, int)); */ int -__ham_del_pair(hashp, cursorp, reclaim_page) - HTAB *hashp; - HASH_CURSOR *cursorp; +__ham_del_pair(dbc, reclaim_page) + DBC *dbc; int reclaim_page; { + DB *dbp; + HASH_CURSOR *hcp; DBT data_dbt, key_dbt; DB_ENV *dbenv; DB_LSN new_lsn, *n_lsn, tmp_lsn; @@ -557,13 +612,16 @@ __ham_del_pair(hashp, cursorp, reclaim_page) db_pgno_t chg_pgno, pgno; int ret, tret; - dbenv = hashp->dbp->dbenv; - ndx = cursorp->bndx; - if (cursorp->pagep == NULL && (ret = - __ham_get_page(hashp->dbp, cursorp->pgno, &cursorp->pagep)) != 0) + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + dbenv = dbp->dbenv; + ndx = hcp->bndx; + if (hcp->pagep == NULL && + (ret = __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0) return (ret); - p = cursorp->pagep; + p = hcp->pagep; /* * We optimize for the normal case which is when neither the key nor @@ -576,7 +634,7 @@ __ham_del_pair(hashp, cursorp, reclaim_page) if (HPAGE_PTYPE(H_PAIRKEY(p, ndx)) == H_OFFPAGE) { memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_KEYINDEX(ndx))), sizeof(db_pgno_t)); - ret = __db_doff(hashp->dbp, pgno, __ham_del_page); + ret = __db_doff(dbc, pgno, __ham_del_page); } if (ret == 0) @@ -585,14 +643,14 @@ __ham_del_pair(hashp, cursorp, reclaim_page) memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))), sizeof(db_pgno_t)); - ret = __db_doff(hashp->dbp, pgno, __ham_del_page); + ret = __db_doff(dbc, pgno, __ham_del_page); break; case H_OFFDUP: memcpy(&pgno, HOFFDUP_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))), sizeof(db_pgno_t)); - ret = __db_ddup(hashp->dbp, pgno, __ham_del_page); - F_CLR(cursorp, H_ISDUP); + ret = __db_ddup(dbc, pgno, __ham_del_page); + F_CLR(hcp, 
H_ISDUP); break; case H_DUPLICATE: /* @@ -600,7 +658,7 @@ __ham_del_pair(hashp, cursorp, reclaim_page) * we had better clear the flag so that we update the * cursor appropriately. */ - F_CLR(cursorp, H_ISDUP); + F_CLR(hcp, H_ISDUP); break; } @@ -608,17 +666,17 @@ __ham_del_pair(hashp, cursorp, reclaim_page) return (ret); /* Now log the delete off this page. */ - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(dbc)) { key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx)); key_dbt.size = - LEN_HITEM(p, hashp->hdr->pagesize, H_KEYINDEX(ndx)); + LEN_HITEM(p, hcp->hdr->pagesize, H_KEYINDEX(ndx)); data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx)); data_dbt.size = - LEN_HITEM(p, hashp->hdr->pagesize, H_DATAINDEX(ndx)); + LEN_HITEM(p, hcp->hdr->pagesize, H_DATAINDEX(ndx)); if ((ret = __ham_insdel_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPAIR, - hashp->dbp->log_fileid, PGNO(p), (u_int32_t)ndx, + dbc->txn, &new_lsn, 0, DELPAIR, + dbp->log_fileid, PGNO(p), (u_int32_t)ndx, &LSN(p), &key_dbt, &data_dbt)) != 0) return (ret); @@ -626,15 +684,16 @@ __ham_del_pair(hashp, cursorp, reclaim_page) LSN(p) = new_lsn; } - __ham_dpair(hashp->dbp, p, ndx); + __ham_dpair(dbp, p, ndx); /* - * If we are locking, we will not maintain this. - * XXXX perhaps we can retain incremental numbers and apply them + * If we are locking, we will not maintain this, because it is + * a hot spot. + * XXX perhaps we can retain incremental numbers and apply them * later. */ - if (!F_ISSET(hashp->dbp, DB_AM_LOCKING)) - --hashp->hdr->nelem; + if (!F_ISSET(dbp, DB_AM_LOCKING)) + --hcp->hdr->nelem; /* * If we need to reclaim the page, then check if the page is empty. @@ -653,25 +712,25 @@ __ham_del_pair(hashp, cursorp, reclaim_page) * are more pages in the chain. 
*/ if ((ret = - __ham_get_page(hashp->dbp, NEXT_PGNO(p), &n_pagep)) != 0) + __ham_get_page(dbp, NEXT_PGNO(p), &n_pagep)) != 0) return (ret); if (NEXT_PGNO(n_pagep) != PGNO_INVALID) { if ((ret = - __ham_get_page(hashp->dbp, NEXT_PGNO(n_pagep), + __ham_get_page(dbp, NEXT_PGNO(n_pagep), &nn_pagep)) != 0) { - (void) __ham_put_page(hashp->dbp, n_pagep, 0); + (void) __ham_put_page(dbp, n_pagep, 0); return (ret); } } - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(dbc)) { key_dbt.data = n_pagep; - key_dbt.size = hashp->hdr->pagesize; + key_dbt.size = hcp->hdr->pagesize; if ((ret = __ham_copypage_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, PGNO(p), &LSN(p), - PGNO(n_pagep), &LSN(n_pagep), NEXT_PGNO(n_pagep), + dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(p), + &LSN(p), PGNO(n_pagep), &LSN(n_pagep), + NEXT_PGNO(n_pagep), NEXT_PGNO(n_pagep) == PGNO_INVALID ? NULL : &LSN(nn_pagep), &key_dbt)) != 0) return (ret); @@ -684,12 +743,12 @@ __ham_del_pair(hashp, cursorp, reclaim_page) } if (NEXT_PGNO(n_pagep) != PGNO_INVALID) { PREV_PGNO(nn_pagep) = PGNO(p); - (void)__ham_put_page(hashp->dbp, nn_pagep, 1); + (void)__ham_put_page(dbp, nn_pagep, 1); } tmp_pgno = PGNO(p); tmp_lsn = LSN(p); - memcpy(p, n_pagep, hashp->hdr->pagesize); + memcpy(p, n_pagep, hcp->hdr->pagesize); PGNO(p) = tmp_pgno; LSN(p) = tmp_lsn; PREV_PGNO(p) = PGNO_INVALID; @@ -697,25 +756,25 @@ __ham_del_pair(hashp, cursorp, reclaim_page) /* * Cursor is advanced to the beginning of the next page. 
*/ - cursorp->bndx = 0; - cursorp->pgno = PGNO(p); - F_SET(cursorp, H_DELETED); + hcp->bndx = 0; + hcp->pgno = PGNO(p); + F_SET(hcp, H_DELETED); chg_pgno = PGNO(p); - if ((ret = __ham_dirty_page(hashp, p)) != 0 || - (ret = __ham_del_page(hashp->dbp, n_pagep)) != 0) + if ((ret = __ham_dirty_page(dbp, p)) != 0 || + (ret = __ham_del_page(dbc, n_pagep)) != 0) return (ret); } else if (reclaim_page && NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) { PAGE *n_pagep, *p_pagep; if ((ret = - __ham_get_page(hashp->dbp, PREV_PGNO(p), &p_pagep)) != 0) + __ham_get_page(dbp, PREV_PGNO(p), &p_pagep)) != 0) return (ret); if (NEXT_PGNO(p) != PGNO_INVALID) { - if ((ret = __ham_get_page(hashp->dbp, + if ((ret = __ham_get_page(dbp, NEXT_PGNO(p), &n_pagep)) != 0) { - (void)__ham_put_page(hashp->dbp, p_pagep, 0); + (void)__ham_put_page(dbp, p_pagep, 0); return (ret); } n_lsn = &LSN(n_pagep); @@ -728,10 +787,10 @@ __ham_del_pair(hashp, cursorp, reclaim_page) if (n_pagep != NULL) PREV_PGNO(n_pagep) = PGNO(p_pagep); - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(dbc)) { if ((ret = __ham_newpage_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELOVFL, - hashp->dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep), + dbc->txn, &new_lsn, 0, DELOVFL, + dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep), PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0) return (ret); @@ -741,21 +800,21 @@ __ham_del_pair(hashp, cursorp, reclaim_page) LSN(n_pagep) = new_lsn; LSN(p) = new_lsn; } - cursorp->pgno = NEXT_PGNO(p); - cursorp->bndx = 0; + hcp->pgno = NEXT_PGNO(p); + hcp->bndx = 0; /* * Since we are about to delete the cursor page and we have * just moved the cursor, we need to make sure that the * old page pointer isn't left hanging around in the cursor. 
*/ - cursorp->pagep = NULL; + hcp->pagep = NULL; chg_pgno = PGNO(p); - ret = __ham_del_page(hashp->dbp, p); - if ((tret = __ham_put_page(hashp->dbp, p_pagep, 1)) != 0 && + ret = __ham_del_page(dbc, p); + if ((tret = __ham_put_page(dbp, p_pagep, 1)) != 0 && ret == 0) ret = tret; if (n_pagep != NULL && - (tret = __ham_put_page(hashp->dbp, n_pagep, 1)) != 0 && + (tret = __ham_put_page(dbp, n_pagep, 1)) != 0 && ret == 0) ret = tret; if (ret != 0) @@ -766,19 +825,19 @@ __ham_del_pair(hashp, cursorp, reclaim_page) * so that we update the cursor correctly on the next call * to next. */ - F_SET(cursorp, H_DELETED); - chg_pgno = cursorp->pgno; - ret = __ham_dirty_page(hashp, p); + F_SET(hcp, H_DELETED); + chg_pgno = hcp->pgno; + ret = __ham_dirty_page(dbp, p); } - __ham_c_update(cursorp, chg_pgno, 0, 0, 0); + __ham_c_update(hcp, chg_pgno, 0, 0, 0); /* * Since we just deleted a pair from the master page, anything - * in cursorp->dpgno should be cleared. + * in hcp->dpgno should be cleared. */ - cursorp->dpgno = PGNO_INVALID; + hcp->dpgno = PGNO_INVALID; - F_CLR(cursorp, H_OK); + F_CLR(hcp, H_OK); return (ret); } @@ -787,15 +846,16 @@ __ham_del_pair(hashp, cursorp, reclaim_page) * Given the key data indicated by the cursor, replace part/all of it * according to the fields in the dbt. * - * PUBLIC: int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); + * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t)); */ int -__ham_replpair(hashp, hcp, dbt, make_dup) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_replpair(dbc, dbt, make_dup) + DBC *dbc; DBT *dbt; u_int32_t make_dup; { + DB *dbp; + HASH_CURSOR *hcp; DBT old_dbt, tdata, tmp; DB_LSN new_lsn; int32_t change; /* XXX: Possible overflow. */ @@ -814,6 +874,8 @@ __ham_replpair(hashp, hcp, dbt, make_dup) * be the common case). We handle case 3 as a delete and * add. 
*/ + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; /* * We need to compute the number of bytes that we are adding or @@ -833,7 +895,7 @@ __ham_replpair(hashp, hcp, dbt, make_dup) memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); else len = LEN_HKEYDATA(hcp->pagep, - hashp->dbp->pgsize, H_DATAINDEX(hcp->bndx)); + dbp->pgsize, H_DATAINDEX(hcp->bndx)); if (dbt->doff + dbt->dlen > len) change += dbt->doff + dbt->dlen - len; @@ -854,41 +916,39 @@ __ham_replpair(hashp, hcp, dbt, make_dup) tmp.flags = 0; F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL); if ((ret = - __db_ret(hashp->dbp, hcp->pagep, H_KEYINDEX(hcp->bndx), - &tmp, &hcp->big_key, &hcp->big_keylen)) != 0) + __db_ret(dbp, hcp->pagep, H_KEYINDEX(hcp->bndx), + &tmp, &dbc->rkey.data, &dbc->rkey.size)) != 0) return (ret); if (dbt->doff == 0 && dbt->dlen == len) { - ret = __ham_del_pair(hashp, hcp, 0); + ret = __ham_del_pair(dbc, 0); if (ret == 0) - ret = __ham_add_el(hashp, - hcp, &tmp, dbt, H_KEYDATA); + ret = __ham_add_el(dbc, &tmp, dbt, H_KEYDATA); } else { /* Case B */ type = HPAGE_PTYPE(hk) != H_OFFPAGE ? HPAGE_PTYPE(hk) : H_KEYDATA; tdata.flags = 0; F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL); - if ((ret = __db_ret(hashp->dbp, hcp->pagep, - H_DATAINDEX(hcp->bndx), &tdata, &hcp->big_data, - &hcp->big_datalen)) != 0) + if ((ret = __db_ret(dbp, hcp->pagep, + H_DATAINDEX(hcp->bndx), &tdata, &dbc->rdata.data, + &dbc->rdata.size)) != 0) goto err; /* Now we can delete the item. */ - if ((ret = __ham_del_pair(hashp, hcp, 0)) != 0) { - __db_free(tdata.data); + if ((ret = __ham_del_pair(dbc, 0)) != 0) { + __os_free(tdata.data, tdata.size); goto err; } /* Now shift old data around to make room for new. 
*/ if (change > 0) { - tdata.data = (void *)__db_realloc(tdata.data, - tdata.size + change); + if ((ret = __os_realloc(&tdata.data, + tdata.size + change)) != 0) + return (ret); memset((u_int8_t *)tdata.data + tdata.size, 0, change); } - if (tdata.data == NULL) - return (ENOMEM); end = (u_int8_t *)tdata.data + tdata.size; src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen; @@ -902,10 +962,10 @@ __ham_replpair(hashp, hcp, dbt, make_dup) tdata.size += change; /* Now add the pair. */ - ret = __ham_add_el(hashp, hcp, &tmp, &tdata, type); - __db_free(tdata.data); + ret = __ham_add_el(dbc, &tmp, &tdata, type); + __os_free(tdata.data, tdata.size); } -err: __db_free(tmp.data); +err: __os_free(tmp.data, tmp.size); return (ret); } @@ -921,12 +981,11 @@ err: __db_free(tmp.data); * all the parameters here. Then log the call before moving * anything around. */ - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(dbc)) { old_dbt.data = beg; old_dbt.size = dbt->dlen; - if ((ret = __ham_replace_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, PGNO(hcp->pagep), + if ((ret = __ham_replace_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(hcp->pagep), (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep), (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0) return (ret); @@ -934,7 +993,7 @@ err: __db_free(tmp.data); LSN(hcp->pagep) = new_lsn; /* Structure assignment. 
*/ } - __ham_onpage_replace(hcp->pagep, hashp->dbp->pgsize, + __ham_onpage_replace(hcp->pagep, dbp->pgsize, (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt); return (0); @@ -997,13 +1056,15 @@ __ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt) } /* - * PUBLIC: int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t)); + * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t)); */ int -__ham_split_page(hashp, obucket, nbucket) - HTAB *hashp; +__ham_split_page(dbc, obucket, nbucket) + DBC *dbc; u_int32_t obucket, nbucket; { + DB *dbp; + HASH_CURSOR *hcp; DBT key, page_dbt; DB_ENV *dbenv; DB_LSN new_lsn; @@ -1014,33 +1075,34 @@ __ham_split_page(hashp, obucket, nbucket) int ret, tret; void *big_buf; - dbenv = hashp->dbp->dbenv; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + dbenv = dbp->dbenv; temp_pagep = old_pagep = new_pagep = NULL; - bucket_pgno = BUCKET_TO_PAGE(hashp, obucket); - if ((ret = __ham_get_page(hashp->dbp, bucket_pgno, &old_pagep)) != 0) + bucket_pgno = BUCKET_TO_PAGE(hcp, obucket); + if ((ret = __ham_get_page(dbp, bucket_pgno, &old_pagep)) != 0) return (ret); - if ((ret = __ham_new_page(hashp, BUCKET_TO_PAGE(hashp, nbucket), P_HASH, + if ((ret = __ham_new_page(dbp, BUCKET_TO_PAGE(hcp, nbucket), P_HASH, &new_pagep)) != 0) goto err; - temp_pagep = hashp->split_buf; - memcpy(temp_pagep, old_pagep, hashp->hdr->pagesize); + temp_pagep = hcp->split_buf; + memcpy(temp_pagep, old_pagep, hcp->hdr->pagesize); - if (DB_LOGGING(hashp->dbp)) { - page_dbt.size = hashp->hdr->pagesize; + if (DB_LOGGING(dbc)) { + page_dbt.size = hcp->hdr->pagesize; page_dbt.data = old_pagep; if ((ret = __ham_splitdata_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, SPLITOLD, PGNO(old_pagep), - &page_dbt, &LSN(old_pagep))) != 0) + dbc->txn, &new_lsn, 0, dbp->log_fileid, SPLITOLD, + PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0) goto err; } - P_INIT(old_pagep, hashp->hdr->pagesize, PGNO(old_pagep), 
PGNO_INVALID, + P_INIT(old_pagep, hcp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID, PGNO_INVALID, 0, P_HASH); - if (DB_LOGGING(hashp->dbp)) + if (DB_LOGGING(dbc)) LSN(old_pagep) = new_lsn; /* Structure assignment. */ big_len = 0; @@ -1049,11 +1111,11 @@ __ham_split_page(hashp, obucket, nbucket) while (temp_pagep != NULL) { for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) { if ((ret = - __db_ret(hashp->dbp, temp_pagep, H_KEYINDEX(n), + __db_ret(dbp, temp_pagep, H_KEYINDEX(n), &key, &big_buf, &big_len)) != 0) goto err; - if (__ham_call_hash(hashp, key.data, key.size) + if (__ham_call_hash(hcp, key.data, key.size) == obucket) pp = &old_pagep; else @@ -1064,59 +1126,59 @@ __ham_split_page(hashp, obucket, nbucket) * page to store the key/data pair. */ - len = LEN_HITEM(temp_pagep, hashp->hdr->pagesize, + len = LEN_HITEM(temp_pagep, hcp->hdr->pagesize, H_DATAINDEX(n)) + - LEN_HITEM(temp_pagep, hashp->hdr->pagesize, + LEN_HITEM(temp_pagep, hcp->hdr->pagesize, H_KEYINDEX(n)) + 2 * sizeof(db_indx_t); if (P_FREESPACE(*pp) < len) { - if (DB_LOGGING(hashp->dbp)) { - page_dbt.size = hashp->hdr->pagesize; + if (DB_LOGGING(dbc)) { + page_dbt.size = hcp->hdr->pagesize; page_dbt.data = *pp; if ((ret = __ham_splitdata_log( - dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, - &new_lsn, 0, - hashp->dbp->log_fileid, SPLITNEW, - PGNO(*pp), &page_dbt, + dbenv->lg_info, dbc->txn, + &new_lsn, 0, dbp->log_fileid, + SPLITNEW, PGNO(*pp), &page_dbt, &LSN(*pp))) != 0) goto err; LSN(*pp) = new_lsn; } - if ((ret = __ham_add_ovflpage(hashp, - *pp, 1, pp)) != 0) + if ((ret = + __ham_add_ovflpage(dbc, *pp, 1, pp)) != 0) goto err; } - __ham_copy_item(hashp, temp_pagep, H_KEYINDEX(n), *pp); - __ham_copy_item(hashp, temp_pagep, H_DATAINDEX(n), *pp); + __ham_copy_item(dbp->pgsize, + temp_pagep, H_KEYINDEX(n), *pp); + __ham_copy_item(dbp->pgsize, + temp_pagep, H_DATAINDEX(n), *pp); } next_pgno = NEXT_PGNO(temp_pagep); /* Clear temp_page; if it's a link overflow page, free it. 
*/ if (PGNO(temp_pagep) != bucket_pgno && (ret = - __ham_del_page(hashp->dbp, temp_pagep)) != 0) + __ham_del_page(dbc, temp_pagep)) != 0) goto err; if (next_pgno == PGNO_INVALID) temp_pagep = NULL; else if ((ret = - __ham_get_page(hashp->dbp, next_pgno, &temp_pagep)) != 0) + __ham_get_page(dbp, next_pgno, &temp_pagep)) != 0) goto err; - if (temp_pagep != NULL && DB_LOGGING(hashp->dbp)) { - page_dbt.size = hashp->hdr->pagesize; + if (temp_pagep != NULL && DB_LOGGING(dbc)) { + page_dbt.size = hcp->hdr->pagesize; page_dbt.data = temp_pagep; if ((ret = __ham_splitdata_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, SPLITOLD, PGNO(temp_pagep), + dbc->txn, &new_lsn, 0, dbp->log_fileid, + SPLITOLD, PGNO(temp_pagep), &page_dbt, &LSN(temp_pagep))) != 0) goto err; LSN(temp_pagep) = new_lsn; } } if (big_buf != NULL) - __db_free(big_buf); + __os_free(big_buf, big_len); /* * If the original bucket spanned multiple pages, then we've got @@ -1124,42 +1186,41 @@ __ham_split_page(hashp, obucket, nbucket) * should be deleted. */ if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno && - (ret = __ham_del_page(hashp->dbp, temp_pagep)) != 0) + (ret = __ham_del_page(dbc, temp_pagep)) != 0) goto err; /* * Write new buckets out. 
*/ - if (DB_LOGGING(hashp->dbp)) { - page_dbt.size = hashp->hdr->pagesize; + if (DB_LOGGING(dbc)) { + page_dbt.size = hcp->hdr->pagesize; page_dbt.data = old_pagep; if ((ret = __ham_splitdata_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, SPLITNEW, PGNO(old_pagep), + dbc->txn, &new_lsn, 0, dbp->log_fileid, + SPLITNEW, PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0) goto err; LSN(old_pagep) = new_lsn; page_dbt.data = new_pagep; if ((ret = __ham_splitdata_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, - hashp->dbp->log_fileid, SPLITNEW, PGNO(new_pagep), - &page_dbt, &LSN(new_pagep))) != 0) + dbc->txn, &new_lsn, 0, dbp->log_fileid, + SPLITNEW, PGNO(new_pagep), &page_dbt, &LSN(new_pagep))) != 0) goto err; LSN(new_pagep) = new_lsn; } - ret = __ham_put_page(hashp->dbp, old_pagep, 1); - if ((tret = __ham_put_page(hashp->dbp, new_pagep, 1)) != 0 && + ret = __ham_put_page(dbp, old_pagep, 1); + if ((tret = __ham_put_page(dbp, new_pagep, 1)) != 0 && ret == 0) ret = tret; if (0) { err: if (old_pagep != NULL) - (void)__ham_put_page(hashp->dbp, old_pagep, 1); + (void)__ham_put_page(dbp, old_pagep, 1); if (new_pagep != NULL) - (void)__ham_put_page(hashp->dbp, new_pagep, 1); + (void)__ham_put_page(dbp, new_pagep, 1); if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno) - (void)__ham_put_page(hashp->dbp, temp_pagep, 1); + (void)__ham_put_page(dbp, temp_pagep, 1); } return (ret); } @@ -1171,16 +1232,16 @@ err: if (old_pagep != NULL) * to which we just added something. This allows us to link overflow * pages and return the new page having correctly put the last page. 
* - * PUBLIC: int __ham_add_el - * PUBLIC: __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *, int)); + * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, int)); */ int -__ham_add_el(hashp, hcp, key, val, type) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_add_el(dbc, key, val, type) + DBC *dbc; const DBT *key, *val; int type; { + DB *dbp; + HASH_CURSOR *hcp; const DBT *pkey, *pdata; DBT key_dbt, data_dbt; DB_LSN new_lsn; @@ -1190,17 +1251,19 @@ __ham_add_el(hashp, hcp, key, val, type) int do_expand, is_keybig, is_databig, ret; int key_type, data_type; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; do_expand = 0; - if (hcp->pagep == NULL && (ret = __ham_get_page(hashp->dbp, + if (hcp->pagep == NULL && (ret = __ham_get_page(dbp, hcp->seek_found_page != PGNO_INVALID ? hcp->seek_found_page : hcp->pgno, &hcp->pagep)) != 0) return (ret); key_size = HKEYDATA_PSIZE(key->size); data_size = HKEYDATA_PSIZE(val->size); - is_keybig = ISBIG(hashp, key->size); - is_databig = ISBIG(hashp, val->size); + is_keybig = ISBIG(hcp, key->size); + is_databig = ISBIG(hcp, val->size); if (is_keybig) key_size = HOFFPAGE_PSIZE; if (is_databig) @@ -1220,7 +1283,7 @@ __ham_add_el(hashp, hcp, key, val, type) break; next_pgno = NEXT_PGNO(hcp->pagep); if ((ret = - __ham_next_cpage(hashp, hcp, next_pgno, 0, 0)) != 0) + __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0) return (ret); } @@ -1229,7 +1292,7 @@ __ham_add_el(hashp, hcp, key, val, type) */ if (P_FREESPACE(hcp->pagep) < pairsize) { do_expand = 1; - if ((ret = __ham_add_ovflpage(hashp, + if ((ret = __ham_add_ovflpage(dbc, hcp->pagep, 1, &hcp->pagep)) != 0) return (ret); hcp->pgno = PGNO(hcp->pagep); @@ -1241,10 +1304,13 @@ __ham_add_el(hashp, hcp, key, val, type) hcp->bndx = H_NUMPAIRS(hcp->pagep); F_CLR(hcp, H_DELETED); if (is_keybig) { - if ((ret = __db_poff(hashp->dbp, + koff.type = H_OFFPAGE; + UMRW(koff.unused[0]); + UMRW(koff.unused[1]); + UMRW(koff.unused[2]); + if ((ret = __db_poff(dbc, key, &koff.pgno, 
__ham_overflow_page)) != 0) return (ret); - koff.type = H_OFFPAGE; koff.tlen = key->size; key_dbt.data = &koff; key_dbt.size = sizeof(koff); @@ -1256,10 +1322,13 @@ __ham_add_el(hashp, hcp, key, val, type) } if (is_databig) { - if ((ret = __db_poff(hashp->dbp, + doff.type = H_OFFPAGE; + UMRW(doff.unused[0]); + UMRW(doff.unused[1]); + UMRW(doff.unused[2]); + if ((ret = __db_poff(dbc, val, &doff.pgno, __ham_overflow_page)) != 0) return (ret); - doff.type = H_OFFPAGE; doff.tlen = val->size; data_dbt.data = &doff; data_dbt.size = sizeof(doff); @@ -1270,16 +1339,16 @@ __ham_add_el(hashp, hcp, key, val, type) data_type = type; } - if (DB_LOGGING(hashp->dbp)) { + if (DB_LOGGING(dbc)) { rectype = PUTPAIR; if (is_databig) rectype |= PAIR_DATAMASK; if (is_keybig) rectype |= PAIR_KEYMASK; - if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype, - hashp->dbp->log_fileid, PGNO(hcp->pagep), + if ((ret = __ham_insdel_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, rectype, + dbp->log_fileid, PGNO(hcp->pagep), (u_int32_t)H_NUMPAIRS(hcp->pagep), &LSN(hcp->pagep), pkey, pdata)) != 0) return (ret); @@ -1303,11 +1372,11 @@ __ham_add_el(hashp, hcp, key, val, type) /* * XXX Maybe keep incremental numbers here */ - if (!F_ISSET(hashp->dbp, DB_AM_LOCKING)) - hashp->hdr->nelem++; + if (!F_ISSET(dbp, DB_AM_LOCKING)) + hcp->hdr->nelem++; - if (do_expand || (hashp->hdr->ffactor != 0 && - (u_int32_t)H_NUMPAIRS(hcp->pagep) > hashp->hdr->ffactor)) + if (do_expand || (hcp->hdr->ffactor != 0 && + (u_int32_t)H_NUMPAIRS(hcp->pagep) > hcp->hdr->ffactor)) F_SET(hcp, H_EXPAND); return (0); } @@ -1319,11 +1388,11 @@ __ham_add_el(hashp, hcp, key, val, type) * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we * do not need to do any logging here. 
* - * PUBLIC: void __ham_copy_item __P((HTAB *, PAGE *, u_int32_t, PAGE *)); + * PUBLIC: void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *)); */ void -__ham_copy_item(hashp, src_page, src_ndx, dest_page) - HTAB *hashp; +__ham_copy_item(pgsize, src_page, src_ndx, dest_page) + size_t pgsize; PAGE *src_page; u_int32_t src_ndx; PAGE *dest_page; @@ -1337,7 +1406,7 @@ __ham_copy_item(hashp, src_page, src_ndx, dest_page) src = P_ENTRY(src_page, src_ndx); /* Set up space on dest. */ - len = LEN_HITEM(src_page, hashp->hdr->pagesize, src_ndx); + len = LEN_HITEM(src_page, pgsize, src_ndx); HOFFSET(dest_page) -= len; dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page); dest = P_ENTRY(dest_page, NUM_ENT(dest_page)); @@ -1352,29 +1421,31 @@ __ham_copy_item(hashp, src_page, src_ndx, dest_page) * pointer on success * NULL on error * - * PUBLIC: int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **)); + * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **)); */ int -__ham_add_ovflpage(hashp, pagep, release, pp) - HTAB *hashp; +__ham_add_ovflpage(dbc, pagep, release, pp) + DBC *dbc; PAGE *pagep; int release; PAGE **pp; { - DB_ENV *dbenv; + DB *dbp; + HASH_CURSOR *hcp; DB_LSN new_lsn; PAGE *new_pagep; int ret; - dbenv = hashp->dbp->dbenv; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; - if ((ret = __ham_overflow_page(hashp->dbp, P_HASH, &new_pagep)) != 0) + if ((ret = __ham_overflow_page(dbc, P_HASH, &new_pagep)) != 0) return (ret); - if (DB_LOGGING(hashp->dbp)) { - if ((ret = __ham_newpage_log(dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, PUTOVFL, - hashp->dbp->log_fileid, PGNO(pagep), &LSN(pagep), + if (DB_LOGGING(dbc)) { + if ((ret = __ham_newpage_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, PUTOVFL, + dbp->log_fileid, PGNO(pagep), &LSN(pagep), PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0) return (ret); @@ -1385,78 +1456,76 @@ __ham_add_ovflpage(hashp, pagep, release, pp) PREV_PGNO(new_pagep) = PGNO(pagep); if 
(release) - ret = __ham_put_page(hashp->dbp, pagep, 1); + ret = __ham_put_page(dbp, pagep, 1); - hashp->hash_overflows++; + hcp->stats.hash_overflows++; *pp = new_pagep; return (ret); } /* - * PUBLIC: int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **)); + * PUBLIC: int __ham_new_page __P((DB *, u_int32_t, u_int32_t, PAGE **)); */ int -__ham_new_page(hashp, addr, type, pp) - HTAB *hashp; +__ham_new_page(dbp, addr, type, pp) + DB *dbp; u_int32_t addr, type; PAGE **pp; { PAGE *pagep; int ret; - if ((ret = memp_fget(hashp->dbp->mpf, + if ((ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, &pagep)) != 0) return (ret); -#ifdef DEBUG_SLOW - __account_page(hashp, addr, 1); -#endif /* This should not be necessary because page-in should do it. */ - P_INIT(pagep, - hashp->hdr->pagesize, addr, PGNO_INVALID, PGNO_INVALID, 0, type); + P_INIT(pagep, dbp->pgsize, addr, PGNO_INVALID, PGNO_INVALID, 0, type); *pp = pagep; return (0); } /* - * PUBLIC: int __ham_del_page __P((DB *, PAGE *)); + * PUBLIC: int __ham_del_page __P((DBC *, PAGE *)); */ int -__ham_del_page(dbp, pagep) - DB *dbp; +__ham_del_page(dbc, pagep) + DBC *dbc; PAGE *pagep; { + DB *dbp; + HASH_CURSOR *hcp; DB_LSN new_lsn; - HTAB *hashp; int ret; - hashp = (HTAB *)dbp->internal; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; ret = 0; - DIRTY_META(hashp, ret); + DIRTY_META(dbp, hcp, ret); if (ret != 0) { if (ret != EAGAIN) - __db_err(hashp->dbp->dbenv, + __db_err(dbp->dbenv, "free_ovflpage: unable to lock meta data page %s\n", strerror(ret)); /* * If we are going to return an error, then we should free * the page, so it doesn't stay pinned forever. 
*/ - (void)__ham_put_page(hashp->dbp, pagep, 0); + (void)__ham_put_page(dbp, pagep, 0); return (ret); } - if (DB_LOGGING(hashp->dbp)) { - if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, DELPGNO, - hashp->dbp->log_fileid, PGNO(pagep), hashp->hdr->last_freed, + if (DB_LOGGING(dbc)) { + if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, DELPGNO, + dbp->log_fileid, PGNO(pagep), hcp->hdr->last_freed, (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID, - &LSN(pagep), &hashp->hdr->lsn)) != 0) + &LSN(pagep), &hcp->hdr->lsn)) != 0) return (ret); - hashp->hdr->lsn = new_lsn; + hcp->hdr->lsn = new_lsn; LSN(pagep) = new_lsn; } @@ -1466,16 +1535,16 @@ __ham_del_page(dbp, pagep) DB_LSN __lsn; __pgno = pagep->pgno; __lsn = pagep->lsn; - memset(pagep, 0xff, dbp->pgsize); + memset(pagep, 0xdb, dbp->pgsize); pagep->pgno = __pgno; pagep->lsn = __lsn; } #endif TYPE(pagep) = P_INVALID; - NEXT_PGNO(pagep) = hashp->hdr->last_freed; - hashp->hdr->last_freed = PGNO(pagep); + NEXT_PGNO(pagep) = hcp->hdr->last_freed; + hcp->hdr->last_freed = PGNO(pagep); - return (__ham_put_page(hashp->dbp, pagep, 1)); + return (__ham_put_page(dbp, pagep, 1)); } @@ -1489,8 +1558,7 @@ __ham_put_page(dbp, pagep, is_dirty) int32_t is_dirty; { #ifdef DEBUG_SLOW - __account_page((HTAB *)dbp->cookie, - ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1); + __account_page(dbp, ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1); #endif return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0))); } @@ -1499,14 +1567,14 @@ __ham_put_page(dbp, pagep, is_dirty) * __ham_dirty_page -- * Mark a page dirty. 
* - * PUBLIC: int __ham_dirty_page __P((HTAB *, PAGE *)); + * PUBLIC: int __ham_dirty_page __P((DB *, PAGE *)); */ int -__ham_dirty_page(hashp, pagep) - HTAB *hashp; +__ham_dirty_page(dbp, pagep) + DB *dbp; PAGE *pagep; { - return (memp_fset(hashp->dbp->mpf, pagep, DB_MPOOL_DIRTY)); + return (memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY)); } /* @@ -1523,31 +1591,33 @@ __ham_get_page(dbp, addr, pagep) ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep); #ifdef DEBUG_SLOW if (*pagep != NULL) - __account_page((HTAB *)dbp->internal, addr, 1); + __account_page(dbp, addr, 1); #endif return (ret); } /* - * PUBLIC: int __ham_overflow_page __P((DB *, u_int32_t, PAGE **)); + * PUBLIC: int __ham_overflow_page + * PUBLIC: __P((DBC *, u_int32_t, PAGE **)); */ int -__ham_overflow_page(dbp, type, pp) - DB *dbp; +__ham_overflow_page(dbc, type, pp) + DBC *dbc; u_int32_t type; PAGE **pp; { + DB *dbp; + HASH_CURSOR *hcp; DB_LSN *lsnp, new_lsn; - HTAB *hashp; PAGE *p; db_pgno_t new_addr, next_free, newalloc_flag; u_int32_t offset, splitnum; int ret; - hashp = (HTAB *)dbp->internal; - ret = 0; - DIRTY_META(hashp, ret); + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + DIRTY_META(dbp, hcp, ret); if (ret != 0) return (ret); @@ -1558,22 +1628,22 @@ __ham_overflow_page(dbp, type, pp) * after the log do we get to complete allocation of the * new page. */ - new_addr = hashp->hdr->last_freed; + new_addr = hcp->hdr->last_freed; if (new_addr != PGNO_INVALID) { - if ((ret = __ham_get_page(hashp->dbp, new_addr, &p)) != 0) + if ((ret = __ham_get_page(dbp, new_addr, &p)) != 0) return (ret); next_free = NEXT_PGNO(p); lsnp = &LSN(p); newalloc_flag = 0; } else { - splitnum = hashp->hdr->ovfl_point; - hashp->hdr->spares[splitnum]++; - offset = hashp->hdr->spares[splitnum] - - (splitnum ? 
hashp->hdr->spares[splitnum - 1] : 0); - new_addr = PGNO_OF(hashp, hashp->hdr->ovfl_point, offset); - if (new_addr > MAX_PAGES(hashp)) { - __db_err(hashp->dbp->dbenv, "hash: out of file pages"); - hashp->hdr->spares[splitnum]--; + splitnum = hcp->hdr->ovfl_point; + hcp->hdr->spares[splitnum]++; + offset = hcp->hdr->spares[splitnum] - + (splitnum ? hcp->hdr->spares[splitnum - 1] : 0); + new_addr = PGNO_OF(hcp, hcp->hdr->ovfl_point, offset); + if (new_addr > MAX_PAGES(hcp)) { + __db_err(dbp->dbenv, "hash: out of file pages"); + hcp->hdr->spares[splitnum]--; return (ENOMEM); } next_free = PGNO_INVALID; @@ -1582,29 +1652,29 @@ __ham_overflow_page(dbp, type, pp) newalloc_flag = 1; } - if (DB_LOGGING(hashp->dbp)) { - if ((ret = __ham_newpgno_log(hashp->dbp->dbenv->lg_info, - (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, ALLOCPGNO, - hashp->dbp->log_fileid, new_addr, next_free, - 0, newalloc_flag, type, lsnp, &hashp->hdr->lsn)) != 0) + if (DB_LOGGING(dbc)) { + if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, ALLOCPGNO, + dbp->log_fileid, new_addr, next_free, + 0, newalloc_flag, type, lsnp, &hcp->hdr->lsn)) != 0) return (ret); - hashp->hdr->lsn = new_lsn; + hcp->hdr->lsn = new_lsn; if (lsnp != NULL) *lsnp = new_lsn; } if (p != NULL) { /* We just took something off the free list, initialize it. */ - hashp->hdr->last_freed = next_free; - P_INIT(p, hashp->hdr->pagesize, PGNO(p), PGNO_INVALID, + hcp->hdr->last_freed = next_free; + P_INIT(p, hcp->hdr->pagesize, PGNO(p), PGNO_INVALID, PGNO_INVALID, 0, (u_int8_t)type); } else { /* Get the new page. 
*/ - if ((ret = __ham_new_page(hashp, new_addr, type, &p)) != 0) + if ((ret = __ham_new_page(dbp, new_addr, type, &p)) != 0) return (ret); } - if (DB_LOGGING(hashp->dbp)) + if (DB_LOGGING(dbc)) LSN(p) = new_lsn; *pp = p; @@ -1614,94 +1684,123 @@ __ham_overflow_page(dbp, type, pp) #ifdef DEBUG /* * PUBLIC: #ifdef DEBUG - * PUBLIC: db_pgno_t __bucket_to_page __P((HTAB *, db_pgno_t)); + * PUBLIC: db_pgno_t __bucket_to_page __P((HASH_CURSOR *, db_pgno_t)); * PUBLIC: #endif */ db_pgno_t -__bucket_to_page(hashp, n) - HTAB *hashp; +__bucket_to_page(hcp, n) + HASH_CURSOR *hcp; db_pgno_t n; { int ret_val; ret_val = n + 1; if (n != 0) - ret_val += hashp->hdr->spares[__db_log2(n + 1) - 1]; + ret_val += hcp->hdr->spares[__db_log2(n + 1) - 1]; return (ret_val); } #endif /* * Create a bunch of overflow pages at the current split point. - * PUBLIC: void __ham_init_ovflpages __P((HTAB *)); + * PUBLIC: void __ham_init_ovflpages __P((DBC *)); */ void -__ham_init_ovflpages(hp) - HTAB *hp; +__ham_init_ovflpages(dbc) + DBC *dbc; { + DB *dbp; + HASH_CURSOR *hcp; DB_LSN new_lsn; PAGE *p; db_pgno_t last_pgno, new_pgno; u_int32_t i, curpages, numpages; - curpages = hp->hdr->spares[hp->hdr->ovfl_point] - - hp->hdr->spares[hp->hdr->ovfl_point - 1]; - numpages = hp->hdr->ovfl_point + 1 - curpages; - - last_pgno = hp->hdr->last_freed; - new_pgno = PGNO_OF(hp, hp->hdr->ovfl_point, curpages + 1); - if (DB_LOGGING(hp->dbp)) { - (void)__ham_ovfl_log(hp->dbp->dbenv->lg_info, - (DB_TXN *)hp->dbp->txn, &new_lsn, 0, - hp->dbp->log_fileid, new_pgno, - numpages, last_pgno, hp->hdr->ovfl_point, &hp->hdr->lsn); - hp->hdr->lsn = new_lsn; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + curpages = hcp->hdr->spares[hcp->hdr->ovfl_point] - + hcp->hdr->spares[hcp->hdr->ovfl_point - 1]; + numpages = hcp->hdr->ovfl_point + 1 - curpages; + + last_pgno = hcp->hdr->last_freed; + new_pgno = PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + 1); + if (DB_LOGGING(dbc)) { + 
(void)__ham_ovfl_log(dbp->dbenv->lg_info, + dbc->txn, &new_lsn, 0, dbp->log_fileid, new_pgno, + numpages, last_pgno, hcp->hdr->ovfl_point, &hcp->hdr->lsn); + hcp->hdr->lsn = new_lsn; } else ZERO_LSN(new_lsn); - hp->hdr->spares[hp->hdr->ovfl_point] += numpages; + hcp->hdr->spares[hcp->hdr->ovfl_point] += numpages; for (i = numpages; i > 0; i--) { - if (__ham_new_page(hp, - PGNO_OF(hp, hp->hdr->ovfl_point, curpages + i), + if (__ham_new_page(dbp, + PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + i), P_INVALID, &p) != 0) break; LSN(p) = new_lsn; NEXT_PGNO(p) = last_pgno; last_pgno = PGNO(p); - (void)__ham_put_page(hp->dbp, p, 1); + (void)__ham_put_page(dbp, p, 1); } - hp->hdr->last_freed = last_pgno; + hcp->hdr->last_freed = last_pgno; } /* - * PUBLIC: int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); + * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t)); */ int -__ham_get_cpage(hashp, hcp, mode) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_get_cpage(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + DB *dbp; + HASH_CURSOR *hcp; int ret; - if (hcp->lock == 0 && F_ISSET(hashp->dbp, DB_AM_LOCKING) && - (ret = __ham_lock_bucket(hashp->dbp, hcp, mode)) != 0) - return (ret); + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + /* + * There are three cases with respect to buckets and locks. If there + * is no lock held, then if we are locking, we should get the lock. + * If there is a lock held and it's for the current bucket, we don't + * need to do anything. If there is a lock, but it's for a different + * bucket, then we need to release and get. + */ + if (F_ISSET(dbp, DB_AM_LOCKING)) { + if (hcp->lock != 0 && hcp->lbucket != hcp->bucket) { + /* + * If this is the original lock, don't release it, + * because we may need to restore it upon exit. 
+ */ + if (dbc->txn == NULL && + !F_ISSET(hcp, H_ORIGINAL) && (ret = + lock_put(dbp->dbenv->lk_info, hcp->lock)) != 0) + return (ret); + F_CLR(hcp, H_ORIGINAL); + hcp->lock = 0; + } + if (hcp->lock == 0 && (ret = __ham_lock_bucket(dbc, mode)) != 0) + return (ret); + hcp->lbucket = hcp->bucket; + } if (hcp->pagep == NULL) { if (hcp->pgno == PGNO_INVALID) { - hcp->pgno = BUCKET_TO_PAGE(hashp, hcp->bucket); + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); hcp->bndx = 0; } if ((ret = - __ham_get_page(hashp->dbp, hcp->pgno, &hcp->pagep)) != 0) + __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0) return (ret); } if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL) if ((ret = - __ham_get_page(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0) + __ham_get_page(dbp, hcp->dpgno, &hcp->dpagep)) != 0) return (ret); return (0); } @@ -1711,28 +1810,30 @@ __ham_get_cpage(hashp, hcp, mode) * If the flag is set to H_ISDUP, then we are talking about the * duplicate page, not the main page. * - * PUBLIC: int __ham_next_cpage - * PUBLIC: __P((HTAB *, HASH_CURSOR *, db_pgno_t, int, u_int32_t)); + * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t, int, u_int32_t)); */ int -__ham_next_cpage(hashp, hcp, pgno, dirty, flags) - HTAB *hashp; - HASH_CURSOR *hcp; +__ham_next_cpage(dbc, pgno, dirty, flags) + DBC *dbc; db_pgno_t pgno; int dirty; u_int32_t flags; { + DB *dbp; + HASH_CURSOR *hcp; PAGE *p; int ret; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; if (LF_ISSET(H_ISDUP) && hcp->dpagep != NULL && - (ret = __ham_put_page(hashp->dbp, hcp->dpagep, dirty)) != 0) + (ret = __ham_put_page(dbp, hcp->dpagep, dirty)) != 0) return (ret); else if (!LF_ISSET(H_ISDUP) && hcp->pagep != NULL && - (ret = __ham_put_page(hashp->dbp, hcp->pagep, dirty)) != 0) + (ret = __ham_put_page(dbp, hcp->pagep, dirty)) != 0) return (ret); - if ((ret = __ham_get_page(hashp->dbp, pgno, &p)) != 0) + if ((ret = __ham_get_page(dbp, pgno, &p)) != 0) return (ret); if (LF_ISSET(H_ISDUP)) { @@ -1753,22 +1854,21 @@ 
__ham_next_cpage(hashp, hcp, pgno, dirty, flags) * Get the lock on a particular bucket. */ static int -__ham_lock_bucket(dbp, hcp, mode) - DB *dbp; - HASH_CURSOR *hcp; +__ham_lock_bucket(dbc, mode) + DBC *dbc; db_lockmode_t mode; { + HASH_CURSOR *hcp; int ret; - /* - * What a way to trounce on the memory system. It might be - * worth copying the lk_info into the hashp. - */ - ret = 0; - dbp->lock.pgno = (db_pgno_t)(hcp->bucket); - ret = lock_get(dbp->dbenv->lk_info, - dbp->txn == NULL ? dbp->locker : dbp->txn->txnid, 0, - &dbp->lock_dbt, mode, &hcp->lock); + hcp = (HASH_CURSOR *)dbc->internal; + dbc->lock.pgno = (db_pgno_t)(hcp->bucket); + if (dbc->txn == NULL) + ret = lock_get(dbc->dbp->dbenv->lk_info, dbc->locker, 0, + &dbc->lock_dbt, mode, &hcp->lock); + else + ret = lock_tget(dbc->dbp->dbenv->lk_info, dbc->txn, 0, + &dbc->lock_dbt, mode, &hcp->lock); return (ret < 0 ? EAGAIN : ret); } @@ -1827,45 +1927,3 @@ __ham_dpair(dbp, p, pndx) HOFFSET(p) = HOFFSET(p) + delta; NUM_ENT(p) = NUM_ENT(p) - 2; } - -#ifdef DEBUG_SLOW -static void -__account_page(hashp, pgno, inout) - HTAB *hashp; - db_pgno_t pgno; - int inout; -{ - static struct { - db_pgno_t pgno; - int times; - } list[100]; - static int last; - int i, j; - - if (inout == -1) /* XXX: Kluge */ - inout = 0; - - /* Find page in list. */ - for (i = 0; i < last; i++) - if (list[i].pgno == pgno) - break; - /* Not found. 
*/ - if (i == last) { - list[last].times = inout; - list[last].pgno = pgno; - last++; - } - list[i].times = inout; - if (list[i].times == 0) { - for (j = i; j < last; j++) - list[j] = list[j + 1]; - last--; - } - for (i = 0; i < last; i++, list[i].times++) - if (list[i].times > 20 && - !__is_bitmap_pgno(hashp, list[i].pgno)) - (void)fprintf(stderr, - "Warning: pg %lu has been out for %d times\n", - (u_long)list[i].pgno, list[i].times); -} -#endif /* DEBUG_SLOW */ diff --git a/db2/hash/hash_rec.c b/db2/hash/hash_rec.c index 727f615828..b58f2c6eb7 100644 --- a/db2/hash/hash_rec.c +++ b/db2/hash/hash_rec.c @@ -47,7 +47,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)hash_rec.c 10.19 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)hash_rec.c 10.22 (Sleepycat) 10/21/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -80,17 +80,19 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_insdel_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; PAGE *pagep; u_int32_t op; int cmp_n, cmp_p, getmeta, ret; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. */ + hcp = NULL; REC_PRINT(__ham_insdel_print); REC_INTRO(__ham_insdel_read); + hcp = (HASH_CURSOR *)dbc->internal; ret = memp_fget(mpf, &argp->pgno, 0, &pagep); if (ret != 0) { @@ -101,16 +103,15 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. */ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, hcp, ret); + if (ret != 0) + goto out; getmeta = 1; cmp_n = log_compare(lsnp, &LSN(pagep)); @@ -144,7 +145,7 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info) !redo || PAIR_ISDATABIG(argp->opcode) ? 
H_OFFPAGE : H_KEYDATA); } else - (void) __ham_reputpair(pagep, hashp->hdr->pagesize, + (void) __ham_reputpair(pagep, hcp->hdr->pagesize, argp->ndx, &argp->key, &argp->data); LSN(pagep) = redo ? *lsnp : argp->pagelsn; @@ -163,10 +164,11 @@ __ham_insdel_recover(logp, dbtp, lsnp, redo, info) goto out; /* Return the previous LSN. */ - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -187,16 +189,18 @@ __ham_newpage_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_newpage_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; PAGE *pagep; int cmp_n, cmp_p, change, getmeta, ret; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. */ + hcp = NULL; REC_PRINT(__ham_newpage_print); REC_INTRO(__ham_newpage_read); + hcp = (HASH_CURSOR *)dbc->internal; ret = memp_fget(mpf, &argp->new_pgno, 0, &pagep); if (ret != 0) { @@ -214,8 +218,9 @@ __ham_newpage_recover(logp, dbtp, lsnp, redo, info) goto out; } - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; /* @@ -289,11 +294,13 @@ ppage: if (argp->prev_pgno != PGNO_INVALID) { } if (!change) { - if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0) + if ((ret = + __ham_put_page(file_dbp, (PAGE *)pagep, 0)) != 0) goto out; } else { LSN(pagep) = redo ? *lsnp : argp->prevlsn; - if ((ret = __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0) + if ((ret = + __ham_put_page(file_dbp, (PAGE *)pagep, 1)) != 0) goto out; } } @@ -310,9 +317,7 @@ npage: if (argp->next_pgno != PGNO_INVALID) { * so we would not have to undo anything. In * this case, don't bother creating a page. 
*/ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->next_pgno, DB_MPOOL_CREATE, &pagep)) != 0) @@ -346,10 +351,11 @@ npage: if (argp->next_pgno != PGNO_INVALID) { goto out; } } - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -372,19 +378,21 @@ __ham_replace_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_replace_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; DBT dbt; - HTAB *hashp; PAGE *pagep; int32_t grow; int change, cmp_n, cmp_p, getmeta, ret; u_int8_t *hk; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. */ + hcp = NULL; REC_PRINT(__ham_replace_print); REC_INTRO(__ham_replace_read); + hcp = (HASH_CURSOR *)dbc->internal; ret = memp_fget(mpf, &argp->pgno, 0, &pagep); if (ret != 0) { @@ -395,16 +403,15 @@ __ham_replace_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. 
*/ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; cmp_n = log_compare(lsnp, &LSN(pagep)); @@ -444,10 +451,11 @@ __ham_replace_recover(logp, dbtp, lsnp, redo, info) if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0) goto out; - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -468,19 +476,22 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_newpgno_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; PAGE *pagep; int change, cmp_n, cmp_p, getmeta, ret; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. */ + hcp = NULL; REC_PRINT(__ham_newpgno_print); REC_INTRO(__ham_newpgno_read); + hcp = (HASH_CURSOR *)dbc->internal; - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; /* @@ -488,34 +499,34 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info) * to update the meta data; then we need to update the page. * We'll do the meta-data first. */ - cmp_n = log_compare(lsnp, &hashp->hdr->lsn); - cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn); + cmp_n = log_compare(lsnp, &hcp->hdr->lsn); + cmp_p = log_compare(&hcp->hdr->lsn, &argp->metalsn); change = 0; if ((cmp_p == 0 && redo && argp->opcode == ALLOCPGNO) || (cmp_n == 0 && !redo && argp->opcode == DELPGNO)) { /* Need to redo an allocation or undo a deletion. 
*/ - hashp->hdr->last_freed = argp->free_pgno; + hcp->hdr->last_freed = argp->free_pgno; if (redo && argp->old_pgno != 0) /* Must be ALLOCPGNO */ - hashp->hdr->spares[hashp->hdr->ovfl_point]++; + hcp->hdr->spares[hcp->hdr->ovfl_point]++; change = 1; } else if (cmp_p == 0 && redo && argp->opcode == DELPGNO) { /* Need to redo a deletion */ - hashp->hdr->last_freed = argp->pgno; + hcp->hdr->last_freed = argp->pgno; change = 1; } else if (cmp_n == 0 && !redo && argp->opcode == ALLOCPGNO) { /* undo an allocation. */ if (argp->old_pgno == 0) - hashp->hdr->last_freed = argp->pgno; + hcp->hdr->last_freed = argp->pgno; else { - hashp->hdr->spares[hashp->hdr->ovfl_point]--; - hashp->hdr->last_freed = 0; + hcp->hdr->spares[hcp->hdr->ovfl_point]--; + hcp->hdr->last_freed = 0; } change = 1; } if (change) { - hashp->hdr->lsn = redo ? *lsnp : argp->metalsn; - F_SET(file_dbp, DB_HS_DIRTYMETA); + hcp->hdr->lsn = redo ? *lsnp : argp->metalsn; + F_SET(hcp, H_DIRTY); } @@ -530,9 +541,7 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. */ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; @@ -565,10 +574,11 @@ __ham_newpgno_recover(logp, dbtp, lsnp, redo, info) if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0) goto out; - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -590,19 +600,22 @@ __ham_splitmeta_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_splitmeta_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; int change, cmp_n, cmp_p, getmeta, ret; u_int32_t pow; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. 
*/ + hcp = NULL; REC_PRINT(__ham_splitmeta_print); REC_INTRO(__ham_splitmeta_read); + hcp = (HASH_CURSOR *)dbc->internal; - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; /* @@ -610,43 +623,45 @@ __ham_splitmeta_recover(logp, dbtp, lsnp, redo, info) * to update the meta data; then we need to update the page. * We'll do the meta-data first. */ - cmp_n = log_compare(lsnp, &hashp->hdr->lsn); - cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn); + cmp_n = log_compare(lsnp, &hcp->hdr->lsn); + cmp_p = log_compare(&hcp->hdr->lsn, &argp->metalsn); change = 0; if (cmp_p == 0 && redo) { /* Need to redo the split information. */ - hashp->hdr->max_bucket = argp->bucket + 1; - pow = __db_log2(hashp->hdr->max_bucket + 1); - if (pow > hashp->hdr->ovfl_point) { - hashp->hdr->spares[pow] = - hashp->hdr->spares[hashp->hdr->ovfl_point]; - hashp->hdr->ovfl_point = pow; + hcp->hdr->max_bucket = argp->bucket + 1; + pow = __db_log2(hcp->hdr->max_bucket + 1); + if (pow > hcp->hdr->ovfl_point) { + hcp->hdr->spares[pow] = + hcp->hdr->spares[hcp->hdr->ovfl_point]; + hcp->hdr->ovfl_point = pow; } - if (hashp->hdr->max_bucket > hashp->hdr->high_mask) { - hashp->hdr->low_mask = hashp->hdr->high_mask; - hashp->hdr->high_mask = - hashp->hdr->max_bucket | hashp->hdr->low_mask; + if (hcp->hdr->max_bucket > hcp->hdr->high_mask) { + hcp->hdr->low_mask = hcp->hdr->high_mask; + hcp->hdr->high_mask = + hcp->hdr->max_bucket | hcp->hdr->low_mask; } change = 1; } else if (cmp_n == 0 && !redo) { /* Need to undo the split information. 
*/ - hashp->hdr->max_bucket = argp->bucket; - hashp->hdr->ovfl_point = argp->ovflpoint; - hashp->hdr->spares[hashp->hdr->ovfl_point] = argp->spares; - pow = 1 << __db_log2(hashp->hdr->max_bucket + 1); - hashp->hdr->high_mask = pow - 1; - hashp->hdr->low_mask = (pow >> 1) - 1; + hcp->hdr->max_bucket = argp->bucket; + hcp->hdr->ovfl_point = argp->ovflpoint; + hcp->hdr->spares[hcp->hdr->ovfl_point] = argp->spares; + pow = 1 << __db_log2(hcp->hdr->max_bucket + 1); + hcp->hdr->high_mask = pow - 1; + hcp->hdr->low_mask = (pow >> 1) - 1; change = 1; } if (change) { - hashp->hdr->lsn = redo ? *lsnp : argp->metalsn; - F_SET(file_dbp, DB_HS_DIRTYMETA); + hcp->hdr->lsn = redo ? *lsnp : argp->metalsn; + F_SET(hcp, H_DIRTY); } - *lsnp = argp->prev_lsn; + +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -665,16 +680,18 @@ __ham_splitdata_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_splitdata_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; PAGE *pagep; int change, cmp_n, cmp_p, getmeta, ret; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. */ + hcp = NULL; REC_PRINT(__ham_splitdata_print); REC_INTRO(__ham_splitdata_read); + hcp = (HASH_CURSOR *)dbc->internal; ret = memp_fget(mpf, &argp->pgno, 0, &pagep); if (ret != 0) { @@ -685,16 +702,15 @@ __ham_splitdata_recover(logp, dbtp, lsnp, redo, info) * would not have to undo anything. In this case, * don't bother creating a page. 
*/ - *lsnp = argp->prev_lsn; - ret = 0; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; } - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; cmp_n = log_compare(lsnp, &LSN(pagep)); @@ -732,10 +748,11 @@ __ham_splitdata_recover(logp, dbtp, lsnp, redo, info) if ((ret = __ham_put_page(file_dbp, pagep, change)) != 0) goto out; - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -755,50 +772,52 @@ __ham_ovfl_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_ovfl_args *argp; - DB *mdbp, *file_dbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; PAGE *pagep; db_pgno_t max_pgno, pgno; int cmp_n, cmp_p, getmeta, ret; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. */ + hcp = NULL; REC_PRINT(__ham_ovfl_print); REC_INTRO(__ham_ovfl_read); + hcp = (HASH_CURSOR *)dbc->internal; - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; - cmp_n = log_compare(lsnp, &hashp->hdr->lsn); - cmp_p = log_compare(&hashp->hdr->lsn, &argp->metalsn); + cmp_n = log_compare(lsnp, &hcp->hdr->lsn); + cmp_p = log_compare(&hcp->hdr->lsn, &argp->metalsn); if (cmp_p == 0 && redo) { /* Redo the allocation. 
*/ - hashp->hdr->last_freed = argp->start_pgno; - hashp->hdr->spares[argp->ovflpoint] += argp->npages; - hashp->hdr->lsn = *lsnp; - F_SET(file_dbp, DB_HS_DIRTYMETA); + hcp->hdr->last_freed = argp->start_pgno; + hcp->hdr->spares[argp->ovflpoint] += argp->npages; + hcp->hdr->lsn = *lsnp; + F_SET(hcp, H_DIRTY); } else if (cmp_n == 0 && !redo) { - hashp->hdr->last_freed = argp->free_pgno; - hashp->hdr->spares[argp->ovflpoint] -= argp->npages; - hashp->hdr->lsn = argp->metalsn; - F_SET(file_dbp, DB_HS_DIRTYMETA); + hcp->hdr->last_freed = argp->free_pgno; + hcp->hdr->spares[argp->ovflpoint] -= argp->npages; + hcp->hdr->lsn = argp->metalsn; + F_SET(hcp, H_DIRTY); } max_pgno = argp->start_pgno + argp->npages - 1; ret = 0; for (pgno = argp->start_pgno; pgno <= max_pgno; pgno++) { - ret = memp_fget(mpf, &pgno, 0, &pagep); - if (ret != 0) { - if (redo && (ret = memp_fget(mpf, &pgno, - DB_MPOOL_CREATE, &pagep)) != 0) - goto out; - else if (!redo) { - (void)__ham_put_page(file_dbp, pagep, 0); + if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) { + if (!redo) { + ret = 0; continue; } + if ((ret = memp_fget(mpf, + &pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; } if (redo && log_compare((const DB_LSN *)lsnp, (const DB_LSN *)&LSN(pagep)) > 0) { @@ -816,9 +835,11 @@ __ham_ovfl_recover(logp, dbtp, lsnp, redo, info) goto out; } - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; + out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } @@ -838,19 +859,22 @@ __ham_copypage_recover(logp, dbtp, lsnp, redo, info) void *info; { __ham_copypage_args *argp; - DB *file_dbp, *mdbp; + DB *file_dbp; + DBC *dbc; + HASH_CURSOR *hcp; DB_MPOOLFILE *mpf; - HTAB *hashp; PAGE *pagep; int cmp_n, cmp_p, getmeta, modified, ret; getmeta = 0; - hashp = NULL; /* XXX: shut the compiler up. 
*/ + hcp = NULL; REC_PRINT(__ham_copypage_print); REC_INTRO(__ham_copypage_read); + hcp = (HASH_CURSOR *)dbc->internal; - hashp = (HTAB *)file_dbp->internal; - GET_META(file_dbp, hashp); + GET_META(file_dbp, (HASH_CURSOR *)dbc->internal, ret); + if (ret != 0) + goto out; getmeta = 1; modified = 0; @@ -881,7 +905,7 @@ __ham_copypage_recover(logp, dbtp, lsnp, redo, info) modified = 1; } else if (cmp_n == 0 && !redo) { /* Need to undo update described. */ - P_INIT(pagep, hashp->hdr->pagesize, argp->pgno, PGNO_INVALID, + P_INIT(pagep, hcp->hdr->pagesize, argp->pgno, PGNO_INVALID, argp->next_pgno, 0, P_HASH); LSN(pagep) = argp->pagelsn; modified = 1; @@ -918,10 +942,8 @@ donext: ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep); goto out; /* Now fix up the next's next page. */ -do_nn: if (argp->nnext_pgno == PGNO_INVALID) { - *lsnp = argp->prev_lsn; - goto out; - } +do_nn: if (argp->nnext_pgno == PGNO_INVALID) + goto done; ret = memp_fget(mpf, &argp->nnext_pgno, 0, &pagep); if (ret != 0) { @@ -932,9 +954,7 @@ do_nn: if (argp->nnext_pgno == PGNO_INVALID) { * would not have to undo anything. In this case, * don't bother creating a page. */ - ret = 0; - *lsnp = argp->prev_lsn; - goto out; + goto done; } else if ((ret = memp_fget(mpf, &argp->nnext_pgno, DB_MPOOL_CREATE, &pagep)) != 0) goto out; @@ -957,9 +977,10 @@ do_nn: if (argp->nnext_pgno == PGNO_INVALID) { if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; - *lsnp = argp->prev_lsn; +done: *lsnp = argp->prev_lsn; + ret = 0; out: if (getmeta) - RELEASE_META(file_dbp, hashp); + RELEASE_META(file_dbp, hcp); REC_CLOSE; } diff --git a/db2/hash/hash_stat.c b/db2/hash/hash_stat.c index b57ca0950d..1b493d5f40 100644 --- a/db2/hash/hash_stat.c +++ b/db2/hash/hash_stat.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)hash_stat.c 10.8 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)hash_stat.c 10.12 (Sleepycat) 12/19/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -23,35 +23,22 @@ static const char sccsid[] = "@(#)hash_stat.c 10.8 (Sleepycat) 4/26/98"; /* * __ham_stat -- - * Gather/print the hash statistics. + * Gather/print the hash statistics * - * PUBLIC: int __ham_stat __P((DB *, FILE *)); + * PUBLIC: int __ham_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); */ int -__ham_stat(dbp, fp) +__ham_stat(dbp, spp, db_malloc, flags) DB *dbp; - FILE *fp; + void *spp; + void *(*db_malloc) __P((size_t)); + u_int32_t flags; { - HTAB *hashp; - int i; + COMPQUIET(spp, NULL); + COMPQUIET(db_malloc, NULL); + COMPQUIET(flags, 0); - hashp = (HTAB *)dbp->internal; + DB_PANIC_CHECK(dbp); - fprintf(fp, "hash: accesses %lu collisions %lu\n", - hashp->hash_accesses, hashp->hash_collisions); - fprintf(fp, "hash: expansions %lu\n", hashp->hash_expansions); - fprintf(fp, "hash: overflows %lu\n", hashp->hash_overflows); - fprintf(fp, "hash: big key/data pages %lu\n", hashp->hash_bigpages); - - SET_LOCKER(dbp, NULL); - GET_META(dbp, hashp); - fprintf(fp, "keys %lu maxp %lu\n", - (u_long)hashp->hdr->nelem, (u_long)hashp->hdr->max_bucket); - - for (i = 0; i < NCACHED; i++) - fprintf(fp, - "spares[%d] = %lu\n", i, (u_long)hashp->hdr->spares[i]); - - RELEASE_META(dbp, hashp); - return (0); + return (__db_eopnotsup(dbp->dbenv)); } diff --git a/db2/include/btree.h b/db2/include/btree.h index 1660d331e7..b0c04b1508 100644 --- a/db2/include/btree.h +++ 
b/db2/include/btree.h @@ -43,38 +43,19 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)btree.h 10.21 (Sleepycat) 5/23/98 + * @(#)btree.h 10.26 (Sleepycat) 12/16/98 */ /* Forward structure declarations. */ struct __btree; typedef struct __btree BTREE; struct __cursor; typedef struct __cursor CURSOR; struct __epg; typedef struct __epg EPG; -struct __rcursor; typedef struct __rcursor RCURSOR; struct __recno; typedef struct __recno RECNO; -#undef DEFMINKEYPAGE /* Minimum keys per page */ #define DEFMINKEYPAGE (2) -#undef ISINTERNAL /* If an internal page. */ -#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) -#undef ISLEAF /* If a leaf page. */ -#define ISLEAF(p) (TYPE(p) == P_LBTREE || TYPE(p) == P_LRECNO) - -/* Allocate and discard thread structures. */ -#define GETHANDLE(dbp, set_txn, dbpp, ret) { \ - if (F_ISSET(dbp, DB_AM_THREAD)) { \ - if ((ret = __db_gethandle(dbp, __bam_bdup, dbpp)) != 0) \ - return (ret); \ - } else \ - *dbpp = dbp; \ - *dbpp->txn = set_txn; \ -} -#define PUTHANDLE(dbp) { \ - dbp->txn = NULL; \ - if (F_ISSET(dbp, DB_AM_THREAD)) \ - __db_puthandle(dbp); \ -} +#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) +#define ISLEAF(p) (TYPE(p) == P_LBTREE || TYPE(p) == P_LRECNO) /* * If doing transactions we have to hold the locks associated with a data item @@ -82,15 +63,15 @@ struct __recno; typedef struct __recno RECNO; * locks associated with walking the tree. Distinguish between the two so that * we don't tie up the internal pages of the tree longer than necessary. */ -#define __BT_LPUT(dbp, lock) \ - (F_ISSET((dbp), DB_AM_LOCKING) ? \ - lock_put((dbp)->dbenv->lk_info, lock) : 0) -#define __BT_TLPUT(dbp, lock) \ - (F_ISSET((dbp), DB_AM_LOCKING) && (dbp)->txn == NULL ? \ - lock_put((dbp)->dbenv->lk_info, lock) : 0) +#define __BT_LPUT(dbc, lock) \ + (F_ISSET((dbc)->dbp, DB_AM_LOCKING) ? 
\ + lock_put((dbc)->dbp->dbenv->lk_info, lock) : 0) +#define __BT_TLPUT(dbc, lock) \ + (F_ISSET((dbc)->dbp, DB_AM_LOCKING) && (dbc)->txn == NULL ? \ + lock_put((dbc)->dbp->dbenv->lk_info, lock) : 0) /* - * Flags to __bt_search() and __rec_search(). + * Flags to __bam_search() and __bam_rsearch(). * * Note, internal page searches must find the largest record less than key in * the tree so that descents work. Leaf page searches must find the smallest @@ -113,22 +94,19 @@ struct __recno; typedef struct __recno RECNO; #define S_EXACT 0x00400 /* Exact items only. */ #define S_PARENT 0x00800 /* Lock page pair. */ #define S_STACK 0x01000 /* Need a complete stack. */ +#define S_PAST_EOF 0x02000 /* If doing insert search (or keyfirst + * or keylast operations), or a split + * on behalf of an insert, it's okay to + * return an entry one past end-of-page. + */ #define S_DELETE (S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK) #define S_FIND (S_READ | S_DUPFIRST | S_DELNO) -#define S_INSERT (S_WRITE | S_DUPLAST | S_STACK) -#define S_KEYFIRST (S_WRITE | S_DUPFIRST | S_STACK) -#define S_KEYLAST (S_WRITE | S_DUPLAST | S_STACK) -#define S_WRPAIR (S_WRITE | S_DUPLAST | S_PARENT) - -/* - * If doing insert search (including keyfirst or keylast operations) or a - * split search on behalf of an insert, it's okay to return the entry one - * past the end of the page. - */ -#define PAST_END_OK(f) \ - ((f) == S_INSERT || \ - (f) == S_KEYFIRST || (f) == S_KEYLAST || (f) == S_WRPAIR) +#define S_FIND_WR (S_WRITE | S_DUPFIRST | S_DELNO) +#define S_INSERT (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK) +#define S_KEYFIRST (S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK) +#define S_KEYLAST (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK) +#define S_WRPAIR (S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT) /* * Flags to __bam_iitem(). @@ -149,23 +127,32 @@ struct __epg { }; /* - * All cursors are queued from the master DB structure. Convert the user's - * DB reference to the master DB reference. 
We lock the master DB mutex - * so that we can walk the cursor queue. There's no race in accessing the - * cursors, because if we're modifying a page, we have a write lock on it, - * and therefore no other thread than the current one can have a cursor that - * references the page. + * We maintain a stack of the pages that we're locking in the tree. Btree's + * (currently) only save two levels of the tree at a time, so the default + * stack is always large enough. Recno trees have to lock the entire tree to + * do inserts/deletes, however. Grow the stack as necessary. */ -#define CURSOR_SETUP(dbp) { \ - (dbp) = (dbp)->master; \ - DB_THREAD_LOCK(dbp); \ -} -#define CURSOR_TEARDOWN(dbp) \ - DB_THREAD_UNLOCK(dbp); +#define BT_STK_CLR(c) \ + ((c)->csp = (c)->sp) + +#define BT_STK_ENTER(c, pagep, page_indx, lock, ret) do { \ + if ((ret = \ + (c)->csp == (c)->esp ? __bam_stkgrow(c) : 0) == 0) { \ + (c)->csp->page = pagep; \ + (c)->csp->indx = page_indx; \ + (c)->csp->lock = lock; \ + } \ +} while (0) + +#define BT_STK_PUSH(c, pagep, page_indx, lock, ret) do { \ + BT_STK_ENTER(c, pagep, page_indx, lock, ret); \ + ++(c)->csp; \ +} while (0) + +#define BT_STK_POP(c) \ + ((c)->csp == (c)->stack ? NULL : --(c)->csp) /* - * Btree cursor. - * * Arguments passed to __bam_ca_replace(). */ typedef enum { @@ -173,9 +160,27 @@ typedef enum { REPLACE_SUCCESS, REPLACE_FAILED } ca_replace_arg; + +/* Arguments passed to __ram_ca(). */ +typedef enum { + CA_DELETE, + CA_IAFTER, + CA_IBEFORE +} ca_recno_arg; + +#define RECNO_OOB 0 /* Illegal record number. */ + +/* Btree/Recno cursor. */ struct __cursor { DBC *dbc; /* Enclosing DBC. */ + /* Per-thread information: shared by btree/recno. */ + EPG *sp; /* Stack pointer. */ + EPG *csp; /* Current stack entry. */ + EPG *esp; /* End stack pointer. */ + EPG stack[5]; + + /* Per-thread information: btree private. */ PAGE *page; /* Cursor page. */ db_pgno_t pgno; /* Page. */ @@ -187,90 +192,25 @@ struct __cursor { DB_LOCK lock; /* Cursor read lock. 
*/ db_lockmode_t mode; /* Lock mode. */ - /* - * If a cursor record is deleted, the key/data pair has to remain on - * the page so that subsequent inserts/deletes don't interrupt the - * cursor progression through the file. This results in interesting - * cases when "standard" operations, e.g., dbp->put() are done in the - * context of "deleted" cursors. - * - * C_DELETED -- The item referenced by the cursor has been "deleted" - * but not physically removed from the page. - * C_REPLACE -- The "deleted" item referenced by a cursor has been - * replaced by a dbp->put(), so the cursor is no longer - * responsible for physical removal from the page. - * C_REPLACE_SETUP -- - * We are about to overwrite a "deleted" item, flag any - * cursors referencing it for transition to C_REPLACE - * state. - */ -#define C_DELETED 0x0001 -#define C_REPLACE 0x0002 -#define C_REPLACE_SETUP 0x0004 - - /* - * Internal cursor held for DB->get; don't hold locks unless involved - * in a TXN. - */ -#define C_INTERNAL 0x0008 - u_int32_t flags; -}; - -/* - * Recno cursor. - * - * Arguments passed to __ram_ca(). - */ -typedef enum { - CA_DELETE, - CA_IAFTER, - CA_IBEFORE -} ca_recno_arg; -struct __rcursor { - DBC *dbc; /* Enclosing DBC. */ - + /* Per-thread information: recno private. */ db_recno_t recno; /* Current record number. */ /* - * Cursors referencing "deleted" records are positioned between - * two records, and so must be specially adjusted until they are - * moved. + * Btree: + * We set a flag in the cursor structure if the underlying object has + * been deleted. It's not strictly necessary, we could get the same + * information by looking at the page itself. + * + * Recno: + * When renumbering recno databases during deletes, cursors referencing + * "deleted" records end up positioned between two records, and so must + * be specially adjusted on the next operation. */ -#define CR_DELETED 0x0001 /* Record deleted. */ +#define C_DELETED 0x0001 /* Record was deleted. 
*/ u_int32_t flags; }; /* - * We maintain a stack of the pages that we're locking in the tree. Btree's - * (currently) only save two levels of the tree at a time, so the default - * stack is always large enough. Recno trees have to lock the entire tree to - * do inserts/deletes, however. Grow the stack as necessary. - */ -#undef BT_STK_CLR -#define BT_STK_CLR(t) \ - ((t)->bt_csp = (t)->bt_sp) - -#undef BT_STK_ENTER -#define BT_STK_ENTER(t, pagep, page_indx, lock, ret) do { \ - if ((ret = \ - (t)->bt_csp == (t)->bt_esp ? __bam_stkgrow(t) : 0) == 0) { \ - (t)->bt_csp->page = pagep; \ - (t)->bt_csp->indx = page_indx; \ - (t)->bt_csp->lock = lock; \ - } \ -} while (0) - -#undef BT_STK_PUSH -#define BT_STK_PUSH(t, pagep, page_indx, lock, ret) do { \ - BT_STK_ENTER(t, pagep, page_indx, lock, ret); \ - ++(t)->bt_csp; \ -} while (0) - -#undef BT_STK_POP -#define BT_STK_POP(t) \ - ((t)->bt_csp == (t)->bt_stack ? NULL : --(t)->bt_csp) - -/* * The in-memory recno data structure. * * !!! @@ -278,9 +218,6 @@ struct __rcursor { * are no transaction semantics associated with backing files, nor is there * any thread protection. */ -#undef RECNO_OOB -#define RECNO_OOB 0 /* Illegal record number. */ - struct __recno { int re_delim; /* Variable-length delimiting byte. */ int re_pad; /* Fixed-length padding byte. */ @@ -294,7 +231,7 @@ struct __recno { void *re_emap; /* End of mapped space. */ size_t re_msize; /* Size of mapped region. */ /* Recno input function. */ - int (*re_irec) __P((DB *, db_recno_t)); + int (*re_irec) __P((DBC *, db_recno_t)); #define RECNO_EOF 0x0001 /* EOF on backing source file. */ #define RECNO_MODIFIED 0x0002 /* Tree was modified. */ @@ -302,31 +239,11 @@ struct __recno { }; /* - * The in-memory btree data structure. + * The in-memory, per-tree btree data structure. */ struct __btree { -/* - * These fields are per-thread and are initialized when the BTREE structure - * is created. - */ db_pgno_t bt_lpgno; /* Last insert location. 
*/ - DBT bt_rkey; /* Returned key. */ - DBT bt_rdata; /* Returned data. */ - - EPG *bt_sp; /* Stack pointer. */ - EPG *bt_csp; /* Current stack entry. */ - EPG *bt_esp; /* End stack pointer. */ - EPG bt_stack[5]; - - RECNO *bt_recno; /* Private recno structure. */ - - DB_BTREE_LSTAT lstat; /* Btree local statistics. */ - -/* - * These fields are copied from the original BTREE structure and never - * change. - */ db_indx_t bt_maxkey; /* Maximum keys per page. */ db_indx_t bt_minkey; /* Minimum keys per page. */ @@ -336,6 +253,8 @@ struct __btree { __P((const DBT *, const DBT *)); db_indx_t bt_ovflsize; /* Maximum key/data on-page size. */ + + RECNO *recno; /* Private recno structure. */ }; #include "btree_auto.h" diff --git a/db2/include/btree_ext.h b/db2/include/btree_ext.h index b8a137364c..fbc2ed958f 100644 --- a/db2/include/btree_ext.h +++ b/db2/include/btree_ext.h @@ -1,45 +1,41 @@ /* DO NOT EDIT: automatically built by dist/distrib. */ #ifndef _btree_ext_h_ #define _btree_ext_h_ -int __bam_close __P((DB *)); -int __bam_sync __P((DB *, u_int32_t)); -int __bam_cmp __P((DB *, const DBT *, EPG *)); +int __bam_cmp __P((DB *, const DBT *, + PAGE *, u_int32_t, int (*)(const DBT *, const DBT *))); int __bam_defcmp __P((const DBT *, const DBT *)); size_t __bam_defpfx __P((const DBT *, const DBT *)); int __bam_pgin __P((db_pgno_t, void *, DBT *)); int __bam_pgout __P((db_pgno_t, void *, DBT *)); int __bam_mswap __P((PAGE *)); -int __bam_cursor __P((DB *, DB_TXN *, DBC **)); -int __bam_c_iclose __P((DB *, DBC *)); -int __bam_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -int __bam_ovfl_chk __P((DB *, CURSOR *, u_int32_t, int)); int __bam_cprint __P((DB *)); -int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, CURSOR *, int)); +int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int)); void __bam_ca_di __P((DB *, db_pgno_t, u_int32_t, int)); void __bam_ca_dup __P((DB *, db_pgno_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t)); -void __bam_ca_move __P((DB *, 
db_pgno_t, db_pgno_t)); -void __bam_ca_replace - __P((DB *, db_pgno_t, u_int32_t, ca_replace_arg)); +void __bam_ca_rsplit __P((DB *, db_pgno_t, db_pgno_t)); void __bam_ca_split __P((DB *, db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); +int __bam_c_init __P((DBC *)); +int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int)); int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); -int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); -int __bam_ditem __P((DB *, PAGE *, u_int32_t)); -int __bam_adjindx __P((DB *, PAGE *, u_int32_t, u_int32_t, int)); -int __bam_dpage __P((DB *, const DBT *)); -int __bam_open __P((DB *, DBTYPE, DB_INFO *)); -int __bam_bdup __P((DB *, DB *)); -int __bam_new __P((DB *, u_int32_t, PAGE **)); -int __bam_free __P((DB *, PAGE *)); -int __bam_lt __P((DB *)); -int __bam_lget __P((DB *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); -int __bam_lput __P((DB *, DB_LOCK)); -int __bam_pget __P((DB *, PAGE **, db_pgno_t *, u_int32_t)); -int __bam_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); -int __bam_iitem __P((DB *, +int __bam_ditem __P((DBC *, PAGE *, u_int32_t)); +int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int)); +int __bam_dpage __P((DBC *, const DBT *)); +int __bam_dpages __P((DBC *)); +int __bam_open __P((DB *, DB_INFO *)); +int __bam_close __P((DB *)); +void __bam_setovflsize __P((DB *)); +int __bam_read_root __P((DB *)); +int __bam_new __P((DBC *, u_int32_t, PAGE **)); +int __bam_lput __P((DBC *, DB_LOCK)); +int __bam_free __P((DBC *, PAGE *)); +int __bam_lt __P((DBC *)); +int __bam_lget + __P((DBC *, int, db_pgno_t, db_lockmode_t, DB_LOCK *)); +int __bam_iitem __P((DBC *, PAGE **, db_indx_t *, DBT *, DBT *, u_int32_t, u_int32_t)); -int __bam_ritem __P((DB *, PAGE *, u_int32_t, DBT *)); +int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *)); int __bam_pg_alloc_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __bam_pg_free_recover @@ -56,28 +52,24 @@ int __bam_cdel_recover __P((DB_LOG *, DBT *, DB_LSN *, 
int, void *)); int __bam_repl_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); -int __ram_open __P((DB *, DBTYPE, DB_INFO *)); -int __ram_cursor __P((DB *, DB_TXN *, DBC **)); +int __ram_open __P((DB *, DB_INFO *)); int __ram_close __P((DB *)); -int __ram_c_iclose __P((DB *, DBC *)); +int __ram_c_del __P((DBC *, u_int32_t)); +int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); +int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); void __ram_ca __P((DB *, db_recno_t, ca_recno_arg)); -int __ram_cprint __P((DB *)); -int __ram_getno __P((DB *, const DBT *, db_recno_t *, int)); -int __ram_snapshot __P((DB *)); -int __bam_rsearch __P((DB *, db_recno_t *, u_int32_t, int, int *)); -int __bam_adjust __P((DB *, BTREE *, int32_t)); -int __bam_nrecs __P((DB *, db_recno_t *)); +int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int)); +int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); +int __bam_adjust __P((DBC *, int32_t)); +int __bam_nrecs __P((DBC *, db_recno_t *)); db_recno_t __bam_total __P((PAGE *)); -int __bam_search __P((DB *, +int __bam_search __P((DBC *, const DBT *, u_int32_t, int, db_recno_t *, int *)); -int __bam_stkrel __P((DB *)); -int __bam_stkgrow __P((BTREE *)); -int __bam_split __P((DB *, void *)); -int __bam_broot __P((DB *, PAGE *, PAGE *, PAGE *)); -int __ram_root __P((DB *, PAGE *, PAGE *, PAGE *)); +int __bam_stkrel __P((DBC *, int)); +int __bam_stkgrow __P((CURSOR *)); +int __bam_split __P((DBC *, void *)); int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); -void __bam_add_mstat __P((DB_BTREE_LSTAT *, DB_BTREE_LSTAT *)); int __bam_pg_alloc_log __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, DB_LSN *, DB_LSN *, db_pgno_t, diff --git a/db2/include/clib_ext.h b/db2/include/clib_ext.h index f5510a1629..2566b849ce 100644 --- a/db2/include/clib_ext.h +++ b/db2/include/clib_ext.h @@ -37,12 +37,6 @@ void *memcpy __P((void *, const void *, 
size_t)); #ifndef HAVE_MEMMOVE void *memmove __P((void *, const void *, size_t)); #endif -#ifndef HAVE_MEMCPY -void *memcpy __P((void *, const void *, size_t)); -#endif -#ifndef HAVE_MEMMOVE -void *memmove __P((void *, const void *, size_t)); -#endif #ifndef HAVE_RAISE int raise __P((int)); #endif diff --git a/db2/include/common_ext.h b/db2/include/common_ext.h index 4674f9ce01..33fb0cb218 100644 --- a/db2/include/common_ext.h +++ b/db2/include/common_ext.h @@ -5,26 +5,18 @@ int __db_appname __P((DB_ENV *, APPNAME, const char *, const char *, u_int32_t, int *, char **)); int __db_apprec __P((DB_ENV *, u_int32_t)); int __db_byteorder __P((DB_ENV *, int)); +int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t)); +int __db_fcchk + __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t)); +int __db_ferr __P((const DB_ENV *, const char *, int)); #ifdef __STDC__ void __db_err __P((const DB_ENV *dbenv, const char *fmt, ...)); #else void __db_err(); #endif -int __db_panic __P((DB *)); -int __db_fchk __P((DB_ENV *, const char *, u_int32_t, u_int32_t)); -int __db_fcchk - __P((DB_ENV *, const char *, u_int32_t, u_int32_t, u_int32_t)); -int __db_cdelchk __P((const DB *, u_int32_t, int, int)); -int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int)); -int __db_cputchk __P((const DB *, - const DBT *, DBT *, u_int32_t, int, int)); -int __db_delchk __P((const DB *, DBT *, u_int32_t, int)); -int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t)); -int __db_putchk - __P((const DB *, DBT *, const DBT *, u_int32_t, int, int)); -int __db_statchk __P((const DB *, u_int32_t)); -int __db_syncchk __P((const DB *, u_int32_t)); -int __db_ferr __P((const DB_ENV *, const char *, int)); +int __db_pgerr __P((DB *, db_pgno_t)); +int __db_pgfmt __P((DB *, db_pgno_t)); +int __db_panic __P((DB_ENV *, int)); u_int32_t __db_log2 __P((u_int32_t)); int __db_rattach __P((REGINFO *)); int __db_rdetach __P((REGINFO *)); diff --git a/db2/include/db.h.src 
b/db2/include/db.h.src deleted file mode 100644 index 97ad55693f..0000000000 --- a/db2/include/db.h.src +++ /dev/null @@ -1,994 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998 - * Sleepycat Software. All rights reserved. - * - * @(#)db.h.src 10.131 (Sleepycat) 6/2/98 - */ - -#ifndef _DB_H_ -#define _DB_H_ - -#ifndef __NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <stdio.h> -#endif - -/* - * XXX - * MacOS: ensure that Metrowerks C makes enumeration types int sized. - */ -#ifdef __MWERKS__ -#pragma enumsalwaysint on -#endif - -/* - * XXX - * Handle function prototypes and the keyword "const". This steps on name - * space that DB doesn't control, but all of the other solutions are worse. - * - * XXX - * While Microsoft's compiler is ANSI C compliant, it doesn't have _STDC_ - * defined by default, you specify a command line flag or #pragma to turn - * it on. Don't do that, however, because some of Microsoft's own header - * files won't compile. - */ -#undef __P -#if defined(__STDC__) || defined(__cplusplus) || defined(_MSC_VER) -#define __P(protos) protos /* ANSI C prototypes */ -#else -#define const -#define __P(protos) () /* K&R C preprocessor */ -#endif - -/* - * !!! - * DB needs basic information about specifically sized types. If they're - * not provided by the system, typedef them here. - * - * We protect them against multiple inclusion using __BIT_TYPES_DEFINED__, - * as does BIND and Kerberos, since we don't know for sure what #include - * files the user is using. - * - * !!! - * We also provide the standard u_int, u_long etc., if they're not provided - * by the system. 
- */ -#ifndef __BIT_TYPES_DEFINED__ -#define __BIT_TYPES_DEFINED__ -@u_int8_decl@ -@int16_decl@ -@u_int16_decl@ -@int32_decl@ -@u_int32_decl@ -#endif - -@u_char_decl@ -@u_short_decl@ -@u_int_decl@ -@u_long_decl@ - -#define DB_VERSION_MAJOR 2 -#define DB_VERSION_MINOR 4 -#define DB_VERSION_PATCH 14 -#define DB_VERSION_STRING "Sleepycat Software: DB 2.4.14: (6/2/98)" - -typedef u_int32_t db_pgno_t; /* Page number type. */ -typedef u_int16_t db_indx_t; /* Page offset type. */ -#define DB_MAX_PAGES 0xffffffff /* >= # of pages in a file */ - -typedef u_int32_t db_recno_t; /* Record number type. */ -typedef size_t DB_LOCK; /* Object returned by lock manager. */ -#define DB_MAX_RECORDS 0xffffffff /* >= # of records in a tree */ - -#define DB_FILE_ID_LEN 20 /* DB file ID length. */ - -/* Forward structure declarations, so applications get type checking. */ -struct __db; typedef struct __db DB; -#ifdef DB_DBM_HSEARCH - typedef struct __db DBM; -#endif -struct __db_bt_stat; typedef struct __db_bt_stat DB_BTREE_STAT; -struct __db_dbt; typedef struct __db_dbt DBT; -struct __db_env; typedef struct __db_env DB_ENV; -struct __db_info; typedef struct __db_info DB_INFO; -struct __db_lock_stat; typedef struct __db_lock_stat DB_LOCK_STAT; -struct __db_lockregion; typedef struct __db_lockregion DB_LOCKREGION; -struct __db_lockreq; typedef struct __db_lockreq DB_LOCKREQ; -struct __db_locktab; typedef struct __db_locktab DB_LOCKTAB; -struct __db_log; typedef struct __db_log DB_LOG; -struct __db_log_stat; typedef struct __db_log_stat DB_LOG_STAT; -struct __db_lsn; typedef struct __db_lsn DB_LSN; -struct __db_mpool; typedef struct __db_mpool DB_MPOOL; -struct __db_mpool_finfo;typedef struct __db_mpool_finfo DB_MPOOL_FINFO; -struct __db_mpool_fstat;typedef struct __db_mpool_fstat DB_MPOOL_FSTAT; -struct __db_mpool_stat; typedef struct __db_mpool_stat DB_MPOOL_STAT; -struct __db_mpoolfile; typedef struct __db_mpoolfile DB_MPOOLFILE; -struct __db_txn; typedef struct __db_txn DB_TXN; -struct 
__db_txn_active; typedef struct __db_txn_active DB_TXN_ACTIVE; -struct __db_txn_stat; typedef struct __db_txn_stat DB_TXN_STAT; -struct __db_txnmgr; typedef struct __db_txnmgr DB_TXNMGR; -struct __db_txnregion; typedef struct __db_txnregion DB_TXNREGION; -struct __dbc; typedef struct __dbc DBC; - -/* Key/data structure -- a Data-Base Thang. */ -struct __db_dbt { - void *data; /* key/data */ - u_int32_t size; /* key/data length */ - u_int32_t ulen; /* RO: length of user buffer. */ - u_int32_t dlen; /* RO: get/put record length. */ - u_int32_t doff; /* RO: get/put record offset. */ - -#define DB_DBT_INTERNAL 0x01 /* Perform any mallocs using regular - malloc, not the user's malloc. */ -#define DB_DBT_MALLOC 0x02 /* Return in allocated memory. */ -#define DB_DBT_PARTIAL 0x04 /* Partial put/get. */ -#define DB_DBT_USERMEM 0x08 /* Return in user's memory. */ - u_int32_t flags; -}; - -/* - * DB internal configuration. - * - * There are a set of functions that the application can replace with its - * own versions, and some other knobs which can be turned at run-time. - */ -#define DB_FUNC_CALLOC 1 /* DELETED: ANSI C calloc. */ -#define DB_FUNC_CLOSE 2 /* POSIX 1003.1 close. */ -#define DB_FUNC_DIRFREE 3 /* DB: free directory list. */ -#define DB_FUNC_DIRLIST 4 /* DB: create directory list. */ -#define DB_FUNC_EXISTS 5 /* DB: return if file exists. */ -#define DB_FUNC_FREE 6 /* ANSI C free. */ -#define DB_FUNC_FSYNC 7 /* POSIX 1003.1 fsync. */ -#define DB_FUNC_IOINFO 8 /* DB: return file I/O information. */ -#define DB_FUNC_MALLOC 9 /* ANSI C malloc. */ -#define DB_FUNC_MAP 10 /* DB: map file into shared memory. */ -#define DB_FUNC_OPEN 11 /* POSIX 1003.1 open. */ -#define DB_FUNC_READ 12 /* POSIX 1003.1 read. */ -#define DB_FUNC_REALLOC 13 /* ANSI C realloc. */ -#define DB_FUNC_SEEK 14 /* POSIX 1003.1 lseek. */ -#define DB_FUNC_SLEEP 15 /* DB: sleep secs/usecs. */ -#define DB_FUNC_STRDUP 16 /* DELETED: DB: strdup(3). */ -#define DB_FUNC_UNLINK 17 /* POSIX 1003.1 unlink. 
*/ -#define DB_FUNC_UNMAP 18 /* DB: unmap shared memory file. */ -#define DB_FUNC_WRITE 19 /* POSIX 1003.1 write. */ -#define DB_FUNC_YIELD 20 /* DB: yield thread to scheduler. */ -#define DB_TSL_SPINS 21 /* DB: initialize spin count. */ -#define DB_FUNC_RUNLINK 22 /* DB: remove a shared region. */ -#define DB_REGION_ANON 23 /* DB: anonymous, unnamed regions. */ -#define DB_REGION_INIT 24 /* DB: page-fault regions in create. */ -#define DB_REGION_NAME 25 /* DB: anonymous, named regions. */ -#define DB_MUTEXLOCKS 26 /* DB: turn off all mutex locks. */ -#define DB_PAGEYIELD 27 /* DB: yield the CPU on pool get. */ - -/* - * Database configuration and initialization. - */ - /* - * Flags understood by both db_open(3) and db_appinit(3). - */ -#define DB_CREATE 0x000001 /* O_CREAT: create file as necessary. */ -#define DB_NOMMAP 0x000002 /* Don't mmap underlying file. */ -#define DB_THREAD 0x000004 /* Free-thread DB package handles. */ - -/* - * Flags understood by db_appinit(3). - */ -/* 0x000007 COMMON MASK. */ -#define DB_INIT_LOCK 0x000008 /* Initialize locking. */ -#define DB_INIT_LOG 0x000010 /* Initialize logging. */ -#define DB_INIT_MPOOL 0x000020 /* Initialize mpool. */ -#define DB_INIT_TXN 0x000040 /* Initialize transactions. */ -#define DB_MPOOL_PRIVATE 0x000080 /* Mpool: private memory pool. */ -#define __UNUSED_100 0x000100 -#define DB_RECOVER 0x000200 /* Run normal recovery. */ -#define DB_RECOVER_FATAL 0x000400 /* Run catastrophic recovery. */ -#define DB_TXN_NOSYNC 0x000800 /* Do not sync log on commit. */ -#define DB_USE_ENVIRON 0x001000 /* Use the environment. */ -#define DB_USE_ENVIRON_ROOT 0x002000 /* Use the environment if root. */ - -/* CURRENTLY UNUSED LOCK FLAGS. */ -#define DB_TXN_LOCK_2PL 0x000000 /* Two-phase locking. */ -#define DB_TXN_LOCK_OPTIMIST 0x000000 /* Optimistic locking. */ -#define DB_TXN_LOCK_MASK 0x000000 /* Lock flags mask. */ - -/* CURRENTLY UNUSED LOG FLAGS. */ -#define DB_TXN_LOG_REDO 0x000000 /* Redo-only logging. 
*/ -#define DB_TXN_LOG_UNDO 0x000000 /* Undo-only logging. */ -#define DB_TXN_LOG_UNDOREDO 0x000000 /* Undo/redo write-ahead logging. */ -#define DB_TXN_LOG_MASK 0x000000 /* Log flags mask. */ - -/* - * Flags understood by db_open(3). - * - * DB_EXCL and DB_TEMPORARY are internal only, and are not documented. - * DB_SEQUENTIAL is currently internal, but may be exported some day. - */ -/* 0x000007 COMMON MASK. */ -/* 0x003fff ALREADY USED. */ -#define __UNUSED_4000 0x004000 -#define DB_EXCL 0x008000 /* O_EXCL: exclusive open. */ -#define DB_RDONLY 0x010000 /* O_RDONLY: read-only. */ -#define DB_SEQUENTIAL 0x020000 /* Indicate sequential access. */ -#define DB_TEMPORARY 0x040000 /* Remove on last close. */ -#define DB_TRUNCATE 0x080000 /* O_TRUNCATE: replace existing DB. */ - -/* - * Deadlock detector modes; used in the DBENV structure to configure the - * locking subsystem. - */ -#define DB_LOCK_NORUN 0x0 -#define DB_LOCK_DEFAULT 0x1 /* Default policy. */ -#define DB_LOCK_OLDEST 0x2 /* Abort oldest transaction. */ -#define DB_LOCK_RANDOM 0x3 /* Abort random transaction. */ -#define DB_LOCK_YOUNGEST 0x4 /* Abort youngest transaction. */ - -struct __db_env { - int db_lorder; /* Byte order. */ - - /* Error message callback. */ - void (*db_errcall) __P((const char *, char *)); - FILE *db_errfile; /* Error message file stream. */ - const char *db_errpfx; /* Error message prefix. */ - int db_verbose; /* Generate debugging messages. */ - - /* User paths. */ - char *db_home; /* Database home. */ - char *db_log_dir; /* Database log file directory. */ - char *db_tmp_dir; /* Database tmp file directory. */ - - char **db_data_dir; /* Database data file directories. */ - int data_cnt; /* Database data file slots. */ - int data_next; /* Next Database data file slot. */ - - /* Locking. */ - DB_LOCKTAB *lk_info; /* Return from lock_open(). */ - u_int8_t *lk_conflicts; /* Two dimensional conflict matrix. */ - u_int32_t lk_modes; /* Number of lock modes in table. 
*/ - u_int32_t lk_max; /* Maximum number of locks. */ - u_int32_t lk_detect; /* Deadlock detect on all conflicts. */ - - /* Logging. */ - DB_LOG *lg_info; /* Return from log_open(). */ - u_int32_t lg_max; /* Maximum file size. */ - - /* Memory pool. */ - DB_MPOOL *mp_info; /* Return from memp_open(). */ - size_t mp_mmapsize; /* Maximum file size for mmap. */ - size_t mp_size; /* Bytes in the mpool cache. */ - - /* Transactions. */ - DB_TXNMGR *tx_info; /* Return from txn_open(). */ - u_int32_t tx_max; /* Maximum number of transactions. */ - int (*tx_recover) /* Dispatch function for recovery. */ - __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); - -#define DB_ENV_APPINIT 0x01 /* Paths initialized by db_appinit(). */ -#define DB_ENV_STANDALONE 0x02 /* Test: freestanding environment. */ -#define DB_ENV_THREAD 0x04 /* DB_ENV is multi-threaded. */ - u_int32_t flags; /* Flags. */ -}; - -/******************************************************* - * Access methods. - *******************************************************/ -/* - * XXX - * Changes here must be reflected in java/src/com/sleepycat/db/Db.java. - */ -typedef enum { - DB_BTREE=1, /* B+tree. */ - DB_HASH, /* Extended Linear Hashing. */ - DB_RECNO, /* Fixed and variable-length records. */ - DB_UNKNOWN /* Figure it out on open. */ -} DBTYPE; - -#define DB_BTREEVERSION 6 /* Current btree version. */ -#define DB_BTREEOLDVER 6 /* Oldest btree version supported. */ -#define DB_BTREEMAGIC 0x053162 - -#define DB_HASHVERSION 5 /* Current hash version. */ -#define DB_HASHOLDVER 4 /* Oldest hash version supported. */ -#define DB_HASHMAGIC 0x061561 - -#define DB_LOGVERSION 2 /* Current log version. */ -#define DB_LOGOLDVER 2 /* Oldest log version supported. */ -#define DB_LOGMAGIC 0x040988 - -struct __db_info { - int db_lorder; /* Byte order. */ - size_t db_cachesize; /* Underlying cache size. */ - size_t db_pagesize; /* Underlying page size. */ - - /* Local heap allocation. 
*/ - void *(*db_malloc) __P((size_t)); - - /* Btree access method. */ - u_int32_t bt_maxkey; /* Maximum keys per page. */ - u_int32_t bt_minkey; /* Minimum keys per page. */ - int (*bt_compare) /* Comparison function. */ - __P((const DBT *, const DBT *)); - size_t (*bt_prefix) /* Prefix function. */ - __P((const DBT *, const DBT *)); - - /* Hash access method. */ - u_int32_t h_ffactor; /* Fill factor. */ - u_int32_t h_nelem; /* Number of elements. */ - u_int32_t (*h_hash) /* Hash function. */ - __P((const void *, u_int32_t)); - - /* Recno access method. */ - int re_pad; /* Fixed-length padding byte. */ - int re_delim; /* Variable-length delimiting byte. */ - u_int32_t re_len; /* Length for fixed-length records. */ - char *re_source; /* Source file name. */ - -#define DB_DELIMITER 0x0001 /* Recno: re_delim set. */ -#define DB_DUP 0x0002 /* Btree, Hash: duplicate keys. */ -#define DB_FIXEDLEN 0x0004 /* Recno: fixed-length records. */ -#define DB_PAD 0x0008 /* Recno: re_pad set. */ -#define DB_RECNUM 0x0010 /* Btree: record numbers. */ -#define DB_RENUMBER 0x0020 /* Recno: renumber on insert/delete. */ -#define DB_SNAPSHOT 0x0040 /* Recno: snapshot the input. */ - u_int32_t flags; -}; - -/* - * DB access method and cursor operation codes. These are implemented as - * bit fields for future flexibility, but currently only a single one may - * be specified to any function. 
- */ -#define DB_AFTER 0x000001 /* c_put() */ -#define DB_APPEND 0x000002 /* put() */ -#define DB_BEFORE 0x000004 /* c_put() */ -#define DB_CHECKPOINT 0x000008 /* log_put(), log_get() */ -#define DB_CURRENT 0x000010 /* c_get(), c_put(), log_get() */ -#define DB_FIRST 0x000020 /* c_get(), log_get() */ -#define DB_FLUSH 0x000040 /* log_put() */ -#define DB_GET_RECNO 0x000080 /* get(), c_get() */ -#define DB_KEYFIRST 0x000100 /* c_put() */ -#define DB_KEYLAST 0x000200 /* c_put() */ -#define DB_LAST 0x000400 /* c_get(), log_get() */ -#define DB_NEXT 0x000800 /* c_get(), log_get() */ -#define DB_NOOVERWRITE 0x001000 /* put() */ -#define DB_NOSYNC 0x002000 /* close() */ -#define DB_PREV 0x004000 /* c_get(), log_get() */ -#define DB_RECORDCOUNT 0x008000 /* stat() */ -#define DB_SET 0x010000 /* c_get(), log_get() */ -#define DB_SET_RANGE 0x020000 /* c_get() */ -#define DB_SET_RECNO 0x040000 /* c_get() */ -#define DB_CURLSN 0x080000 /* log_put() */ - -/* - * DB (user visible) error return codes. - * - * XXX - * Changes to any of the user visible error return codes must be reflected - * in java/src/com/sleepycat/db/Db.java. - */ -#define DB_INCOMPLETE ( -1) /* Sync didn't finish. */ -#define DB_KEYEMPTY ( -2) /* The key/data pair was deleted or - was never created by the user. */ -#define DB_KEYEXIST ( -3) /* The key/data pair already exists. */ -#define DB_LOCK_DEADLOCK ( -4) /* Locker killed to resolve deadlock. */ -#define DB_LOCK_NOTGRANTED ( -5) /* Lock unavailable, no-wait set. */ -#define DB_LOCK_NOTHELD ( -6) /* Lock not held by locker. */ -#define DB_NOTFOUND ( -7) /* Key/data pair not found (EOF). */ - -/* DB (private) error return codes. */ -#define DB_DELETED ( -8) /* Recovery file marked deleted. */ -#define DB_NEEDSPLIT ( -9) /* Page needs to be split. */ -#define DB_REGISTERED (-10) /* Entry was previously registered. */ -#define DB_SWAPBYTES (-11) /* Database needs byte swapping. */ -#define DB_TXN_CKP (-12) /* Encountered ckp record in log. 
*/ - -struct __db_ilock { /* Internal DB access method lock. */ - db_pgno_t pgno; /* Page being locked. */ - /* File id. */ - u_int8_t fileid[DB_FILE_ID_LEN]; -}; - -/* DB access method description structure. */ -struct __db { - void *mutexp; /* Synchronization for free threading */ - DBTYPE type; /* DB access method. */ - DB_ENV *dbenv; /* DB_ENV structure. */ - DB_ENV *mp_dbenv; /* DB_ENV for local mpool creation. */ - - DB *master; /* Original DB created by db_open. */ - void *internal; /* Access method private. */ - - DB_MPOOL *mp; /* The access method's mpool. */ - DB_MPOOLFILE *mpf; /* The access method's mpool file. */ - - /* - * XXX - * Explicit representations of structures in queue.h. - * - * TAILQ_HEAD(curs_queue, __dbc); - */ - struct { - struct __dbc *tqh_first; - struct __dbc **tqh_last; - } curs_queue; - - /* - * XXX - * Explicit representations of structures in queue.h. - * - * LIST_HEAD(handleq, __db); - * LIST_ENTRY(__db); - */ - struct { - struct __db *lh_first; - } handleq; /* List of handles for this DB. */ - struct { - struct __db *le_next; - struct __db **le_prev; - } links; /* Links for the handle list. */ - - u_int32_t log_fileid; /* Logging file id. */ - - DB_TXN *txn; /* Current transaction. */ - u_int32_t locker; /* Default process' locker id. */ - DBT lock_dbt; /* DBT referencing lock. */ - struct __db_ilock lock; /* Lock. */ - - size_t pgsize; /* Logical page size of file. */ - - /* Local heap allocation. */ - void *(*db_malloc) __P((size_t)); - - /* Functions. */ - int (*close) __P((DB *, u_int32_t)); - int (*cursor) __P((DB *, DB_TXN *, DBC **)); - int (*del) __P((DB *, DB_TXN *, DBT *, u_int32_t)); - int (*fd) __P((DB *, int *)); - int (*get) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); - int (*put) __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); - int (*stat) __P((DB *, void *, void *(*)(size_t), u_int32_t)); - int (*sync) __P((DB *, u_int32_t)); - -#define DB_AM_DUP 0x000001 /* DB_DUP (internal). 
*/ -#define DB_AM_INMEM 0x000002 /* In-memory; no sync on close. */ -#define DB_AM_LOCKING 0x000004 /* Perform locking. */ -#define DB_AM_LOGGING 0x000008 /* Perform logging. */ -#define DB_AM_MLOCAL 0x000010 /* Database memory pool is local. */ -#define DB_AM_PGDEF 0x000020 /* Page size was defaulted. */ -#define DB_AM_RDONLY 0x000040 /* Database is readonly. */ -#define DB_AM_RECOVER 0x000080 /* In recovery (do not log or lock). */ -#define DB_AM_SWAP 0x000100 /* Pages need to be byte-swapped. */ -#define DB_AM_THREAD 0x000200 /* DB is multi-threaded. */ -#define DB_BT_RECNUM 0x000400 /* DB_RECNUM (internal) */ -#define DB_HS_DIRTYMETA 0x000800 /* Hash: Metadata page modified. */ -#define DB_RE_DELIMITER 0x001000 /* DB_DELIMITER (internal). */ -#define DB_RE_FIXEDLEN 0x002000 /* DB_FIXEDLEN (internal). */ -#define DB_RE_PAD 0x004000 /* DB_PAD (internal). */ -#define DB_RE_RENUMBER 0x008000 /* DB_RENUMBER (internal). */ -#define DB_RE_SNAPSHOT 0x010000 /* DB_SNAPSHOT (internal). */ - u_int32_t flags; -}; - -/* Cursor description structure. */ -struct __dbc { - DB *dbp; /* Related DB access method. */ - DB_TXN *txn; /* Associated transaction. */ - - /* - * XXX - * Explicit representations of structures in queue.h. - * - * TAILQ_ENTRY(__dbc); - */ - struct { - struct __dbc *tqe_next; - struct __dbc **tqe_prev; - } links; - - void *internal; /* Access method private. */ - - int (*c_close) __P((DBC *)); - int (*c_del) __P((DBC *, u_int32_t)); - int (*c_get) __P((DBC *, DBT *, DBT *, u_int32_t)); - int (*c_put) __P((DBC *, DBT *, DBT *, u_int32_t)); -}; - -/* Btree/recno statistics structure. */ -struct __db_bt_stat { - u_int32_t bt_flags; /* Open flags. */ - u_int32_t bt_maxkey; /* Maxkey value. */ - u_int32_t bt_minkey; /* Minkey value. */ - u_int32_t bt_re_len; /* Fixed-length record length. */ - u_int32_t bt_re_pad; /* Fixed-length record pad. */ - u_int32_t bt_pagesize; /* Page size. */ - u_int32_t bt_levels; /* Tree levels. 
*/ - u_int32_t bt_nrecs; /* Number of records. */ - u_int32_t bt_int_pg; /* Internal pages. */ - u_int32_t bt_leaf_pg; /* Leaf pages. */ - u_int32_t bt_dup_pg; /* Duplicate pages. */ - u_int32_t bt_over_pg; /* Overflow pages. */ - u_int32_t bt_free; /* Pages on the free list. */ - u_int32_t bt_freed; /* Pages freed for reuse. */ - u_int32_t bt_int_pgfree; /* Bytes free in internal pages. */ - u_int32_t bt_leaf_pgfree; /* Bytes free in leaf pages. */ - u_int32_t bt_dup_pgfree; /* Bytes free in duplicate pages. */ - u_int32_t bt_over_pgfree; /* Bytes free in overflow pages. */ - u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ - u_int32_t bt_split; /* Total number of splits. */ - u_int32_t bt_rootsplit; /* Root page splits. */ - u_int32_t bt_fastsplit; /* Fast splits. */ - u_int32_t bt_added; /* Items added. */ - u_int32_t bt_deleted; /* Items deleted. */ - u_int32_t bt_get; /* Items retrieved. */ - u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ - u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ - u_int32_t bt_magic; /* Magic number. */ - u_int32_t bt_version; /* Version number. */ -}; - -#if defined(__cplusplus) -extern "C" { -#endif -int db_appinit __P((const char *, char * const *, DB_ENV *, u_int32_t)); -int db_appexit __P((DB_ENV *)); -int db_jump_set __P((void *, int)); -int db_open __P((const char *, - DBTYPE, u_int32_t, int, DB_ENV *, DB_INFO *, DB **)); -int db_value_set __P((int, int)); -char *db_version __P((int *, int *, int *)); -#if defined(__cplusplus) -} -#endif - -/******************************************************* - * Locking - *******************************************************/ -#define DB_LOCKVERSION 1 -#define DB_LOCKMAGIC 0x090193 - -/* Flag values for lock_vec(). */ -#define DB_LOCK_NOWAIT 0x01 /* Don't wait on unavailable lock. */ - -/* Flag values for lock_detect(). */ -#define DB_LOCK_CONFLICT 0x01 /* Run on any conflict. */ - -/* - * Request types. 
- * - * XXX - * Changes here must be reflected in java/src/com/sleepycat/db/Db.java. - */ -typedef enum { - DB_LOCK_DUMP=0, /* Display held locks. */ - DB_LOCK_GET, /* Get the lock. */ - DB_LOCK_PUT, /* Release the lock. */ - DB_LOCK_PUT_ALL, /* Release locker's locks. */ - DB_LOCK_PUT_OBJ /* Release locker's locks on obj. */ -} db_lockop_t; - -/* - * Simple R/W lock modes and for multi-granularity intention locking. - * - * XXX - * Changes here must be reflected in java/src/com/sleepycat/db/Db.java. - */ -typedef enum { - DB_LOCK_NG=0, /* Not granted. */ - DB_LOCK_READ, /* Shared/read. */ - DB_LOCK_WRITE, /* Exclusive/write. */ - DB_LOCK_IREAD, /* Intent to share/read. */ - DB_LOCK_IWRITE, /* Intent exclusive/write. */ - DB_LOCK_IWR /* Intent to read and write. */ -} db_lockmode_t; - -/* - * Status of a lock. - */ -typedef enum { - DB_LSTAT_ABORTED, /* Lock belongs to an aborted txn. */ - DB_LSTAT_ERR, /* Lock is bad. */ - DB_LSTAT_FREE, /* Lock is unallocated. */ - DB_LSTAT_HELD, /* Lock is currently held. */ - DB_LSTAT_NOGRANT, /* Lock was not granted. */ - DB_LSTAT_PENDING, /* Lock was waiting and has been - * promoted; waiting for the owner - * to run and upgrade it to held. */ - DB_LSTAT_WAITING /* Lock is on the wait queue. */ -} db_status_t; - -/* Lock request structure. */ -struct __db_lockreq { - db_lockop_t op; /* Operation. */ - db_lockmode_t mode; /* Requested mode. */ - u_int32_t locker; /* Locker identity. */ - DBT *obj; /* Object being locked. */ - DB_LOCK lock; /* Lock returned. */ -}; - -/* - * Commonly used conflict matrices. - * - * Standard Read/Write (or exclusive/shared) locks. - */ -#define DB_LOCK_RW_N 3 -extern const u_int8_t db_rw_conflicts[]; - -/* Multi-granularity locking. */ -#define DB_LOCK_RIW_N 6 -extern const u_int8_t db_riw_conflicts[]; - -struct __db_lock_stat { - u_int32_t st_magic; /* Lock file magic number. */ - u_int32_t st_version; /* Lock file version number. */ - u_int32_t st_maxlocks; /* Maximum number of locks in table. 
*/ - u_int32_t st_nmodes; /* Number of lock modes. */ - u_int32_t st_numobjs; /* Number of objects. */ - u_int32_t st_nlockers; /* Number of lockers. */ - u_int32_t st_nconflicts; /* Number of lock conflicts. */ - u_int32_t st_nrequests; /* Number of lock gets. */ - u_int32_t st_nreleases; /* Number of lock puts. */ - u_int32_t st_ndeadlocks; /* Number of lock deadlocks. */ - u_int32_t st_region_wait; /* Region lock granted after wait. */ - u_int32_t st_region_nowait; /* Region lock granted without wait. */ - u_int32_t st_refcnt; /* Region reference count. */ - u_int32_t st_regsize; /* Region size. */ -}; - -#if defined(__cplusplus) -extern "C" { -#endif -int lock_close __P((DB_LOCKTAB *)); -int lock_detect __P((DB_LOCKTAB *, u_int32_t, u_int32_t)); -int lock_get __P((DB_LOCKTAB *, - u_int32_t, u_int32_t, const DBT *, db_lockmode_t, DB_LOCK *)); -int lock_id __P((DB_LOCKTAB *, u_int32_t *)); -int lock_open __P((const char *, - u_int32_t, int, DB_ENV *, DB_LOCKTAB **)); -int lock_put __P((DB_LOCKTAB *, DB_LOCK)); -int lock_stat __P((DB_LOCKTAB *, DB_LOCK_STAT **, void *(*)(size_t))); -int lock_unlink __P((const char *, int, DB_ENV *)); -int lock_vec __P((DB_LOCKTAB *, - u_int32_t, u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **)); -#if defined(__cplusplus) -} -#endif - -/******************************************************* - * Logging. - *******************************************************/ -/* Flag values for log_archive(). */ -#define DB_ARCH_ABS 0x001 /* Absolute pathnames. */ -#define DB_ARCH_DATA 0x002 /* Data files. */ -#define DB_ARCH_LOG 0x004 /* Log files. */ - -/* - * A DB_LSN has two parts, a fileid which identifies a specific file, and an - * offset within that file. The fileid is an unsigned 4-byte quantity that - * uniquely identifies a file within the log directory -- currently a simple - * counter inside the log. The offset is also an unsigned 4-byte value. 
The - * log manager guarantees the offset is never more than 4 bytes by switching - * to a new log file before the maximum length imposed by an unsigned 4-byte - * offset is reached. - */ -struct __db_lsn { - u_int32_t file; /* File ID. */ - u_int32_t offset; /* File offset. */ -}; - -/* Log statistics structure. */ -struct __db_log_stat { - u_int32_t st_magic; /* Log file magic number. */ - u_int32_t st_version; /* Log file version number. */ - int st_mode; /* Log file mode. */ - u_int32_t st_lg_max; /* Maximum log file size. */ - u_int32_t st_w_bytes; /* Bytes to log. */ - u_int32_t st_w_mbytes; /* Megabytes to log. */ - u_int32_t st_wc_bytes; /* Bytes to log since checkpoint. */ - u_int32_t st_wc_mbytes; /* Megabytes to log since checkpoint. */ - u_int32_t st_wcount; /* Total syncs to the log. */ - u_int32_t st_scount; /* Total writes to the log. */ - u_int32_t st_region_wait; /* Region lock granted after wait. */ - u_int32_t st_region_nowait; /* Region lock granted without wait. */ - u_int32_t st_cur_file; /* Current log file number. */ - u_int32_t st_cur_offset; /* Current log file offset. */ - u_int32_t st_refcnt; /* Region reference count. */ - u_int32_t st_regsize; /* Region size. 
*/ -}; - -#if defined(__cplusplus) -extern "C" { -#endif -int log_archive __P((DB_LOG *, char **[], u_int32_t, void *(*)(size_t))); -int log_close __P((DB_LOG *)); -int log_compare __P((const DB_LSN *, const DB_LSN *)); -int log_file __P((DB_LOG *, const DB_LSN *, char *, size_t)); -int log_flush __P((DB_LOG *, const DB_LSN *)); -int log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t)); -int log_open __P((const char *, u_int32_t, int, DB_ENV *, DB_LOG **)); -int log_put __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t)); -int log_register __P((DB_LOG *, DB *, const char *, DBTYPE, u_int32_t *)); -int log_stat __P((DB_LOG *, DB_LOG_STAT **, void *(*)(size_t))); -int log_unlink __P((const char *, int, DB_ENV *)); -int log_unregister __P((DB_LOG *, u_int32_t)); -#if defined(__cplusplus) -} -#endif - -/******************************************************* - * Mpool - *******************************************************/ -/* Flag values for memp_fget(). */ -#define DB_MPOOL_CREATE 0x001 /* Create a page. */ -#define DB_MPOOL_LAST 0x002 /* Return the last page. */ -#define DB_MPOOL_NEW 0x004 /* Create a new page. */ - -/* Flag values for memp_fput(), memp_fset(). */ -#define DB_MPOOL_CLEAN 0x001 /* Clear modified bit. */ -#define DB_MPOOL_DIRTY 0x002 /* Page is modified. */ -#define DB_MPOOL_DISCARD 0x004 /* Don't cache the page. */ - -/* Mpool statistics structure. */ -struct __db_mpool_stat { - size_t st_cachesize; /* Cache size. */ - u_int32_t st_cache_hit; /* Pages found in the cache. */ - u_int32_t st_cache_miss; /* Pages not found in the cache. */ - u_int32_t st_map; /* Pages from mapped files. */ - u_int32_t st_page_create; /* Pages created in the cache. */ - u_int32_t st_page_in; /* Pages read in. */ - u_int32_t st_page_out; /* Pages written out. */ - u_int32_t st_ro_evict; /* Clean pages forced from the cache. */ - u_int32_t st_rw_evict; /* Dirty pages forced from the cache. */ - u_int32_t st_hash_buckets; /* Number of hash buckets. 
*/ - u_int32_t st_hash_searches; /* Total hash chain searches. */ - u_int32_t st_hash_longest; /* Longest hash chain searched. */ - u_int32_t st_hash_examined; /* Total hash entries searched. */ - u_int32_t st_page_clean; /* Clean pages. */ - u_int32_t st_page_dirty; /* Dirty pages. */ - u_int32_t st_page_trickle; /* Pages written by memp_trickle. */ - u_int32_t st_region_wait; /* Region lock granted after wait. */ - u_int32_t st_region_nowait; /* Region lock granted without wait. */ - u_int32_t st_refcnt; /* Region reference count. */ - u_int32_t st_regsize; /* Region size. */ -}; - -/* Mpool file open information structure. */ -struct __db_mpool_finfo { - int ftype; /* File type. */ - DBT *pgcookie; /* Byte-string passed to pgin/pgout. */ - u_int8_t *fileid; /* Unique file ID. */ - int32_t lsn_offset; /* LSN offset in page. */ - u_int32_t clear_len; /* Cleared length on created pages. */ -}; - -/* Mpool file statistics structure. */ -struct __db_mpool_fstat { - char *file_name; /* File name. */ - size_t st_pagesize; /* Page size. */ - u_int32_t st_cache_hit; /* Pages found in the cache. */ - u_int32_t st_cache_miss; /* Pages not found in the cache. */ - u_int32_t st_map; /* Pages from mapped files. */ - u_int32_t st_page_create; /* Pages created in the cache. */ - u_int32_t st_page_in; /* Pages read in. */ - u_int32_t st_page_out; /* Pages written out. 
*/ -}; - -#if defined(__cplusplus) -extern "C" { -#endif -int memp_close __P((DB_MPOOL *)); -int memp_fclose __P((DB_MPOOLFILE *)); -int memp_fget __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *)); -int memp_fopen __P((DB_MPOOL *, const char *, - u_int32_t, int, size_t, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); -int memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t)); -int memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t)); -int memp_fsync __P((DB_MPOOLFILE *)); -int memp_open __P((const char *, u_int32_t, int, DB_ENV *, DB_MPOOL **)); -int memp_register __P((DB_MPOOL *, int, - int (*)(db_pgno_t, void *, DBT *), - int (*)(db_pgno_t, void *, DBT *))); -int memp_stat __P((DB_MPOOL *, - DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, void *(*)(size_t))); -int memp_sync __P((DB_MPOOL *, DB_LSN *)); -int memp_trickle __P((DB_MPOOL *, int, int *)); -int memp_unlink __P((const char *, int, DB_ENV *)); -#if defined(__cplusplus) -} -#endif - -/******************************************************* - * Transactions. - *******************************************************/ -#define DB_TXNVERSION 1 -#define DB_TXNMAGIC 0x041593 - -/* Operations values to the tx_recover() function. */ -#define DB_TXN_BACKWARD_ROLL 1 /* Read the log backwards. */ -#define DB_TXN_FORWARD_ROLL 2 /* Read the log forwards. */ -#define DB_TXN_OPENFILES 3 /* Read for open files. */ -#define DB_TXN_REDO 4 /* Redo the operation. */ -#define DB_TXN_UNDO 5 /* Undo the operation. */ - -/* Internal transaction status values. */ - -/* Transaction statistics structure. 
*/ -struct __db_txn_active { - u_int32_t txnid; /* Transaction ID */ - DB_LSN lsn; /* Lsn of the begin record */ -}; - -struct __db_txn_stat { - DB_LSN st_last_ckp; /* lsn of the last checkpoint */ - DB_LSN st_pending_ckp; /* last checkpoint did not finish */ - time_t st_time_ckp; /* time of last checkpoint */ - u_int32_t st_last_txnid; /* last transaction id given out */ - u_int32_t st_maxtxns; /* maximum number of active txns */ - u_int32_t st_naborts; /* number of aborted transactions */ - u_int32_t st_nbegins; /* number of begun transactions */ - u_int32_t st_ncommits; /* number of committed transactions */ - u_int32_t st_nactive; /* number of active transactions */ - DB_TXN_ACTIVE - *st_txnarray; /* array of active transactions */ - u_int32_t st_region_wait; /* Region lock granted after wait. */ - u_int32_t st_region_nowait; /* Region lock granted without wait. */ - u_int32_t st_refcnt; /* Region reference count. */ - u_int32_t st_regsize; /* Region size. */ -}; - -#if defined(__cplusplus) -extern "C" { -#endif -int txn_abort __P((DB_TXN *)); -int txn_begin __P((DB_TXNMGR *, DB_TXN *, DB_TXN **)); -int txn_checkpoint __P((const DB_TXNMGR *, u_int32_t, u_int32_t)); -int txn_commit __P((DB_TXN *)); -int txn_close __P((DB_TXNMGR *)); -u_int32_t txn_id __P((DB_TXN *)); -int txn_open __P((const char *, u_int32_t, int, DB_ENV *, DB_TXNMGR **)); -int txn_prepare __P((DB_TXN *)); -int txn_stat __P((DB_TXNMGR *, DB_TXN_STAT **, void *(*)(size_t))); -int txn_unlink __P((const char *, int, DB_ENV *)); -#if defined(__cplusplus) -} -#endif - -#ifndef DB_DBM_HSEARCH -#define DB_DBM_HSEARCH 0 /* No historic interfaces by default. */ -#endif -#if DB_DBM_HSEARCH != 0 -/******************************************************* - * Dbm/Ndbm historic interfaces. - *******************************************************/ -#define DBM_INSERT 0 /* Flags to dbm_store(). 
*/ -#define DBM_REPLACE 1 - -/* - * The db(3) support for ndbm(3) always appends this suffix to the - * file name to avoid overwriting the user's original database. - */ -#define DBM_SUFFIX ".db" - -#if defined(_XPG4_2) -typedef struct { - char *dptr; - size_t dsize; -} datum; -#else -typedef struct { - char *dptr; - int dsize; -} datum; -#endif - -/* - * Translate DBM calls into DB calls so that DB doesn't step on the - * application's name space. - * - * The global variables dbrdonly, dirf and pagf were not retained when - * 4BSD replaced the dbm interface with ndbm, and are not support here. - */ -#define dbminit(a) __db_dbm_init(a) -#if !defined(__cplusplus) -#define delete(a) __db_dbm_delete(a) -#endif -#define fetch(a) __db_dbm_fetch(a) -#define firstkey __db_dbm_firstkey -#define nextkey(a) __db_dbm_nextkey(a) -#define store(a, b) __db_dbm_store(a, b) - -/* Prototype the DB calls. */ -#if defined(__cplusplus) -extern "C" { -#endif -int __db_dbm_init __P((char *)); -int __db_dbm_delete __P((datum)); -int __db_dbm_dbrdonly __P((void)); -int __db_dbm_dirf __P((void)); -datum __db_dbm_fetch __P((datum)); -datum __db_dbm_firstkey __P((void)); -datum __db_dbm_nextkey __P((datum)); -int __db_dbm_pagf __P((void)); -int __db_dbm_store __P((datum, datum)); -#if defined(__cplusplus) -} -#endif - -/* - * Translate NDBM calls into DB calls so that DB doesn't step on the - * application's name space. 
- */ -#define dbm_clearerr(a) __db_ndbm_clearerr(a) -#define dbm_close(a) __db_ndbm_close(a) -#define dbm_delete(a, b) __db_ndbm_delete(a, b) -#define dbm_dirfno(a) __db_ndbm_dirfno(a) -#define dbm_error(a) __db_ndbm_error(a) -#define dbm_fetch(a, b) __db_ndbm_fetch(a, b) -#define dbm_firstkey(a) __db_ndbm_firstkey(a) -#define dbm_nextkey(a) __db_ndbm_nextkey(a) -#define dbm_open(a, b, c) __db_ndbm_open(a, b, c) -#define dbm_pagfno(a) __db_ndbm_pagfno(a) -#define dbm_rdonly(a) __db_ndbm_rdonly(a) -#define dbm_store(a, b, c, d) __db_ndbm_store(a, b, c, d) - -/* Prototype the DB calls. */ -#if defined(__cplusplus) -extern "C" { -#endif -int __db_ndbm_clearerr __P((DBM *)); -void __db_ndbm_close __P((DBM *)); -int __db_ndbm_delete __P((DBM *, datum)); -int __db_ndbm_dirfno __P((DBM *)); -int __db_ndbm_error __P((DBM *)); -datum __db_ndbm_fetch __P((DBM *, datum)); -datum __db_ndbm_firstkey __P((DBM *)); -datum __db_ndbm_nextkey __P((DBM *)); -DBM *__db_ndbm_open __P((const char *, int, int)); -int __db_ndbm_pagfno __P((DBM *)); -int __db_ndbm_rdonly __P((DBM *)); -int __db_ndbm_store __P((DBM *, datum, datum, int)); -#if defined(__cplusplus) -} -#endif - -/******************************************************* - * Hsearch historic interface. - *******************************************************/ -typedef enum { - FIND, ENTER -} ACTION; - -typedef struct entry { - char *key; - char *data; -} ENTRY; - -/* - * Translate HSEARCH calls into DB calls so that DB doesn't step on the - * application's name space. - */ -#define hcreate(a) __db_hcreate(a) -#define hdestroy __db_hdestroy -#define hsearch(a, b) __db_hsearch(a, b) - -/* Prototype the DB calls. */ -#if defined(__cplusplus) -extern "C" { -#endif -int __db_hcreate __P((size_t)); -void __db_hdestroy __P((void)); -ENTRY *__db_hsearch __P((ENTRY, ACTION)); -#if defined(__cplusplus) -} -#endif -#endif /* DB_DBM_HSEARCH */ - -/* - * XXX - * MacOS: Reset Metrowerks C enum sizes. 
- */ -#ifdef __MWERKS__ -#pragma enumsalwaysint reset -#endif -#endif /* !_DB_H_ */ diff --git a/db2/include/db_am.h b/db2/include/db_am.h index 0c189244a2..fe2176d772 100644 --- a/db2/include/db_am.h +++ b/db2/include/db_am.h @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)db_am.h 10.9 (Sleepycat) 4/10/98 + * @(#)db_am.h 10.15 (Sleepycat) 11/22/98 */ #ifndef _DB_AM_H #define _DB_AM_H @@ -16,6 +16,8 @@ #define DB_REM_BIG 0x40 #define DB_SPLITOLD 0x50 #define DB_SPLITNEW 0x60 +#define DB_ADD_PAGE 0x70 +#define DB_REM_PAGE 0x80 /* * Standard initialization and shutdown macros for all recovery functions. @@ -27,34 +29,31 @@ * int ret; */ #define REC_INTRO(func) { \ - file_dbp = mdbp = NULL; \ + file_dbp = NULL; \ + dbc = NULL; \ if ((ret = func(dbtp->data, &argp)) != 0) \ goto out; \ - if ((ret = __db_fileid_to_db(logp, &mdbp, argp->fileid)) != 0) {\ - if (ret == DB_DELETED) \ + if ((ret = \ + __db_fileid_to_db(logp, &file_dbp, argp->fileid)) != 0) { \ + if (ret == DB_DELETED) { \ ret = 0; \ + goto done; \ + } \ goto out; \ } \ - if (mdbp == NULL) \ + if (file_dbp == NULL) \ goto out; \ - if (F_ISSET(mdbp, DB_AM_THREAD)) { \ - if ((ret = __db_gethandle(mdbp, \ - mdbp->type == DB_HASH ? 
__ham_hdup : __bam_bdup, \ - &file_dbp)) != 0) \ - goto out; \ - } else \ - file_dbp = mdbp; \ - F_SET(file_dbp, DB_AM_RECOVER); \ + if ((ret = file_dbp->cursor(file_dbp, NULL, &dbc, 0)) != 0) \ + goto out; \ + F_SET(dbc, DBC_RECOVER); \ mpf = file_dbp->mpf; \ } + #define REC_CLOSE { \ if (argp != NULL) \ - __db_free(argp); \ - if (file_dbp != NULL) { \ - F_CLR(file_dbp, DB_AM_RECOVER); \ - if (F_ISSET(file_dbp, DB_AM_THREAD)) \ - __db_puthandle(file_dbp); \ - } \ + __os_free(argp, sizeof(*argp)); \ + if (dbc != NULL) \ + dbc->c_close(dbc); \ return (ret); \ } @@ -67,7 +66,7 @@ } #define REC_NOOP_CLOSE { \ if (argp != NULL) \ - __db_free(argp); \ + __os_free(argp, sizeof(*argp)); \ return (ret); \ } diff --git a/db2/include/db_auto.h b/db2/include/db_auto.h index 1b07c748e8..0d1e43a26a 100644 --- a/db2/include/db_auto.h +++ b/db2/include/db_auto.h @@ -70,6 +70,7 @@ typedef struct _db_relink_args { u_int32_t type; DB_TXN *txnid; DB_LSN prev_lsn; + u_int32_t opcode; u_int32_t fileid; db_pgno_t pgno; DB_LSN lsn; @@ -107,16 +108,4 @@ typedef struct _db_debug_args { u_int32_t arg_flags; } __db_debug_args; - -#define DB_db_noop (DB_db_BEGIN + 8) - -typedef struct _db_noop_args { - u_int32_t type; - DB_TXN *txnid; - DB_LSN prev_lsn; - u_int32_t fileid; - db_pgno_t pgno; - DB_LSN prevlsn; -} __db_noop_args; - #endif diff --git a/db2/include/db_cxx.h b/db2/include/db_cxx.h index fc04d5d66b..f415d594b5 100644 --- a/db2/include/db_cxx.h +++ b/db2/include/db_cxx.h @@ -4,7 +4,7 @@ * Copyright (c) 1997, 1998 * Sleepycat Software. All rights reserved. 
* - * @(#)db_cxx.h 10.17 (Sleepycat) 5/2/98 + * @(#)db_cxx.h 10.30 (Sleepycat) 11/22/98 */ #ifndef _DB_CXX_H_ @@ -49,7 +49,8 @@ // Forward declarations // -#include "db.h" +#include <iostream.h> +#include <db.h> class Db; // forward class Dbc; // forward @@ -66,6 +67,19 @@ class Dbt; // forward class DbTxn; // forward class DbTxnMgr; // forward +// These classes are not defined here and should be invisible +// to the user, but some compilers require forward references. +// There is one for each use of the DEFINE_DB_CLASS macro. + +class DbLockTabImp; +class DbLogImp; +class DbMpoolImp; +class DbMpoolFileImp; +class DbImp; +class DbTxnImp; +class DbTxnMgrImp; + + //////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////// // @@ -175,15 +189,11 @@ private: class _exported DbLock { - friend DbLockTab; + friend class DbLockTab; public: - DbLock(u_int); DbLock(); - u_int get_lock_id(); - void set_lock_id(u_int); - int put(DbLockTab *locktab); DbLock(const DbLock &); @@ -194,18 +204,21 @@ protected: // since its contained class is not allocated by db. 
// (see comment at top) + DbLock(DB_LOCK); DB_LOCK lock_; }; class _exported DbLockTab { -friend DbEnv; + friend class DbEnv; + public: int close(); int detect(u_int32_t flags, int atype); int get(u_int32_t locker, u_int32_t flags, const Dbt *obj, db_lockmode_t lock_mode, DbLock *lock); int id(u_int32_t *idp); + int stat(DB_LOCK_STAT **statp, void *(*db_malloc)(size_t)); int vec(u_int32_t locker, u_int32_t flags, DB_LOCKREQ list[], int nlist, DB_LOCKREQ **elistp); @@ -244,13 +257,14 @@ private: class _exported DbLsn : protected DB_LSN { - friend DbLog; // friendship needed to cast to base class - friend DbMpool; + friend class DbLog; // friendship needed to cast to base class + friend class DbMpool; }; class _exported DbLog { -friend DbEnv; + friend class DbEnv; + public: int archive(char **list[], u_int32_t flags, void *(*db_malloc)(size_t)); int close(); @@ -300,7 +314,8 @@ private: class _exported DbMpoolFile { -friend DbEnv; + friend class DbEnv; + public: int close(); int get(db_pgno_t *pgnoaddr, u_int32_t flags, void *pagep); @@ -337,7 +352,8 @@ private: class _exported DbMpool { -friend DbEnv; + friend class DbEnv; + public: int close(); @@ -388,7 +404,8 @@ private: class _exported DbTxnMgr { -friend DbEnv; + friend class DbEnv; + public: int begin(DbTxn *pid, DbTxn **tid); int checkpoint(u_int32_t kbyte, u_int32_t min) const; @@ -422,7 +439,8 @@ private: class _exported DbTxn { -friend DbTxnMgr; + friend class DbTxnMgr; + public: int abort(); int commit(); @@ -461,90 +479,78 @@ private: // class _exported DbInfo : protected DB_INFO { - friend DbEnv; - friend Db; + friend class DbEnv; + friend class Db; public: DbInfo(); ~DbInfo(); // Byte order. - int get_lorder() const; void set_lorder(int); // Underlying cache size. - size_t get_cachesize() const; void set_cachesize(size_t); // Underlying page size. - size_t get_pagesize() const; void set_pagesize(size_t); // Local heap allocation. 
typedef void *(*db_malloc_fcn)(size_t); - db_malloc_fcn get_malloc() const; void set_malloc(db_malloc_fcn); + // Duplicate compare function. + typedef int (*dup_compare_fcn)(const DBT *, const DBT *); + void set_dup_compare(dup_compare_fcn); + //////////////////////////////////////////////////////////////// // Btree access method. // Maximum keys per page. - int get_bt_maxkey() const; void set_bt_maxkey(int); // Minimum keys per page. - int get_bt_minkey() const; void set_bt_minkey(int); // Comparison function. typedef int (*bt_compare_fcn)(const DBT *, const DBT *); - bt_compare_fcn get_bt_compare() const; void set_bt_compare(bt_compare_fcn); // Prefix function. typedef size_t (*bt_prefix_fcn)(const DBT *, const DBT *); - bt_prefix_fcn get_bt_prefix() const; void set_bt_prefix(bt_prefix_fcn); //////////////////////////////////////////////////////////////// // Hash access method. // Fill factor. - u_int32_t get_h_ffactor() const; void set_h_ffactor(u_int32_t); // Number of elements. - u_int32_t get_h_nelem() const; void set_h_nelem(u_int32_t); // Hash function. typedef u_int32_t (*h_hash_fcn)(const void *, u_int32_t); - h_hash_fcn get_h_hash() const; void set_h_hash(h_hash_fcn); //////////////////////////////////////////////////////////////// // Recno access method. // Fixed-length padding byte. - int get_re_pad() const; void set_re_pad(int); // Variable-length delimiting byte. - int get_re_delim() const; void set_re_delim(int); // Length for fixed-length records. - u_int32_t get_re_len() const; void set_re_len(u_int32_t); // Source file name. - char *get_re_source() const; void set_re_source(char *); // Note: some flags are set as side effects of calling // above "set" methods. 
// - u_int32_t get_flags() const; void set_flags(u_int32_t); @@ -570,11 +576,11 @@ private: // class _exported DbEnv : protected DB_ENV { -friend DbTxnMgr; -friend DbLog; -friend DbLockTab; -friend DbMpool; -friend Db; + friend class DbTxnMgr; + friend class DbLog; + friend class DbLockTab; + friend class DbMpool; + friend class Db; public: @@ -603,6 +609,10 @@ public: // int appexit(); + // Version information. A static method so it can be obtained anytime. + // + static char *version(int *major, int *minor, int *patch); + //////////////////////////////////////////////////////////////// // simple get/set access methods // @@ -610,74 +620,41 @@ public: // use the default constructor along with appinit(). // Byte order. - int get_lorder() const; void set_lorder(int); + // Panic callback. + typedef void (*db_paniccall_fcn)(DbEnv *, int); + void set_paniccall(db_paniccall_fcn); + // Error message callback. typedef void (*db_errcall_fcn)(const char *, char *); - db_errcall_fcn get_errcall() const; void set_errcall(db_errcall_fcn); // Error message file stream. - FILE *get_errfile() const; void set_errfile(FILE *); // Error message prefix. - const char *get_errpfx() const; void set_errpfx(const char *); // Generate debugging messages. - int get_verbose() const; void set_verbose(int); //////////////////////////////////////////////////////////////// - // User paths. - - // Database home. - char *get_home() const; - void set_home(char *); - - // Database log file directory. - char *get_log_dir() const; - void set_log_dir(char *); - - // Database tmp file directory. - char *get_tmp_dir() const; - void set_tmp_dir(char *); - - // Database data file directories. - char **get_data_dir() const; - void set_data_dir(char **); - - // Database data file slots. - int get_data_cnt() const; - void set_data_cnt(int); - - // Next Database data file slot. 
- int get_data_next() const; - void set_data_next(int); - - - //////////////////////////////////////////////////////////////// // Locking. // Return from lock_open(). DbLockTab *get_lk_info() const; // Two dimensional conflict matrix. - u_int8_t *get_lk_conflicts() const; void set_lk_conflicts(u_int8_t *); // Number of lock modes in table. - int get_lk_modes() const; void set_lk_modes(int); // Maximum number of locks. - u_int32_t get_lk_max() const; void set_lk_max(u_int32_t); // Deadlock detect on every conflict. - u_int32_t get_lk_detect() const; void set_lk_detect(u_int32_t); @@ -688,7 +665,6 @@ public: DbLog *get_lg_info() const; // Maximum file size. - u_int32_t get_lg_max() const; void set_lg_max(u_int32_t); @@ -699,11 +675,9 @@ public: DbMpool *get_mp_info() const; // Maximum file size for mmap. - size_t get_mp_mmapsize() const; void set_mp_mmapsize(size_t); // Bytes in the mpool cache. - size_t get_mp_size() const; void set_mp_size(size_t); @@ -714,16 +688,13 @@ public: DbTxnMgr *get_tx_info() const; // Maximum number of transactions. - u_int32_t get_tx_max() const; void set_tx_max(u_int32_t); // Dispatch function for recovery. typedef int (*tx_recover_fcn)(DB_LOG *, DBT *, DB_LSN *, int, void *); - tx_recover_fcn get_tx_recover() const; void set_tx_recover(tx_recover_fcn); // Flags. - u_int32_t get_flags() const; void set_flags(u_int32_t); //////////////////////////////////////////////////////////////// @@ -736,7 +707,6 @@ public: // enum ErrorModel { Exception, ErrorReturn }; void set_error_model(ErrorModel); - ErrorModel get_error_model() const; // If an error is detected and the error call function // or stream is set, a message is dispatched or printed. @@ -747,11 +717,11 @@ public: // call set_error_stream() to force all errors to a C++ stream. // It is unwise to mix these approaches. 
// - class ostream* get_error_stream() const; void set_error_stream(class ostream*); // used internally - static int runtime_error(const char *caller, int err, int in_destructor = 0); + static int runtime_error(const char *caller, int err, + int in_destructor = 0, int force_throw = 0); private: // We can add data to this class if needed @@ -778,23 +748,27 @@ private: // class _exported Db { - friend DbEnv; + friend class DbEnv; public: int close(u_int32_t flags); - int cursor(DbTxn *txnid, Dbc **cursorp); + int cursor(DbTxn *txnid, Dbc **cursorp, u_int32_t flags); int del(DbTxn *txnid, Dbt *key, u_int32_t flags); int fd(int *fdp); int get(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags); + int join(Dbc **curslist, u_int32_t flags, Dbc **dbcp); int put(DbTxn *txnid, Dbt *key, Dbt *data, u_int32_t flags); int stat(void *sp, void *(*db_malloc)(size_t), u_int32_t flags); int sync(u_int32_t flags); + int get_byteswapped() const; DBTYPE get_type() const; static int open(const char *fname, DBTYPE type, u_int32_t flags, int mode, DbEnv *dbenv, DbInfo *info, Db **dbpp); + static int xa_open(const char *fname, DBTYPE type, u_int32_t flags, + int mode, DbInfo *info, Db **dbpp); private: // We can add data to this class if needed // since it is implemented via a pointer. @@ -817,11 +791,11 @@ private: // class _exported Dbt : private DBT { - friend Dbc; - friend Db; - friend DbLog; - friend DbMpoolFile; - friend DbLockTab; + friend class Dbc; + friend class Db; + friend class DbLog; + friend class DbMpoolFile; + friend class DbLockTab; public: @@ -863,7 +837,7 @@ private: class _exported Dbc : protected DBC { - friend Db; + friend class Db; public: int close(); diff --git a/db2/include/db_ext.h b/db2/include/db_ext.h index 8a03db9f64..1ad1643bfa 100644 --- a/db2/include/db_ext.h +++ b/db2/include/db_ext.h @@ -1,8 +1,11 @@ /* DO NOT EDIT: automatically built by dist/distrib. 
*/ #ifndef _db_ext_h_ #define _db_ext_h_ -int __db_pgerr __P((DB *, db_pgno_t)); -int __db_pgfmt __P((DB *, db_pgno_t)); +int __db_close __P((DB *, u_int32_t)); +int __db_init_wrapper __P((DB *)); +int __db_cprint __P((DB *)); +int __db_c_destroy __P((DBC *)); +int __db_sync __P((DB *, u_int32_t)); int __db_addrem_log __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t, @@ -33,8 +36,8 @@ int __db_ovref_print int __db_ovref_read __P((void *, __db_ovref_args **)); int __db_relink_log __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, - u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, - DB_LSN *, db_pgno_t, DB_LSN *)); + u_int32_t, u_int32_t, db_pgno_t, DB_LSN *, + db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *)); int __db_relink_print __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __db_relink_read __P((void *, __db_relink_args **)); @@ -52,12 +55,6 @@ int __db_debug_log int __db_debug_print __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __db_debug_read __P((void *, __db_debug_args **)); -int __db_noop_log - __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, - u_int32_t, db_pgno_t, DB_LSN *)); -int __db_noop_print - __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); -int __db_noop_read __P((void *, __db_noop_args **)); int __db_init_print __P((DB_ENV *)); int __db_init_recover __P((DB_ENV *)); int __db_pgin __P((db_pgno_t, size_t, void *)); @@ -71,23 +68,40 @@ int __db_txnlist_find __P((void *, u_int32_t)); void __db_txnlist_end __P((void *)); void __db_txnlist_gen __P((void *, int)); void __db_txnlist_print __P((void *)); -int __db_dput __P((DB *, - DBT *, PAGE **, db_indx_t *, int (*)(DB *, u_int32_t, PAGE **))); -int __db_drem __P((DB *, - PAGE **, u_int32_t, int (*)(DB *, PAGE *))); -int __db_dend __P((DB *, db_pgno_t, PAGE **)); - int __db_ditem __P((DB *, PAGE *, u_int32_t, u_int32_t)); +int __db_dput __P((DBC *, DBT *, + PAGE **, db_indx_t *, int (*)(DBC *, u_int32_t, PAGE **))); +int __db_drem __P((DBC *, + PAGE **, u_int32_t, int (*)(DBC *, 
PAGE *))); +int __db_dend __P((DBC *, db_pgno_t, PAGE **)); + int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t)); int __db_pitem - __P((DB *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); -int __db_relink __P((DB *, PAGE *, PAGE **, int)); -int __db_ddup __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); + __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); +int __db_relink __P((DBC *, u_int32_t, PAGE *, PAGE **, int)); +int __db_ddup __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *))); +int __db_dsearch __P((DBC *, + int, DBT *, db_pgno_t, db_indx_t *, PAGE **, int *)); +int __db_cdelchk __P((const DB *, u_int32_t, int, int)); +int __db_cgetchk __P((const DB *, DBT *, DBT *, u_int32_t, int)); +int __db_cputchk __P((const DB *, + const DBT *, DBT *, u_int32_t, int, int)); +int __db_closechk __P((const DB *, u_int32_t)); +int __db_delchk __P((const DB *, DBT *, u_int32_t, int)); +int __db_getchk __P((const DB *, const DBT *, DBT *, u_int32_t)); +int __db_joinchk __P((const DB *, u_int32_t)); +int __db_putchk + __P((const DB *, DBT *, const DBT *, u_int32_t, int, int)); +int __db_statchk __P((const DB *, u_int32_t)); +int __db_syncchk __P((const DB *, u_int32_t)); +int __db_eopnotsup __P((const DB_ENV *)); +int __db_join __P((DB *, DBC **, u_int32_t, DBC **)); int __db_goff __P((DB *, DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); -int __db_poff __P((DB *, const DBT *, db_pgno_t *, - int (*)(DB *, u_int32_t, PAGE **))); -int __db_ovref __P((DB *, db_pgno_t, int32_t)); -int __db_doff __P((DB *, db_pgno_t, int (*)(DB *, PAGE *))); -int __db_moff __P((DB *, const DBT *, db_pgno_t)); +int __db_poff __P((DBC *, const DBT *, db_pgno_t *, + int (*)(DBC *, u_int32_t, PAGE **))); +int __db_ovref __P((DBC *, db_pgno_t, int32_t)); +int __db_doff __P((DBC *, db_pgno_t, int (*)(DBC *, PAGE *))); +int __db_moff __P((DB *, const DBT *, db_pgno_t, u_int32_t, + int (*)(const DBT *, const DBT *), int *)); void __db_loadme __P((void)); FILE *__db_prinit __P((FILE *)); int 
__db_dump __P((DB *, char *, int)); @@ -111,11 +125,8 @@ int __db_relink_recover int __db_addpage_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __db_debug_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); -int __db_noop_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __db_ret __P((DB *, PAGE *, u_int32_t, DBT *, void **, u_int32_t *)); int __db_retcopy __P((DBT *, void *, u_int32_t, void **, u_int32_t *, void *(*)(size_t))); -int __db_gethandle __P((DB *, int (*)(DB *, DB *), DB **)); -int __db_puthandle __P((DB *)); #endif /* _db_ext_h_ */ diff --git a/db2/include/db_int.h.src b/db2/include/db_int.h.src deleted file mode 100644 index d67e2c428c..0000000000 --- a/db2/include/db_int.h.src +++ /dev/null @@ -1,402 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998 - * Sleepycat Software. All rights reserved. - * - * @(#)db_int.h.src 10.62 (Sleepycat) 5/23/98 - */ - -#ifndef _DB_INTERNAL_H_ -#define _DB_INTERNAL_H_ - -#include "db.h" /* Standard DB include file. */ -#include "queue.h" - -/******************************************************* - * General purpose constants and macros. - *******************************************************/ -#define UINT16_T_MAX 0xffff /* Maximum 16 bit unsigned. */ -#define UINT32_T_MAX 0xffffffff /* Maximum 32 bit unsigned. */ - -#define DB_MIN_PGSIZE 0x000200 /* Minimum page size. */ -#define DB_MAX_PGSIZE 0x010000 /* Maximum page size. */ - -#define DB_MINCACHE 10 /* Minimum cached pages */ - -#define MEGABYTE 1048576 - -/* - * If we are unable to determine the underlying filesystem block size, use - * 8K on the grounds that most OS's use less than 8K as their VM page size. - */ -#define DB_DEF_IOSIZE (8 * 1024) - -/* - * Aligning items to particular sizes or in pages or memory. ALIGNP is a - * separate macro, as we've had to cast the pointer to different integral - * types on different architectures. 
- * - * We cast pointers into unsigned longs when manipulating them because C89 - * guarantees that u_long is the largest available integral type and further, - * to never generate overflows. However, neither C89 or C9X requires that - * any integer type be large enough to hold a pointer, although C9X created - * the intptr_t type, which is guaranteed to hold a pointer but may or may - * not exist. At some point in the future, we should test for intptr_t and - * use it where available. - */ -#undef ALIGNTYPE -#define ALIGNTYPE u_long -#undef ALIGNP -#define ALIGNP(value, bound) ALIGN((ALIGNTYPE)value, bound) -#undef ALIGN -#define ALIGN(value, bound) (((value) + (bound) - 1) & ~((bound) - 1)) - -/* - * There are several on-page structures that are declared to have a number of - * fields followed by a variable length array of items. The structure size - * without including the variable length array or the address of the first of - * those elements can be found using SSZ. - * - * This macro can also be used to find the offset of a structure element in a - * structure. This is used in various places to copy structure elements from - * unaligned memory references, e.g., pointers into a packed page. - * - * There are two versions because compilers object if you take the address of - * an array. - */ -#undef SSZ -#define SSZ(name, field) ((int)&(((name *)0)->field)) - -#undef SSZA -#define SSZA(name, field) ((int)&(((name *)0)->field[0])) - -/* Macros to return per-process address, offsets based on shared regions. */ -#define R_ADDR(base, offset) ((void *)((u_int8_t *)((base)->addr) + offset)) -#define R_OFFSET(base, p) ((u_int8_t *)(p) - (u_int8_t *)(base)->addr) - -/* Free and free-string macros that overwrite memory. 
*/ -#ifdef DIAGNOSTIC -#undef FREE -#define FREE(p, len) { \ - memset(p, 0xff, len); \ - __db_free(p); \ -} -#undef FREES -#define FREES(p) { \ - FREE(p, strlen(p)); \ -} -#else -#undef FREE -#define FREE(p, len) { \ - __db_free(p); \ -} -#undef FREES -#define FREES(p) { \ - __db_free(p); \ -} -#endif - -/* Structure used to print flag values. */ -typedef struct __fn { - u_int32_t mask; /* Flag value. */ - const char *name; /* Flag name. */ -} FN; - -/* Set, clear and test flags. */ -#define F_SET(p, f) (p)->flags |= (f) -#define F_CLR(p, f) (p)->flags &= ~(f) -#define F_ISSET(p, f) ((p)->flags & (f)) -#define LF_SET(f) (flags |= (f)) -#define LF_CLR(f) (flags &= ~(f)) -#define LF_ISSET(f) (flags & (f)) - -/* Display separator string. */ -#undef DB_LINE -#define DB_LINE "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=" - -/* Global variables. */ -typedef struct __db_globals { - int db_mutexlocks; /* DB_MUTEXLOCKS */ - int db_region_anon; /* DB_REGION_ANON, DB_REGION_NAME */ - int db_region_init; /* DB_REGION_INIT */ - int db_tsl_spins; /* DB_TSL_SPINS */ - int db_pageyield; /* DB_PAGEYIELD */ -} DB_GLOBALS; -extern DB_GLOBALS __db_global_values; -#define DB_GLOBAL(v) __db_global_values.v - -/* Unused, or not-used-yet variable. "Shut that bloody compiler up!" */ -#define COMPQUIET(n, v) (n) = (v) - -/* - * Win16 needs specific syntax on callback functions. Nobody else cares. - */ -#ifndef DB_CALLBACK -#define DB_CALLBACK /* Nothing. */ -#endif - -/******************************************************* - * Files. - *******************************************************/ - /* - * We use 1024 as the maximum path length. It's too hard to figure out what - * the real path length is, as it was traditionally stored in <sys/param.h>, - * and that file isn't always available. - */ -#undef MAXPATHLEN -#define MAXPATHLEN 1024 - -#define PATH_DOT "." /* Current working directory. */ -#define PATH_SEPARATOR "/" /* Path separator character. 
*/ - -/******************************************************* - * Mutex support. - *******************************************************/ -@spin_line1@ -@spin_line2@ -@spin_line3@ - -/* - * !!! - * Various systems require different alignments for mutexes (the worst we've - * seen so far is 16-bytes on some HP architectures). The mutex (tsl_t) must - * be first in the db_mutex_t structure, which must itself be first in the - * region. This ensures the alignment is as returned by mmap(2), which should - * be sufficient. All other mutex users must ensure proper alignment locally. - */ -#define MUTEX_ALIGNMENT @mutex_align@ - -/* - * The offset of a mutex in memory. - * - * !!! - * Not an off_t, so backing file offsets MUST be less than 4Gb. See the - * off field of the db_mutex_t as well. - */ -#define MUTEX_LOCK_OFFSET(a, b) ((u_int32_t)((u_int8_t *)b - (u_int8_t *)a)) - -typedef struct _db_mutex_t { -#ifdef HAVE_SPINLOCKS - tsl_t tsl_resource; /* Resource test and set. */ -#ifdef DIAGNOSTIC - u_int32_t pid; /* Lock holder: 0 or process pid. */ -#endif -#else - u_int32_t off; /* Backing file offset. */ - u_int32_t pid; /* Lock holder: 0 or process pid. */ -#endif - u_int32_t spins; /* Spins before block. */ - u_int32_t mutex_set_wait; /* Granted after wait. */ - u_int32_t mutex_set_nowait; /* Granted without waiting. */ -} db_mutex_t; - -#include "mutex_ext.h" - -/******************************************************* - * Access methods. - *******************************************************/ -/* Lock/unlock a DB thread. */ -#define DB_THREAD_LOCK(dbp) \ - if (F_ISSET(dbp, DB_AM_THREAD)) \ - (void)__db_mutex_lock((db_mutex_t *)(dbp)->mutexp, -1); -#define DB_THREAD_UNLOCK(dbp) \ - if (F_ISSET(dbp, DB_AM_THREAD)) \ - (void)__db_mutex_unlock((db_mutex_t *)(dbp)->mutexp, -1); - -/* Btree/recno local statistics structure. */ -struct __db_bt_lstat; typedef struct __db_bt_lstat DB_BTREE_LSTAT; -struct __db_bt_lstat { - u_int32_t bt_freed; /* Pages freed for reuse. 
*/ - u_int32_t bt_pfxsaved; /* Bytes saved by prefix compression. */ - u_int32_t bt_split; /* Total number of splits. */ - u_int32_t bt_rootsplit; /* Root page splits. */ - u_int32_t bt_fastsplit; /* Fast splits. */ - u_int32_t bt_added; /* Items added. */ - u_int32_t bt_deleted; /* Items deleted. */ - u_int32_t bt_get; /* Items retrieved. */ - u_int32_t bt_cache_hit; /* Hits in fast-insert code. */ - u_int32_t bt_cache_miss; /* Misses in fast-insert code. */ -}; - -/******************************************************* - * Environment. - *******************************************************/ -/* Type passed to __db_appname(). */ -typedef enum { - DB_APP_NONE=0, /* No type (region). */ - DB_APP_DATA, /* Data file. */ - DB_APP_LOG, /* Log file. */ - DB_APP_TMP /* Temporary file. */ -} APPNAME; - -/******************************************************* - * Shared memory regions. - *******************************************************/ -/* - * The shared memory regions share an initial structure so that the general - * region code can handle races between the region being deleted and other - * processes waiting on the region mutex. - * - * !!! - * Note, the mutex must be the first entry in the region; see comment above. - */ -typedef struct _rlayout { - db_mutex_t lock; /* Region mutex. */ -#define DB_REGIONMAGIC 0x120897 - u_int32_t valid; /* Valid magic number. */ - u_int32_t refcnt; /* Region reference count. */ - size_t size; /* Region length. */ - int majver; /* Major version number. */ - int minver; /* Minor version number. */ - int patch; /* Patch version number. */ -#define INVALID_SEGID -1 - int segid; /* shmget(2) ID, or Win16 segment ID. */ - -#define REGION_ANONYMOUS 0x01 /* Region is/should be in anon mem. */ - u_int32_t flags; -} RLAYOUT; - -/* - * DB creates all regions on 4K boundaries out of sheer paranoia, so that - * we don't make the underlying VM unhappy. 
- */ -#define DB_VMPAGESIZE (4 * 1024) -#define DB_ROUNDOFF(i) { \ - (i) += DB_VMPAGESIZE - 1; \ - (i) -= (i) % DB_VMPAGESIZE; \ -} - -/* - * The interface to region attach is nasty, there is a lot of complex stuff - * going on, which has to be retained between create/attach and detach. The - * REGINFO structure keeps track of it. - */ -struct __db_reginfo; typedef struct __db_reginfo REGINFO; -struct __db_reginfo { - /* Arguments. */ - DB_ENV *dbenv; /* Region naming info. */ - APPNAME appname; /* Region naming info. */ - char *path; /* Region naming info. */ - const char *file; /* Region naming info. */ - int mode; /* Region mode, if a file. */ - size_t size; /* Region size. */ - u_int32_t dbflags; /* Region file open flags, if a file. */ - - /* Results. */ - char *name; /* Region name. */ - void *addr; /* Region address. */ - int fd; /* Fcntl(2) locking file descriptor. - NB: this is only valid if a regular - file is backing the shared region, - and mmap(2) is being used to map it - into our address space. */ - int segid; /* shmget(2) ID, or Win16 segment ID. */ - - /* Shared flags. */ -/* 0x0001 COMMON MASK with RLAYOUT structure. */ -#define REGION_CANGROW 0x0002 /* Can grow. */ -#define REGION_CREATED 0x0004 /* Created. */ -#define REGION_HOLDINGSYS 0x0008 /* Holding system resources. */ -#define REGION_LASTDETACH 0x0010 /* Delete on last detach. */ -#define REGION_MALLOC 0x0020 /* Created in malloc'd memory. */ -#define REGION_PRIVATE 0x0040 /* Private to thread/process. */ -#define REGION_REMOVED 0x0080 /* Already deleted. */ -#define REGION_SIZEDEF 0x0100 /* Use default region size if exists. */ - u_int32_t flags; -}; - -/******************************************************* - * Mpool. - *******************************************************/ -/* - * File types for DB access methods. Negative numbers are reserved to DB. - */ -#define DB_FTYPE_BTREE -1 /* Btree. */ -#define DB_FTYPE_HASH -2 /* Hash. */ - -/* Structure used as the DB pgin/pgout pgcookie. 
*/ -typedef struct __dbpginfo { - size_t db_pagesize; /* Underlying page size. */ - int needswap; /* If swapping required. */ -} DB_PGINFO; - -/******************************************************* - * Log. - *******************************************************/ -/* Initialize an LSN to 'zero'. */ -#define ZERO_LSN(LSN) { \ - (LSN).file = 0; \ - (LSN).offset = 0; \ -} - -/* Return 1 if LSN is a 'zero' lsn, otherwise return 0. */ -#define IS_ZERO_LSN(LSN) ((LSN).file == 0) - -/* Test if we need to log a change. */ -#define DB_LOGGING(dbp) \ - (F_ISSET(dbp, DB_AM_LOGGING) && !F_ISSET(dbp, DB_AM_RECOVER)) - -#ifdef DIAGNOSTIC -/* - * Debugging macro to log operations. - * If DEBUG_WOP is defined, log operations that modify the database. - * If DEBUG_ROP is defined, log operations that read the database. - * - * D dbp - * T txn - * O operation (string) - * K key - * A data - * F flags - */ -#define LOG_OP(D, T, O, K, A, F) { \ - DB_LSN _lsn; \ - DBT _op; \ - if (DB_LOGGING((D))) { \ - memset(&_op, 0, sizeof(_op)); \ - _op.data = O; \ - _op.size = strlen(O) + 1; \ - (void)__db_debug_log((D)->dbenv->lg_info, \ - T, &_lsn, 0, &_op, (D)->log_fileid, K, A, F); \ - } \ -} -#ifdef DEBUG_ROP -#define DEBUG_LREAD(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) -#else -#define DEBUG_LREAD(D, T, O, K, A, F) -#endif -#ifdef DEBUG_WOP -#define DEBUG_LWRITE(D, T, O, K, A, F) LOG_OP(D, T, O, K, A, F) -#else -#define DEBUG_LWRITE(D, T, O, K, A, F) -#endif -#else -#define DEBUG_LREAD(D, T, O, K, A, F) -#define DEBUG_LWRITE(D, T, O, K, A, F) -#endif /* DIAGNOSTIC */ - -/******************************************************* - * Transactions and recovery. - *******************************************************/ -/* - * Out of band value for a lock. The locks are returned to callers as offsets - * into the lock regions. Since the RLAYOUT structure begins all regions, an - * offset of 0 is guaranteed not to be a valid lock. 
- */ -#define LOCK_INVALID 0 - -/* The structure allocated for every transaction. */ -struct __db_txn { - DB_TXNMGR *mgrp; /* Pointer to transaction manager. */ - DB_TXN *parent; /* Pointer to transaction's parent. */ - DB_LSN last_lsn; /* Lsn of last log write. */ - u_int32_t txnid; /* Unique transaction id. */ - size_t off; /* Detail structure within region. */ - TAILQ_ENTRY(__db_txn) links; -}; - -#include "os_func.h" -#include "os_ext.h" - -#endif /* !_DB_INTERNAL_H_ */ diff --git a/db2/include/db_join.h b/db2/include/db_join.h new file mode 100644 index 0000000000..cb27e21f68 --- /dev/null +++ b/db2/include/db_join.h @@ -0,0 +1,23 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. All rights reserved. + * + * @(#)db_join.h 10.2 (Sleepycat) 10/4/98 + */ + +#ifndef _DB_JOIN_H +#define _DB_JOIN_H +/* + * Joins use a join cursor that is similar to a regular DB cursor except + * that it only supports c_get and c_close functionality. Also, it does + * not support the full range of flags for get. + */ +typedef struct __join_cursor { + u_int32_t j_init; /* Set when cursor is initialized. */ + DBC **j_curslist; /* Array of cursors in the join. */ + DB *j_primary; /* Primary dbp. */ + DBT j_key; /* Used to do lookups. */ +} JOIN_CURSOR; +#endif diff --git a/db2/include/db_page.h b/db2/include/db_page.h index e1846cbbbd..5c9ca674f1 100644 --- a/db2/include/db_page.h +++ b/db2/include/db_page.h @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)db_page.h 10.15 (Sleepycat) 5/1/98 + * @(#)db_page.h 10.18 (Sleepycat) 12/2/98 */ #ifndef _DB_PAGE_H_ @@ -43,14 +43,6 @@ /* * Btree metadata page layout: - * - * +-----------------------------------+ - * | lsn | pgno | magic | - * +-----------------------------------+ - * | version | pagesize | free | - * +-----------------------------------+ - * | flags | unused ... 
| - * +-----------------------------------+ */ typedef struct _btmeta { DB_LSN lsn; /* 00-07: LSN. */ @@ -72,10 +64,6 @@ typedef struct _btmeta { u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */ /* 48-67: Unique file ID. */ u_int8_t uid[DB_FILE_ID_LEN]; - - u_int32_t spare[13]; /* 68-123: Save some room for growth. */ - - DB_BTREE_LSTAT stat; /* 124-163: Statistics. */ } BTMETA; /************************************************************************ @@ -84,18 +72,6 @@ typedef struct _btmeta { /* * Hash metadata page layout: - * - * +-----------------------------------+ - * | lsn | magic | version | - * +-----------------------------------+ - * | pagesize | ovfl_point| last_freed| - * +-----------------------------------+ - * | max_bucket| high_mask | low_mask | - * +-----------------------------------+ - * | ffactor | nelem | charkey | - * +-----------------------------------+ - * | spares[32]| flags | unused | - * +-----------------------------------+ */ /* Hash Table Information */ typedef struct hashhdr { /* Disk resident portion */ @@ -359,10 +335,6 @@ typedef struct _hkeydata { /* * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure: - * - * +-----------------------------------+ - * | type | pgno_t | total len | - * +-----------------------------------+ */ typedef struct _hoffpage { u_int8_t type; /* 00: Page type and delete flag. */ @@ -383,10 +355,6 @@ typedef struct _hoffpage { /* * The fourth type is H_OFFDUP represented by the HOFFDUP structure: - * - * +-----------------------+ - * | type | pgno_t | - * +-----------------------+ */ typedef struct _hoffdup { u_int8_t type; /* 00: Page type and delete flag. 
*/ @@ -431,10 +399,6 @@ typedef struct _hoffdup { /* * The first type is B_KEYDATA, represented by the BKEYDATA structure: - * - * +-----------------------------------+ - * | length | type | key/data | - * +-----------------------------------+ */ typedef struct _bkeydata { db_indx_t len; /* 00-01: Key/data item length. */ @@ -457,13 +421,7 @@ typedef struct _bkeydata { /* * The second and third types are B_DUPLICATE and B_OVERFLOW, represented - * by the BOVERFLOW structure: - * - * +-----------------------------------+ - * | total len | type | unused | - * +-----------------------------------+ - * | nxt: page | nxt: off | nxt: len | - * +-----------------------------------+ + * by the BOVERFLOW structure. */ typedef struct _boverflow { db_indx_t unused1; /* 00-01: Padding, unused. */ @@ -501,10 +459,6 @@ typedef struct _boverflow { /* * Btree internal entry. - * - * +-----------------------------------+ - * | leaf pgno | type | data ... | - * +-----------------------------------+ */ typedef struct _binternal { db_indx_t len; /* 00-01: Key/data item length. */ @@ -535,12 +489,8 @@ typedef struct _binternal { /* * The recno internal entry. * - * +-----------------------+ - * | leaf pgno | # of recs | - * +-----------------------+ - * * XXX - * Why not fold this into the db_indx_t structure, it's fixed length. + * Why not fold this into the db_indx_t structure, it's fixed length? */ typedef struct _rinternal { db_pgno_t pgno; /* 00-03: Page number of referenced page. */ diff --git a/db2/include/hash.h b/db2/include/hash.h index e55c2102cb..5d85a2a3a7 100644 --- a/db2/include/hash.h +++ b/db2/include/hash.h @@ -43,13 +43,22 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)hash.h 10.8 (Sleepycat) 4/10/98 + * @(#)hash.h 10.14 (Sleepycat) 10/4/98 */ /* Cursor structure definitions. */ typedef struct cursor_t { - DBC *db_cursor; + DBC *dbc; + + /* Per-thread information */ + DB_LOCK hlock; /* Metadata page lock. 
*/ + HASHHDR *hdr; /* Pointer to meta-data page. */ + PAGE *split_buf; /* Temporary buffer for splits. */ + struct __db_h_stat stats; /* Hash statistics. */ + + /* Hash cursor information */ db_pgno_t bucket; /* Bucket we are traversing. */ + db_pgno_t lbucket; /* Bucket for which we are locked. */ DB_LOCK lock; /* Lock held on the current bucket. */ PAGE *pagep; /* The current page. */ db_pgno_t pgno; /* Current page number. */ @@ -62,104 +71,83 @@ typedef struct cursor_t { db_indx_t dup_tlen; /* Total length of duplicate entry. */ u_int32_t seek_size; /* Number of bytes we need for add. */ db_pgno_t seek_found_page;/* Page on which we can insert. */ - u_int32_t big_keylen; /* Length of big_key buffer. */ - void *big_key; /* Temporary buffer for big keys. */ - u_int32_t big_datalen; /* Length of big_data buffer. */ - void *big_data; /* Temporary buffer for big data. */ -#define H_OK 0x0001 -#define H_NOMORE 0x0002 -#define H_DELETED 0x0004 -#define H_ISDUP 0x0008 -#define H_EXPAND 0x0020 - u_int32_t flags; /* Is cursor inside a dup set. */ + +#define H_DELETED 0x0001 /* Cursor item is deleted. */ +#define H_DUPONLY 0x0002 /* Dups only; do not change key. */ +#define H_EXPAND 0x0004 /* Table expanded. */ +#define H_ISDUP 0x0008 /* Cursor is within duplicate set. */ +#define H_NOMORE 0x0010 /* No more entries in bucket. */ +#define H_OK 0x0020 /* Request succeeded. */ +#define H_DIRTY 0x0040 /* Meta-data page needs to be written */ +#define H_ORIGINAL 0x0080 /* Bucket lock existed on entry. */ + u_int32_t flags; } HASH_CURSOR; #define IS_VALID(C) ((C)->bucket != BUCKET_INVALID) +#define SAVE_CURSOR(ORIG, COPY) { \ + F_SET((ORIG), H_ORIGINAL); \ + *(COPY) = *(ORIG); \ +} -typedef struct htab { /* Memory resident data structure. */ - DB *dbp; /* Pointer to parent db structure. */ - DB_LOCK hlock; /* Metadata page lock. */ - HASHHDR *hdr; /* Pointer to meta-data page. 
*/ - u_int32_t (*hash) __P((const void *, u_int32_t)); /* Hash Function */ - PAGE *split_buf; /* Temporary buffer for splits. */ - int local_errno; /* Error Number -- for DBM compatability */ - u_long hash_accesses; /* Number of accesses to this table. */ - u_long hash_collisions; /* Number of collisions on search. */ - u_long hash_expansions; /* Number of times we added a bucket. */ - u_long hash_overflows; /* Number of overflow pages. */ - u_long hash_bigpages; /* Number of big key/data pages. */ -} HTAB; - -/* - * Macro used for interface functions to set the txnid in the DBP. - */ -#define SET_LOCKER(D, T) ((D)->txn = (T)) +#define RESTORE_CURSOR(D, ORIG, COPY, RET) { \ + if ((RET) == 0) { \ + if ((ORIG)->dbc->txn == NULL && \ + (COPY)->lock != 0 && (ORIG)->lock != (COPY)->lock) \ + (void)lock_put((D)->dbenv->lk_info, (COPY)->lock); \ + } else { \ + if ((ORIG)->dbc->txn == NULL && \ + (ORIG)->lock != 0 && (ORIG)->lock != (COPY)->lock) \ + (void)lock_put((D)->dbenv->lk_info, (ORIG)->lock); \ + *ORIG = *COPY; \ + } \ +} /* * More interface macros used to get/release the meta data page. */ -#define GET_META(D, H) { \ - int _r; \ - if (F_ISSET(D, DB_AM_LOCKING) && !F_ISSET(D, DB_AM_RECOVER)) { \ - (D)->lock.pgno = BUCKET_INVALID; \ - if ((_r = lock_get((D)->dbenv->lk_info, \ - (D)->txn == NULL ? (D)->locker : (D)->txn->txnid, \ - 0, &(D)->lock_dbt, DB_LOCK_READ, \ - &(H)->hlock)) != 0) \ - return (_r < 0 ? EAGAIN : _r); \ +#define GET_META(D, I, R) { \ + if (F_ISSET(D, DB_AM_LOCKING) && \ + !F_ISSET((I)->dbc, DBC_RECOVER)) { \ + (I)->dbc->lock.pgno = BUCKET_INVALID; \ + (R) = lock_get((D)->dbenv->lk_info, (I)->dbc->locker, \ + 0, &(I)->dbc->lock_dbt, DB_LOCK_READ, &(I)->hlock); \ + (R) = (R) < 0 ? 
EAGAIN : (R); \ } \ - if ((_r = __ham_get_page(D, 0, (PAGE **)&((H)->hdr))) != 0) { \ - if ((H)->hlock) { \ - (void)lock_put((D)->dbenv->lk_info, (H)->hlock);\ - (H)->hlock = 0; \ - } \ - return (_r); \ + if ((R) == 0 && \ + ((R) = __ham_get_page(D, 0, (PAGE **)&((I)->hdr))) != 0 && \ + (I)->hlock != LOCK_INVALID) { \ + (void)lock_put((D)->dbenv->lk_info, (I)->hlock); \ + (I)->hlock = LOCK_INVALID; \ } \ } -#define RELEASE_META(D, H) { \ - if (!F_ISSET(D, DB_AM_RECOVER) && \ - (D)->txn == NULL && (H)->hlock) \ - (void)lock_put((H)->dbp->dbenv->lk_info, (H)->hlock); \ - (H)->hlock = 0; \ - if ((H)->hdr) \ - (void)__ham_put_page(D, (PAGE *)(H)->hdr, \ - F_ISSET(D, DB_HS_DIRTYMETA) ? 1 : 0); \ - (H)->hdr = NULL; \ - F_CLR(D, DB_HS_DIRTYMETA); \ +#define RELEASE_META(D, I) { \ + if ((I)->hdr) \ + (void)__ham_put_page(D, (PAGE *)(I)->hdr, \ + F_ISSET(I, H_DIRTY) ? 1 : 0); \ + (I)->hdr = NULL; \ + if (!F_ISSET((I)->dbc, DBC_RECOVER) && \ + (I)->dbc->txn == NULL && (I)->hlock) \ + (void)lock_put((D)->dbenv->lk_info, (I)->hlock); \ + (I)->hlock = LOCK_INVALID; \ + F_CLR(I, H_DIRTY); \ } -#define DIRTY_META(H, R) { \ - if (F_ISSET((H)->dbp, DB_AM_LOCKING) && \ - !F_ISSET((H)->dbp, DB_AM_RECOVER)) { \ +#define DIRTY_META(D, I, R) { \ + if (F_ISSET(D, DB_AM_LOCKING) && \ + !F_ISSET((I)->dbc, DBC_RECOVER)) { \ DB_LOCK _tmp; \ - (H)->dbp->lock.pgno = BUCKET_INVALID; \ - if (((R) = lock_get((H)->dbp->dbenv->lk_info, \ - (H)->dbp->txn ? (H)->dbp->txn->txnid : \ - (H)->dbp->locker, 0, &(H)->dbp->lock_dbt, \ + (I)->dbc->lock.pgno = BUCKET_INVALID; \ + if (((R) = lock_get((D)->dbenv->lk_info, \ + (I)->dbc->locker, 0, &(I)->dbc->lock_dbt, \ DB_LOCK_WRITE, &_tmp)) == 0) \ - (R) = lock_put((H)->dbp->dbenv->lk_info, \ - (H)->hlock); \ + (R) = lock_put((D)->dbenv->lk_info, (I)->hlock);\ else if ((R) < 0) \ (R) = EAGAIN; \ - (H)->hlock = _tmp; \ + (I)->hlock = _tmp; \ } \ - F_SET((H)->dbp, DB_HS_DIRTYMETA); \ -} - -/* Allocate and discard thread structures. 
*/ -#define H_GETHANDLE(dbp, dbpp, ret) \ - if (F_ISSET(dbp, DB_AM_THREAD)) \ - ret = __db_gethandle(dbp, __ham_hdup, dbpp); \ - else { \ - ret = 0; \ - *dbpp = dbp; \ - } - -#define H_PUTHANDLE(dbp) { \ - if (F_ISSET(dbp, DB_AM_THREAD)) \ - __db_puthandle(dbp); \ + F_SET((I), H_DIRTY); \ } /* Test string. */ @@ -171,16 +159,16 @@ typedef struct htab { /* Memory resident data structure. */ * the table, we can allocate extra pages. We keep track of how many pages * we've allocated at each point to calculate bucket to page number mapping. */ -#define BUCKET_TO_PAGE(H, B) \ - ((B) + 1 + ((B) ? (H)->hdr->spares[__db_log2((B)+1)-1] : 0)) +#define BUCKET_TO_PAGE(I, B) \ + ((B) + 1 + ((B) ? (I)->hdr->spares[__db_log2((B)+1)-1] : 0)) -#define PGNO_OF(H, S, O) (BUCKET_TO_PAGE((H), (1 << (S)) - 1) + (O)) +#define PGNO_OF(I, S, O) (BUCKET_TO_PAGE((I), (1 << (S)) - 1) + (O)) /* Constraints about number of pages and how much data goes on a page. */ #define MAX_PAGES(H) UINT32_T_MAX #define MINFILL 4 -#define ISBIG(H, N) (((N) > ((H)->hdr->pagesize / MINFILL)) ? 1 : 0) +#define ISBIG(I, N) (((N) > ((I)->hdr->pagesize / MINFILL)) ? 
1 : 0) /* Shorthands for accessing structure */ #define NDX_INVALID 0xFFFF diff --git a/db2/include/hash_ext.h b/db2/include/hash_ext.h index 7086adcc44..fe17dc7b39 100644 --- a/db2/include/hash_ext.h +++ b/db2/include/hash_ext.h @@ -3,13 +3,11 @@ #define _hash_ext_h_ int __ham_open __P((DB *, DB_INFO *)); int __ham_close __P((DB *)); -int __ham_c_iclose __P((DB *, DBC *)); -int __ham_expand_table __P((HTAB *)); -u_int32_t __ham_call_hash __P((HTAB *, u_int8_t *, int32_t)); +int __ham_c_init __P((DBC *)); +u_int32_t __ham_call_hash __P((HASH_CURSOR *, u_int8_t *, int32_t)); int __ham_init_dbt __P((DBT *, u_int32_t, void **, u_int32_t *)); void __ham_c_update __P((HASH_CURSOR *, db_pgno_t, u_int32_t, int, int)); -int __ham_hdup __P((DB *, DB *)); int __ham_insdel_log __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, u_int32_t, db_pgno_t, u_int32_t, @@ -72,48 +70,45 @@ int __ham_init_recover __P((DB_ENV *)); int __ham_pgin __P((db_pgno_t, void *, DBT *)); int __ham_pgout __P((db_pgno_t, void *, DBT *)); int __ham_mswap __P((void *)); -#ifdef DEBUG -void __ham_dump_bucket __P((HTAB *, u_int32_t)); -#endif -int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); -void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t)); +int __ham_add_dup __P((DBC *, DBT *, u_int32_t)); +void __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t)); +void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *)); u_int32_t __ham_func2 __P((const void *, u_int32_t)); u_int32_t __ham_func3 __P((const void *, u_int32_t)); u_int32_t __ham_func4 __P((const void *, u_int32_t)); u_int32_t __ham_func5 __P((const void *, u_int32_t)); -int __ham_item __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); -int __ham_item_reset __P((HTAB *, HASH_CURSOR *)); +int __ham_item __P((DBC *, db_lockmode_t)); +int __ham_item_reset __P((DBC *)); void __ham_item_init __P((HASH_CURSOR *)); -int __ham_item_done __P((HTAB *, HASH_CURSOR *, int)); -int __ham_item_last __P((HTAB *, HASH_CURSOR 
*, db_lockmode_t)); -int __ham_item_first __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); -int __ham_item_prev __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); -int __ham_item_next __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); +int __ham_item_done __P((DBC *, int)); +int __ham_item_last __P((DBC *, db_lockmode_t)); +int __ham_item_first __P((DBC *, db_lockmode_t)); +int __ham_item_prev __P((DBC *, db_lockmode_t)); +int __ham_item_next __P((DBC *, db_lockmode_t)); void __ham_putitem __P((PAGE *p, const DBT *, int)); void __ham_reputpair __P((PAGE *p, u_int32_t, u_int32_t, const DBT *, const DBT *)); -int __ham_del_pair __P((HTAB *, HASH_CURSOR *, int)); -int __ham_replpair __P((HTAB *, HASH_CURSOR *, DBT *, u_int32_t)); +int __ham_del_pair __P((DBC *, int)); +int __ham_replpair __P((DBC *, DBT *, u_int32_t)); void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t, int32_t, DBT *)); -int __ham_split_page __P((HTAB *, u_int32_t, u_int32_t)); -int __ham_add_el - __P((HTAB *, HASH_CURSOR *, const DBT *, const DBT *, int)); -void __ham_copy_item __P((HTAB *, PAGE *, u_int32_t, PAGE *)); -int __ham_add_ovflpage __P((HTAB *, PAGE *, int, PAGE **)); -int __ham_new_page __P((HTAB *, u_int32_t, u_int32_t, PAGE **)); -int __ham_del_page __P((DB *, PAGE *)); +int __ham_split_page __P((DBC *, u_int32_t, u_int32_t)); +int __ham_add_el __P((DBC *, const DBT *, const DBT *, int)); +void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *)); +int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **)); +int __ham_new_page __P((DB *, u_int32_t, u_int32_t, PAGE **)); +int __ham_del_page __P((DBC *, PAGE *)); int __ham_put_page __P((DB *, PAGE *, int32_t)); -int __ham_dirty_page __P((HTAB *, PAGE *)); +int __ham_dirty_page __P((DB *, PAGE *)); int __ham_get_page __P((DB *, db_pgno_t, PAGE **)); -int __ham_overflow_page __P((DB *, u_int32_t, PAGE **)); +int __ham_overflow_page + __P((DBC *, u_int32_t, PAGE **)); #ifdef DEBUG -db_pgno_t __bucket_to_page __P((HTAB *, db_pgno_t)); 
+db_pgno_t __bucket_to_page __P((HASH_CURSOR *, db_pgno_t)); #endif -void __ham_init_ovflpages __P((HTAB *)); -int __ham_get_cpage __P((HTAB *, HASH_CURSOR *, db_lockmode_t)); -int __ham_next_cpage - __P((HTAB *, HASH_CURSOR *, db_pgno_t, int, u_int32_t)); +void __ham_init_ovflpages __P((DBC *)); +int __ham_get_cpage __P((DBC *, db_lockmode_t)); +int __ham_next_cpage __P((DBC *, db_pgno_t, int, u_int32_t)); void __ham_dpair __P((DB *, PAGE *, u_int32_t)); int __ham_insdel_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); @@ -131,5 +126,5 @@ int __ham_ovfl_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __ham_copypage_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); -int __ham_stat __P((DB *, FILE *)); +int __ham_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); #endif /* _hash_ext_h_ */ diff --git a/db2/include/lock.h b/db2/include/lock.h index 47a38b8783..13364ca7a5 100644 --- a/db2/include/lock.h +++ b/db2/include/lock.h @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)lock.h 10.15 (Sleepycat) 5/10/98 + * @(#)lock.h 10.17 (Sleepycat) 1/3/99 */ typedef struct __db_lockobj DB_LOCKOBJ; @@ -22,6 +22,12 @@ typedef struct __db_lockobj DB_LOCKOBJ; */ #define DB_LOCK_MAXID 0x7fffffff +/* Check for region catastrophic shutdown. */ +#define LOCK_PANIC_CHECK(lt) { \ + if ((lt)->region->hdr.panic) \ + return (DB_RUNRECOVERY); \ +} + /* * The lock region consists of: * The DB_LOCKREGION structure (sizeof(DB_LOCKREGION)). @@ -135,10 +141,24 @@ struct __db_lock { u_int32_t refcount; /* Reference count the lock. */ db_lockmode_t mode; /* What sort of lock. */ ssize_t obj; /* Relative offset of object struct. */ + size_t txnoff; /* Offset of holding transaction. */ db_status_t status; /* Status of this lock. */ }; /* + * This is a serious layering violation. 
To support nested transactions, we + * need to be able to tell that a lock is held by a transaction (as opposed to + * some other locker) and to be able to traverse the parent/descendent chain. + * In order to do this, each lock held by a transaction maintains a reference + * to the shared memory transaction structure so it can be accessed during lock + * promotion. As the structure is in shared memory, we cannot store a pointer + * to it, so we use the offset within the region. As nothing lives at region + * offset 0, we use that to indicate that there is no transaction associated + * with the current lock. + */ +#define TXN_IS_HOLDING(L) ((L)->txnoff != 0 /* INVALID_REG_OFFSET */) + +/* * We cannot return pointers to the user (else we cannot easily grow regions), * so we return offsets in the region. These must be converted to and from * regular pointers. Always use the macros below. diff --git a/db2/include/lock_ext.h b/db2/include/lock_ext.h index 1e0522c6b5..ce7994774a 100644 --- a/db2/include/lock_ext.h +++ b/db2/include/lock_ext.h @@ -6,6 +6,9 @@ int __lock_is_locked void __lock_printlock __P((DB_LOCKTAB *, struct __db_lock *, int)); int __lock_getobj __P((DB_LOCKTAB *, u_int32_t, const DBT *, u_int32_t type, DB_LOCKOBJ **)); +int __lock_downgrade __P((DB_LOCKTAB *, + DB_LOCK, db_lockmode_t, u_int32_t)); +void __lock_panic __P((DB_ENV *)); int __lock_validate_region __P((DB_LOCKTAB *)); int __lock_grow_region __P((DB_LOCKTAB *, int, size_t)); void __lock_dump_region __P((DB_LOCKTAB *, char *, FILE *)); diff --git a/db2/include/log.h b/db2/include/log.h index 7d5161cc9d..50309085aa 100644 --- a/db2/include/log.h +++ b/db2/include/log.h @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. 
* - * @(#)log.h 10.25 (Sleepycat) 4/10/98 + * @(#)log.h 10.30 (Sleepycat) 10/11/98 */ #ifndef _LOG_H_ @@ -16,8 +16,10 @@ struct __log; typedef struct __log LOG; struct __log_persist; typedef struct __log_persist LOGP; #ifndef MAXLFNAME -#define MAXLFNAME 99999 /* Maximum log file name. */ -#define LFNAME "log.%05d" /* Log file name template. */ +#define LFPREFIX "log." /* Log file name prefix. */ +#define LFNAME "log.%010d" /* Log file name template. */ +#define LFNAME_V1 "log.%05d" /* Log file name template, rev 1. */ +#define MAXLFNAME 2000000000 /* Maximum log file name. */ #endif /* Default log name. */ #define DB_DEFAULT_LOG_FILE "__db_log.share" @@ -38,6 +40,12 @@ struct __log_persist; typedef struct __log_persist LOGP; (void)__db_mutex_unlock(&((RLAYOUT *)(dblp)->lp)->lock, \ (dblp)->reginfo.fd) +/* Check for region catastrophic shutdown. */ +#define LOG_PANIC_CHECK(dblp) { \ + if ((dblp)->lp->rlayout.panic) \ + return (DB_RUNRECOVERY); \ +} + /* * The per-process table that maps log file-id's to DB structures. */ @@ -84,7 +92,28 @@ struct __db_log { char *dir; /* Directory argument. */ - u_int32_t flags; /* Support the DB_AM_XXX flags. */ +/* + * These fields are used by XA; since XA forbids threaded execution, these + * do not have to be protected. + */ + void *xa_info; /* Committed transaction list that + * has to be carried between calls + * to xa_recover. */ + DB_LSN xa_lsn; /* Position of an XA recovery scan. */ + DB_LSN xa_first; /* LSN to which we need to roll back + for this XA recovery scan. */ + + /* + * !!! + * Currently used to hold: + * DB_AM_THREAD (a DB flag) + * DBC_RECOVER (a DBC flag) + * If they are ever the same bits, we're in serious trouble. 
+ */ +#if DB_AM_THREAD == DBC_RECOVER + DB_AM_THREAD, DBC_RECOVER, FLAG MISMATCH +#endif + u_int32_t flags; }; /* diff --git a/db2/include/log_ext.h b/db2/include/log_ext.h index bf3bcb02ce..842a3f4265 100644 --- a/db2/include/log_ext.h +++ b/db2/include/log_ext.h @@ -1,8 +1,9 @@ /* DO NOT EDIT: automatically built by dist/distrib. */ #ifndef _log_ext_h_ #define _log_ext_h_ +void __log_panic __P((DB_ENV *)); int __log_find __P((DB_LOG *, int, int *)); -int __log_valid __P((DB_LOG *, LOG *, int)); +int __log_valid __P((DB_LOG *, u_int32_t, int)); int __log_register_log __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t, const DBT *, const DBT *, u_int32_t, @@ -15,7 +16,7 @@ int __log_init_recover __P((DB_ENV *)); int __log_findckp __P((DB_LOG *, DB_LSN *)); int __log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t, int)); int __log_put __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t)); -int __log_name __P((DB_LOG *, int, char **)); +int __log_name __P((DB_LOG *, u_int32_t, char **, int *, u_int32_t)); int __log_register_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __log_add_logid __P((DB_LOG *, DB *, u_int32_t)); diff --git a/db2/include/mp.h b/db2/include/mp.h index 8635efa722..904bccfe98 100644 --- a/db2/include/mp.h +++ b/db2/include/mp.h @@ -4,7 +4,7 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)mp.h 10.33 (Sleepycat) 5/4/98 + * @(#)mp.h 10.37 (Sleepycat) 1/1/99 */ struct __bh; typedef struct __bh BH; @@ -16,11 +16,11 @@ struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE; #define DB_DEFAULT_MPOOL_FILE "__db_mpool.share" /* - * We default to 128K (16 8K pages) if the user doesn't specify, and + * We default to 256K (32 8K pages) if the user doesn't specify, and * require a minimum of 20K. 
*/ #ifndef DB_CACHESIZE_DEF -#define DB_CACHESIZE_DEF (128 * 1024) +#define DB_CACHESIZE_DEF (256 * 1024) #endif #define DB_CACHESIZE_MIN ( 20 * 1024) @@ -106,6 +106,12 @@ struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE; if (F_ISSET(dbmp, MP_LOCKREGION)) \ (void)__db_mutex_unlock(&(bhp)->mutex, (dbmp)->reginfo.fd) +/* Check for region catastrophic shutdown. */ +#define MP_PANIC_CHECK(dbmp) { \ + if ((dbmp)->mp->rlayout.panic) \ + return (DB_RUNRECOVERY); \ +} + /* * DB_MPOOL -- * Per-process memory pool structure. @@ -158,6 +164,18 @@ struct __db_mpoolfile { int fd; /* Underlying file descriptor. */ + u_int32_t ref; /* Reference count. */ + + /* + * !!! + * This field is a special case -- it's protected by the region lock + * NOT the thread lock. The reason for this is that we always have + * the region lock immediately before or after we modify the field, + * and we don't want to use the structure lock to protect it because + * then I/O (which is done with the structure lock held because of + * the race between the seek and write of the file descriptor) will + * block any other put/get calls using this DB_MPOOLFILE structure. + */ u_int32_t pinref; /* Pinned block reference count. */ /* These fields are not protected. 
*/ diff --git a/db2/include/mp_ext.h b/db2/include/mp_ext.h index 3650839475..8b46334408 100644 --- a/db2/include/mp_ext.h +++ b/db2/include/mp_ext.h @@ -9,10 +9,12 @@ int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int)); int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *, u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); +void __memp_panic __P((DB_ENV *)); char * __memp_fn __P((DB_MPOOLFILE *)); char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *)); void __memp_dump_region __P((DB_MPOOL *, char *, FILE *)); -int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *)); +int __memp_reg_alloc __P((DB_MPOOL *, size_t, size_t *, void *)); +int __memp_alloc __P((DB_MPOOL *, size_t, size_t *, void *)); int __memp_ropen __P((DB_MPOOL *, const char *, size_t, int, int, u_int32_t)); int __mp_xxx_fd __P((DB_MPOOLFILE *, int *)); diff --git a/db2/include/os.h b/db2/include/os.h new file mode 100644 index 0000000000..f173d1f610 --- /dev/null +++ b/db2/include/os.h @@ -0,0 +1,24 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1997, 1998 + * Sleepycat Software. All rights reserved. + * + * @(#)os.h 10.11 (Sleepycat) 10/12/98 + */ + +/* + * We group seek/write calls into a single function so that we can use + * pread(2)/pwrite(2) where they're available. + */ +#define DB_IO_READ 1 +#define DB_IO_WRITE 2 +typedef struct __io { + int fd_io; /* I/O file descriptor. */ + int fd_lock; /* Locking file descriptor. */ + db_mutex_t *mutexp; /* Mutex to lock. */ + size_t pagesize; /* Page size. */ + db_pgno_t pgno; /* Page number. */ + u_int8_t *buf; /* Buffer. */ + size_t bytes; /* Bytes read/written. */ +} DB_IO; diff --git a/db2/include/os_ext.h b/db2/include/os_ext.h index 889a45a44e..346210975f 100644 --- a/db2/include/os_ext.h +++ b/db2/include/os_ext.h @@ -1,15 +1,17 @@ /* DO NOT EDIT: automatically built by dist/distrib. 
*/ #ifndef _os_ext_h_ #define _os_ext_h_ -int __db_abspath __P((const char *)); -char *__db_strdup __P((const char *)); -void *__db_calloc __P((size_t, size_t)); -void *__db_malloc __P((size_t)); -void *__db_realloc __P((void *, size_t)); +int __os_abspath __P((const char *)); +int __os_strdup __P((const char *, void *)); +int __os_calloc __P((size_t, size_t, void *)); +int __os_malloc __P((size_t, void *(*)(size_t), void *)); +int __os_realloc __P((void *, size_t)); +void __os_free __P((void *, size_t)); +void __os_freestr __P((void *)); int __os_dirlist __P((const char *, char ***, int *)); void __os_dirfree __P((char **, int)); -int __db_fileid __P((DB_ENV *, const char *, int, u_int8_t *)); -int __db_fsync __P((int)); +int __os_fileid __P((DB_ENV *, const char *, int, u_int8_t *)); +int __os_fsync __P((int)); int __db_mapanon_ok __P((int)); int __db_mapinit __P((void)); int __db_mapregion __P((char *, REGINFO *)); @@ -20,15 +22,19 @@ int __db_unmapfile __P((void *, size_t)); u_int32_t __db_oflags __P((int)); int __db_omode __P((const char *)); int __db_open __P((const char *, u_int32_t, u_int32_t, int, int *)); -int __db_close __P((int)); +int __os_open __P((const char *, int, int, int *)); +int __os_close __P((int)); char *__db_rpath __P((const char *)); -int __db_read __P((int, void *, size_t, ssize_t *)); -int __db_write __P((int, void *, size_t, ssize_t *)); +int __os_io __P((DB_IO *, int, ssize_t *)); +int __os_read __P((int, void *, size_t, ssize_t *)); +int __os_write __P((int, const void *, size_t, ssize_t *)); int __os_seek __P((int, size_t, db_pgno_t, u_int32_t, int, int)); int __os_sleep __P((u_long, u_long)); int __os_spin __P((void)); +void __os_yield __P((u_long)); int __os_exists __P((const char *, int *)); int __os_ioinfo __P((const char *, int, u_int32_t *, u_int32_t *, u_int32_t *)); -int __db_unlink __P((const char *)); +int __os_tmpdir __P((DB_ENV *, u_int32_t)); +int __os_unlink __P((const char *)); #endif /* _os_ext_h_ */ diff --git 
a/db2/include/os_func.h b/db2/include/os_jump.h index 12794d550d..e2d577ff10 100644 --- a/db2/include/os_func.h +++ b/db2/include/os_jump.h @@ -4,7 +4,7 @@ * Copyright (c) 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)os_func.h 10.8 (Sleepycat) 4/19/98 + * @(#)os_jump.h 10.1 (Sleepycat) 10/17/98 */ /* Calls which can be replaced by the application. */ @@ -38,32 +38,3 @@ struct __db_jumptab { }; extern struct __db_jumptab __db_jump; - -/* - * Names used by DB to call through the jump table. - * - * The naming scheme goes like this: if the functionality the application can - * replace is the same as the DB functionality, e.g., malloc, or dirlist, then - * we use the name __db_XXX, and the application is expected to replace the - * complete functionality, which may or may not map directly to an ANSI C or - * POSIX 1003.1 interface. If the functionality that the aplication replaces - * only underlies what the DB os directory exports to other parts of DB, e.g., - * read, then the name __os_XXX is used, and the application can only replace - * the underlying functionality. Under most circumstances, the os directory - * part of DB is the only code that should use the __os_XXX names, all other - * parts of DB should be calling __db_XXX functions. - */ -#define __os_close __db_jump.j_close /* __db_close is a wrapper. */ -#define __db_dirfree __db_jump.j_dirfree -#define __db_dirlist __db_jump.j_dirlist -#define __db_exists __db_jump.j_exists -#define __db_free __db_jump.j_free -#define __os_fsync __db_jump.j_fsync /* __db_fsync is a wrapper. */ -#define __db_ioinfo __db_jump.j_ioinfo -#define __os_open __db_jump.j_open /* __db_open is a wrapper. */ -#define __os_read __db_jump.j_read /* __db_read is a wrapper. */ -#define __db_seek __db_jump.j_seek -#define __db_sleep __db_jump.j_sleep -#define __os_unlink __db_jump.j_unlink /* __db_unlink is a wrapper. */ -#define __os_write __db_jump.j_write /* __db_write is a wrapper. 
*/ -#define __db_yield __db_jump.j_yield diff --git a/db2/include/txn.h b/db2/include/txn.h index a2512ed152..a6fa4db8de 100644 --- a/db2/include/txn.h +++ b/db2/include/txn.h @@ -4,11 +4,13 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)txn.h 10.15 (Sleepycat) 4/21/98 + * @(#)txn.h 10.18 (Sleepycat) 1/3/99 */ #ifndef _TXN_H_ #define _TXN_H_ +#include "xa.h" + /* * The name of the transaction shared memory region is DEFAULT_TXN_FILE and * the region is always created group RW of the group owning the directory. @@ -25,6 +27,8 @@ /* * Internal data maintained in shared memory for each transaction. */ +typedef char DB_XID[XIDDATASIZE]; + typedef struct __txn_detail { u_int32_t txnid; /* current transaction id used to link free list also */ @@ -32,12 +36,31 @@ typedef struct __txn_detail { DB_LSN begin_lsn; /* lsn of begin record */ size_t last_lock; /* offset in lock region of last lock for this transaction. */ + size_t parent; /* Offset of transaction's parent. */ #define TXN_UNALLOC 0 #define TXN_RUNNING 1 #define TXN_ABORTED 2 #define TXN_PREPARED 3 +#define TXN_COMMITTED 4 u_int32_t status; /* status of the transaction */ SH_TAILQ_ENTRY links; /* free/active list */ + +#define TXN_XA_ABORTED 1 +#define TXN_XA_DEADLOCKED 2 +#define TXN_XA_ENDED 3 +#define TXN_XA_PREPARED 4 +#define TXN_XA_STARTED 5 +#define TXN_XA_SUSPENDED 6 + u_int32_t xa_status; /* XA status */ + + /* + * XID (xid_t) structure: because these fields are logged, the + * sizes have to be explicit. + */ + DB_XID xid; /* XA global transaction id */ + u_int32_t bqual; /* bqual_length from XID */ + u_int32_t gtrid; /* gtrid_length from XID */ + int32_t format; /* XA format */ } TXN_DETAIL; /* @@ -105,6 +128,12 @@ struct __db_txnregion { #define UNLOCK_TXNREGION(tmgrp) \ (void)__db_mutex_unlock(&(tmgrp)->region->hdr.lock, (tmgrp)->reginfo.fd) +/* Check for region catastrophic shutdown. 
*/ +#define TXN_PANIC_CHECK(tmgrp) { \ + if ((tmgrp)->region->hdr.panic) \ + return (DB_RUNRECOVERY); \ +} + /* * Log record types. */ @@ -114,4 +143,6 @@ struct __db_txnregion { #include "txn_auto.h" #include "txn_ext.h" + +#include "xa_ext.h" #endif /* !_TXN_H_ */ diff --git a/db2/include/txn_auto.h b/db2/include/txn_auto.h index fd5a456115..bb3de4eb17 100644 --- a/db2/include/txn_auto.h +++ b/db2/include/txn_auto.h @@ -22,4 +22,30 @@ typedef struct _txn_ckp_args { DB_LSN last_ckp; } __txn_ckp_args; + +#define DB_txn_xa_regop (DB_txn_BEGIN + 3) + +typedef struct _txn_xa_regop_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t opcode; + DBT xid; + int32_t formatID; + u_int32_t gtrid; + u_int32_t bqual; + DB_LSN begin_lsn; +} __txn_xa_regop_args; + + +#define DB_txn_child (DB_txn_BEGIN + 4) + +typedef struct _txn_child_args { + u_int32_t type; + DB_TXN *txnid; + DB_LSN prev_lsn; + u_int32_t opcode; + u_int32_t parent; +} __txn_child_args; + #endif diff --git a/db2/include/txn_ext.h b/db2/include/txn_ext.h index 7d694f070d..e0d69c360d 100644 --- a/db2/include/txn_ext.h +++ b/db2/include/txn_ext.h @@ -1,6 +1,9 @@ /* DO NOT EDIT: automatically built by dist/distrib. 
*/ #ifndef _txn_ext_h_ #define _txn_ext_h_ +void __txn_panic __P((DB_ENV *)); +int __txn_xa_begin __P((DB_ENV *, DB_TXN *)); +int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t)); int __txn_regop_log __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, u_int32_t)); @@ -13,9 +16,26 @@ int __txn_ckp_log int __txn_ckp_print __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __txn_ckp_read __P((void *, __txn_ckp_args **)); +int __txn_xa_regop_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, const DBT *, int32_t, u_int32_t, + u_int32_t, DB_LSN *)); +int __txn_xa_regop_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_xa_regop_read __P((void *, __txn_xa_regop_args **)); +int __txn_child_log + __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + u_int32_t, u_int32_t)); +int __txn_child_print + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_child_read __P((void *, __txn_child_args **)); int __txn_init_print __P((DB_ENV *)); int __txn_init_recover __P((DB_ENV *)); int __txn_regop_recover - __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_xa_regop_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); int __txn_ckp_recover __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); +int __txn_child_recover + __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); #endif /* _txn_ext_h_ */ diff --git a/db2/include/xa.h b/db2/include/xa.h new file mode 100644 index 0000000000..ae822f3e75 --- /dev/null +++ b/db2/include/xa.h @@ -0,0 +1,179 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. All rights reserved. 
+ * + * @(#)xa.h 10.1 (Sleepycat) 6/22/98 + */ +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +/* + * Transaction branch identification: XID and NULLXID: + */ +#define XIDDATASIZE 128 /* size in bytes */ +#define MAXGTRIDSIZE 64 /* maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /* maximum size in bytes of bqual */ + +struct xid_t { + long formatID; /* format identifier */ + long gtrid_length; /* value from 1 through 64 */ + long bqual_length; /* value from 1 through 64 */ + char data[XIDDATASIZE]; +}; +typedef struct xid_t XID; +/* + * A value of -1 in formatID means that the XID is null. + */ + +/* + * Declarations of routines by which RMs call TMs: + */ +extern int ax_reg __P((int, XID *, long)); +extern int ax_unreg __P((int, long)); + +/* + * XA Switch Data Structure + */ +#define RMNAMESZ 32 /* length of resource manager name, */ + /* including the null terminator */ +#define MAXINFOSIZE 256 /* maximum size in bytes of xa_info */ + /* strings, including the null + terminator */ +struct xa_switch_t { + char name[RMNAMESZ]; /* name of resource manager */ + long flags; /* resource manager specific options */ + long version; /* must be 0 */ + int (*xa_open_entry) /* xa_open function pointer */ + __P((char *, int, long)); + int (*xa_close_entry) /* xa_close function pointer */ + __P((char *, int, long)); + int (*xa_start_entry) /* xa_start function pointer */ + __P((XID *, int, long)); + int (*xa_end_entry) /* xa_end function pointer */ + __P((XID *, int, long)); + int (*xa_rollback_entry) /* xa_rollback function pointer */ + __P((XID *, int, long)); + int (*xa_prepare_entry) /* xa_prepare function pointer */ + __P((XID *, int, long)); + int (*xa_commit_entry) /* xa_commit function pointer */ + __P((XID *, int, long)); + int (*xa_recover_entry) /* xa_recover function pointer */ + __P((XID *, long, int, long)); + int (*xa_forget_entry) /* xa_forget function 
pointer */ + __P((XID *, int, long)); + int (*xa_complete_entry) /* xa_complete function pointer */ + __P((int *, int *, int, long)); +}; + +/* + * Flag definitions for the RM switch + */ +#define TMNOFLAGS 0x00000000L /* no resource manager features + selected */ +#define TMREGISTER 0x00000001L /* resource manager dynamically + registers */ +#define TMNOMIGRATE 0x00000002L /* resource manager does not support + association migration */ +#define TMUSEASYNC 0x00000004L /* resource manager supports + asynchronous operations */ +/* + * Flag definitions for xa_ and ax_ routines + */ +/* use TMNOFLAGGS, defined above, when not specifying other flags */ +#define TMASYNC 0x80000000L /* perform routine asynchronously */ +#define TMONEPHASE 0x40000000L /* caller is using one-phase commit + optimisation */ +#define TMFAIL 0x20000000L /* dissociates caller and marks + transaction branch rollback-only */ +#define TMNOWAIT 0x10000000L /* return if blocking condition + exists */ +#define TMRESUME 0x08000000L /* caller is resuming association with + suspended transaction branch */ +#define TMSUCCESS 0x04000000L /* dissociate caller from transaction + branch */ +#define TMSUSPEND 0x02000000L /* caller is suspending, not ending, + association */ +#define TMSTARTRSCAN 0x01000000L /* start a recovery scan */ +#define TMENDRSCAN 0x00800000L /* end a recovery scan */ +#define TMMULTIPLE 0x00400000L /* wait for any asynchronous + operation */ +#define TMJOIN 0x00200000L /* caller is joining existing + transaction branch */ +#define TMMIGRATE 0x00100000L /* caller intends to perform + migration */ + +/* + * ax_() return codes (transaction manager reports to resource manager) + */ +#define TM_JOIN 2 /* caller is joining existing + transaction branch */ +#define TM_RESUME 1 /* caller is resuming association with + suspended transaction branch */ +#define TM_OK 0 /* normal execution */ +#define TMER_TMERR -1 /* an error occurred in the transaction + manager */ +#define TMER_INVAL -2 /* 
invalid arguments were given */ +#define TMER_PROTO -3 /* routine invoked in an improper + context */ + +/* + * xa_() return codes (resource manager reports to transaction manager) + */ +#define XA_RBBASE 100 /* The inclusive lower bound of the + rollback codes */ +#define XA_RBROLLBACK XA_RBBASE /* The rollback was caused by an + unspecified reason */ +#define XA_RBCOMMFAIL XA_RBBASE+1 /* The rollback was caused by a + communication failure */ +#define XA_RBDEADLOCK XA_RBBASE+2 /* A deadlock was detected */ +#define XA_RBINTEGRITY XA_RBBASE+3 /* A condition that violates the + integrity of the resources was + detected */ +#define XA_RBOTHER XA_RBBASE+4 /* The resource manager rolled back the + transaction branch for a reason not + on this list */ +#define XA_RBPROTO XA_RBBASE+5 /* A protocol error occurred in the + resource manager */ +#define XA_RBTIMEOUT XA_RBBASE+6 /* A transaction branch took too long */ +#define XA_RBTRANSIENT XA_RBBASE+7 /* May retry the transaction branch */ +#define XA_RBEND XA_RBTRANSIENT /* The inclusive upper bound of the + rollback codes */ +#define XA_NOMIGRATE 9 /* resumption must occur where + suspension occurred */ +#define XA_HEURHAZ 8 /* the transaction branch may have + been heuristically completed */ +#define XA_HEURCOM 7 /* the transaction branch has been + heuristically committed */ +#define XA_HEURRB 6 /* the transaction branch has been + heuristically rolled back */ +#define XA_HEURMIX 5 /* the transaction branch has been + heuristically committed and rolled + back */ +#define XA_RETRY 4 /* routine returned with no effect and + may be re-issued */ +#define XA_RDONLY 3 /* the transaction branch was read-only + and has been committed */ +#define XA_OK 0 /* normal execution */ +#define XAER_ASYNC -2 /* asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /* a resource manager error occurred in + the transaction branch */ +#define XAER_NOTA -4 /* the XID is not valid */ +#define XAER_INVAL -5 /* invalid 
arguments were given */ +#define XAER_PROTO -6 /* routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /* resource manager unavailable */ +#define XAER_DUPID -8 /* the XID already exists */ +#define XAER_OUTSIDE -9 /* resource manager doing work outside + transaction */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */ diff --git a/db2/include/xa_ext.h b/db2/include/xa_ext.h new file mode 100644 index 0000000000..00369ccaae --- /dev/null +++ b/db2/include/xa_ext.h @@ -0,0 +1,13 @@ +/* DO NOT EDIT: automatically built by dist/distrib. */ +#ifndef _xa_ext_h_ +#define _xa_ext_h_ +int __db_rmid_to_env __P((int rmid, DB_ENV **envp, int open_ok)); +int __db_xid_to_txn __P((DB_ENV *, XID *, size_t *)); +int __db_map_rmid __P((int, DB_ENV *)); +int __db_unmap_rmid __P((int)); +int __db_map_xid __P((DB_ENV *, XID *, size_t)); +void __db_unmap_xid __P((DB_ENV *, XID *, size_t)); +int __db_map_rmid_name __P((int, char *)); +int __db_rmid_to_name __P((int, char **)); + void __db_unmap_rmid_name __P((int)); +#endif /* _xa_ext_h_ */ diff --git a/db2/lock/lock.c b/db2/lock/lock.c index 3d20e0d65b..4cf1d9ecca 100644 --- a/db2/lock/lock.c +++ b/db2/lock/lock.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)lock.c 10.52 (Sleepycat) 5/10/98"; +static const char sccsid[] = "@(#)lock.c 10.61 (Sleepycat) 1/3/99"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -23,16 +23,22 @@ static const char sccsid[] = "@(#)lock.c 10.52 (Sleepycat) 5/10/98"; #include "db_page.h" #include "db_shash.h" #include "lock.h" -#include "common_ext.h" #include "db_am.h" +#include "txn_auto.h" +#include "txn_ext.h" +#include "common_ext.h" static void __lock_checklocker __P((DB_LOCKTAB *, struct __db_lock *, int)); static void __lock_freeobj __P((DB_LOCKTAB *, DB_LOCKOBJ *)); -static int __lock_get_internal __P((DB_LOCKTAB *, u_int32_t, u_int32_t, - const DBT *, db_lockmode_t, struct __db_lock **)); +static int __lock_get_internal __P((DB_LOCKTAB 
*, u_int32_t, DB_TXN *, + u_int32_t, const DBT *, db_lockmode_t, struct __db_lock **)); +static int __lock_is_parent __P((u_int32_t, DB_TXN *)); +static int __lock_promote __P((DB_LOCKTAB *, DB_LOCKOBJ *)); static int __lock_put_internal __P((DB_LOCKTAB *, struct __db_lock *, int)); static void __lock_remove_waiter __P((DB_LOCKTAB *, DB_LOCKOBJ *, struct __db_lock *, db_status_t)); +static int __lock_vec_internal __P((DB_LOCKTAB *, u_int32_t, DB_TXN *, + u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **elistp)); int lock_id(lt, idp) @@ -41,6 +47,8 @@ lock_id(lt, idp) { u_int32_t id; + LOCK_PANIC_CHECK(lt); + LOCK_LOCKREGION(lt); if (lt->region->id >= DB_LOCK_MAXID) lt->region->id = 0; @@ -58,10 +66,37 @@ lock_vec(lt, locker, flags, list, nlist, elistp) int nlist; DB_LOCKREQ *list, **elistp; { + return (__lock_vec_internal(lt, + locker, NULL, flags, list, nlist, elistp)); +} + +int +lock_tvec(lt, txn, flags, list, nlist, elistp) + DB_LOCKTAB *lt; + DB_TXN *txn; + u_int32_t flags; + int nlist; + DB_LOCKREQ *list, **elistp; +{ + return (__lock_vec_internal(lt, + txn->txnid, txn, flags, list, nlist, elistp)); +} + +static int +__lock_vec_internal(lt, locker, txn, flags, list, nlist, elistp) + DB_LOCKTAB *lt; + u_int32_t locker; + DB_TXN *txn; + u_int32_t flags; + int nlist; + DB_LOCKREQ *list, **elistp; +{ struct __db_lock *lp; - DB_LOCKOBJ *sh_obj, *sh_locker; + DB_LOCKOBJ *sh_obj, *sh_locker, *sh_parent; int i, ret, run_dd; + LOCK_PANIC_CHECK(lt); + /* Validate arguments. */ if ((ret = __db_fchk(lt->dbenv, "lock_vec", flags, DB_LOCK_NOWAIT)) != 0) @@ -78,13 +113,43 @@ lock_vec(lt, locker, flags, list, nlist, elistp) for (i = 0; i < nlist && ret == 0; i++) { switch (list[i].op) { case DB_LOCK_GET: - ret = __lock_get_internal(lt, locker, flags, + ret = __lock_get_internal(lt, locker, txn, flags, list[i].obj, list[i].mode, &lp); if (ret == 0) { list[i].lock = LOCK_TO_OFFSET(lt, lp); lt->region->nrequests++; } break; + case DB_LOCK_INHERIT: + /* Find the locker. 
*/ + if ((ret = __lock_getobj(lt, locker, + NULL, DB_LOCK_LOCKER, &sh_locker)) != 0) + break; + if (txn == NULL || txn->parent == NULL) { + ret = EINVAL; + break; + } + + if ((ret = __lock_getobj(lt, txn->parent->txnid, + NULL, DB_LOCK_LOCKER, &sh_parent)) != 0) + break; + + /* + * Traverse all the locks held by this locker. Remove + * the locks from the locker's list and put them on the + * parent's list. + */ + for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock); + lp != NULL; + lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) { + SH_LIST_REMOVE(lp, locker_links, __db_lock); + SH_LIST_INSERT_HEAD(&sh_parent->heldby, lp, + locker_links, __db_lock); + lp->holder = txn->parent->txnid; + } + __lock_freeobj(lt, sh_locker); + lt->region->nlockers--; + break; case DB_LOCK_PUT: lp = OFFSET_TO_LOCK(lt, list[i].lock); if (lp->holder != locker) { @@ -93,8 +158,8 @@ lock_vec(lt, locker, flags, list, nlist, elistp) } list[i].mode = lp->mode; - /* XXX Need to copy the object. ??? */ ret = __lock_put_internal(lt, lp, 0); + __lock_checklocker(lt, lp, 0); break; case DB_LOCK_PUT_ALL: /* Find the locker. */ @@ -204,18 +269,25 @@ lock_get(lt, locker, flags, obj, lock_mode, lock) struct __db_lock *lockp; int ret; + LOCK_PANIC_CHECK(lt); + /* Validate arguments. 
*/ - if ((ret = - __db_fchk(lt->dbenv, "lock_get", flags, DB_LOCK_NOWAIT)) != 0) + if ((ret = __db_fchk(lt->dbenv, + "lock_get", flags, DB_LOCK_NOWAIT | DB_LOCK_UPGRADE)) != 0) return (ret); LOCK_LOCKREGION(lt); - ret = __lock_validate_region(lt); - if (ret == 0 && (ret = __lock_get_internal(lt, - locker, flags, obj, lock_mode, &lockp)) == 0) { - *lock = LOCK_TO_OFFSET(lt, lockp); - lt->region->nrequests++; + if ((ret = __lock_validate_region(lt)) == 0) { + if (LF_ISSET(DB_LOCK_UPGRADE)) + lockp = OFFSET_TO_LOCK(lt, *lock); + + if ((ret = __lock_get_internal(lt, + locker, NULL, flags, obj, lock_mode, &lockp)) == 0) { + if (!LF_ISSET(DB_LOCK_UPGRADE)) + *lock = LOCK_TO_OFFSET(lt, lockp); + lt->region->nrequests++; + } } UNLOCK_LOCKREGION(lt); @@ -223,6 +295,42 @@ lock_get(lt, locker, flags, obj, lock_mode, lock) } int +lock_tget(lt, txn, flags, obj, lock_mode, lock) + DB_LOCKTAB *lt; + DB_TXN *txn; + u_int32_t flags; + const DBT *obj; + db_lockmode_t lock_mode; + DB_LOCK *lock; +{ + struct __db_lock *lockp; + int ret; + + LOCK_PANIC_CHECK(lt); + + /* Validate arguments. 
*/ + if ((ret = __db_fchk(lt->dbenv, + "lock_get", flags, DB_LOCK_NOWAIT | DB_LOCK_UPGRADE)) != 0) + return (ret); + + LOCK_LOCKREGION(lt); + + if ((ret = __lock_validate_region(lt)) == 0) { + if (LF_ISSET(DB_LOCK_UPGRADE)) + lockp = OFFSET_TO_LOCK(lt, *lock); + + if ((ret = __lock_get_internal(lt, + txn->txnid, txn, flags, obj, lock_mode, &lockp)) == 0) { + if (!LF_ISSET(DB_LOCK_UPGRADE)) + *lock = LOCK_TO_OFFSET(lt, lockp); + lt->region->nrequests++; + } + } + + UNLOCK_LOCKREGION(lt); + return (ret); +} +int lock_put(lt, lock) DB_LOCKTAB *lt; DB_LOCK lock; @@ -230,6 +338,8 @@ lock_put(lt, lock) struct __db_lock *lockp; int ret, run_dd; + LOCK_PANIC_CHECK(lt); + LOCK_LOCKREGION(lt); if ((ret = __lock_validate_region(lt)) != 0) @@ -261,7 +371,6 @@ __lock_put_internal(lt, lockp, do_all) struct __db_lock *lockp; int do_all; { - struct __db_lock *lp_w, *lp_h, *next_waiter; DB_LOCKOBJ *sh_obj; int state_changed; @@ -293,39 +402,7 @@ __lock_put_internal(lt, lockp, do_all) else SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock); - /* - * We need to do lock promotion. We also need to determine if - * we're going to need to run the deadlock detector again. If - * we release locks, and there are waiters, but no one gets promoted, - * then we haven't fundamentally changed the lockmgr state, so - * we may still have a deadlock and we have to run again. However, - * if there were no waiters, or we actually promoted someone, then - * we are OK and we don't have to run it immediately. - */ - for (lp_w = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock), - state_changed = lp_w == NULL; - lp_w != NULL; - lp_w = next_waiter) { - next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock); - for (lp_h = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock); - lp_h != NULL; - lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) { - if (CONFLICTS(lt, lp_h->mode, lp_w->mode) && - lp_h->holder != lp_w->holder) - break; - } - if (lp_h != NULL) /* Found a conflict. 
*/ - break; - - /* No conflict, promote the waiting lock. */ - SH_TAILQ_REMOVE(&sh_obj->waiters, lp_w, links, __db_lock); - lp_w->status = DB_LSTAT_PENDING; - SH_TAILQ_INSERT_TAIL(&sh_obj->holders, lp_w, links); - - /* Wake up waiter. */ - (void)__db_mutex_unlock(&lp_w->mutex, lt->reginfo.fd); - state_changed = 1; - } + state_changed = __lock_promote(lt, sh_obj); /* Check if object should be reclaimed. */ if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL) { @@ -354,9 +431,10 @@ __lock_put_internal(lt, lockp, do_all) } static int -__lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) +__lock_get_internal(lt, locker, txn, flags, obj, lock_mode, lockp) DB_LOCKTAB *lt; u_int32_t locker, flags; + DB_TXN *txn; const DBT *obj; db_lockmode_t lock_mode; struct __db_lock **lockp; @@ -365,13 +443,13 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) DB_LOCKOBJ *sh_obj, *sh_locker; DB_LOCKREGION *lrp; size_t newl_off; - int ihold, ret; + int ihold, no_dd, ret; + + no_dd = ret = 0; - ret = 0; /* * Check that lock mode is valid. */ - lrp = lt->region; if ((u_int32_t)lock_mode >= lrp->nmodes) { __db_err(lt->dbenv, @@ -423,20 +501,28 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) * lock, then we guarantee deadlock. * * In case of conflict, we put the new lock on the end of the waiters - * list. + * list, unless we are upgrading in which case the locker goes on the + * front of the list. */ ihold = 0; for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock); lp != NULL; lp = SH_TAILQ_NEXT(lp, links, __db_lock)) { - if (locker == lp->holder) { + if (locker == lp->holder || + __lock_is_parent(lp->holder, txn)) { if (lp->mode == lock_mode && lp->status == DB_LSTAT_HELD) { - /* Lock is held, just inc the ref count. */ + if (LF_ISSET(DB_LOCK_UPGRADE)) + goto upgrade; + + /* + * Lock is held, so we can increment the + * reference count and return this lock. 
+ */ lp->refcount++; + *lockp = lp; SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, __db_lock); - *lockp = lp; return (0); } else ihold = 1; @@ -444,6 +530,21 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) break; } + /* + * If we are upgrading, then there are two scenarios. Either + * we had no conflicts, so we can do the upgrade. Or, there + * is a conflict and we should wait at the HEAD of the waiters + * list. + */ + if (LF_ISSET(DB_LOCK_UPGRADE)) { + if (lp == NULL) + goto upgrade; + + /* There was a conflict, wait. */ + SH_TAILQ_INSERT_HEAD(&sh_obj->waiters, newl, links, __db_lock); + goto wait; + } + if (lp == NULL && !ihold) for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock); lp != NULL; @@ -464,31 +565,35 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) } /* - * This is really a blocker for the process, so initialize it - * set. That way the current process will block when it tries - * to get it and the waking process will release it. - */ - (void)__db_mutex_init(&newl->mutex, - MUTEX_LOCK_OFFSET(lt->region, &newl->mutex)); - (void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd); - - /* - * Now, insert the lock onto its locker's list. + * Now, insert the lock onto its locker's list. If the locker does + * not currently hold any locks, there's no reason to run a deadlock + * detector, save that information. */ if ((ret = __lock_getobj(lt, locker, NULL, DB_LOCK_LOCKER, &sh_locker)) != 0) return (ret); + no_dd = SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL; lrp = lt->region; SH_LIST_INSERT_HEAD(&sh_locker->heldby, newl, locker_links, __db_lock); if (lp != NULL) { + /* + * This is really a blocker for the process, so initialize it + * set. That way the current process will block when it tries + * to get it and the waking process will release it. 
+ */ +wait: (void)__db_mutex_init(&newl->mutex, + MUTEX_LOCK_OFFSET(lt->region, &newl->mutex)); + (void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd); + newl->status = DB_LSTAT_WAITING; lrp->nconflicts++; + /* - * We are about to wait; must release the region mutex. - * Then, when we wakeup, we need to reacquire the region - * mutex before continuing. + * We are about to wait; must release the region mutex. Then, + * when we wakeup, we need to reacquire the region mutex before + * continuing. */ if (lrp->detect == DB_LOCK_NORUN) lt->region->need_dd = 1; @@ -498,13 +603,19 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) * We are about to wait; before waiting, see if the deadlock * detector should be run. */ - if (lrp->detect != DB_LOCK_NORUN) - ret = lock_detect(lt, 0, lrp->detect); + if (lrp->detect != DB_LOCK_NORUN && !no_dd) + (void)lock_detect(lt, 0, lrp->detect); (void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd); LOCK_LOCKREGION(lt); if (newl->status != DB_LSTAT_PENDING) { + /* + * If this lock errored due to a deadlock, then + * we have waiters that require promotion. + */ + if (newl->status == DB_LSTAT_ABORTED) + (void)__lock_promote(lt, sh_obj); /* Return to free list. */ __lock_checklocker(lt, newl, 0); SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, @@ -522,12 +633,31 @@ __lock_get_internal(lt, locker, flags, obj, lock_mode, lockp) } newl->status = DB_LSTAT_FREE; newl = NULL; + } else if (LF_ISSET(DB_LOCK_UPGRADE)) { + /* + * The lock that was just granted got put on the + * holders list. Since we're upgrading some other + * lock, we've got to remove it here. + */ + SH_TAILQ_REMOVE(&sh_obj->holders, + newl, links, __db_lock); + goto upgrade; } else newl->status = DB_LSTAT_HELD; } *lockp = newl; return (ret); + +upgrade: + /* + * This was an upgrade, so return the new lock to the free list and + * upgrade the mode. 
+ */ + (*lockp)->mode = lock_mode; + newl->status = DB_LSTAT_FREE; + SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, __db_lock); + return (0); } /* @@ -788,3 +918,117 @@ __lock_freeobj(lt, obj) __db_shalloc_free(lt->mem, SH_DBT_PTR(&obj->lockobj)); SH_TAILQ_INSERT_HEAD(&lt->region->free_objs, obj, links, __db_lockobj); } + +/* + * __lock_downgrade -- + * Used by the concurrent access product to downgrade write locks + * back to iwrite locks. + * + * PUBLIC: int __lock_downgrade __P((DB_LOCKTAB *, + * PUBLIC: DB_LOCK, db_lockmode_t, u_int32_t)); + */ +int +__lock_downgrade(lt, lock, new_mode, flags) + DB_LOCKTAB *lt; + DB_LOCK lock; + db_lockmode_t new_mode; + u_int32_t flags; +{ + struct __db_lock *lockp; + DB_LOCKOBJ *obj; + int ret; + + COMPQUIET(flags, 0); + LOCK_PANIC_CHECK(lt); + LOCK_LOCKREGION(lt); + + if ((ret = __lock_validate_region(lt)) == 0) { + lockp = OFFSET_TO_LOCK(lt, lock); + lockp->mode = new_mode; + + /* Get the object associated with this lock. */ + obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj); + (void)__lock_promote(lt, obj); + ++lt->region->nreleases; + } + + UNLOCK_LOCKREGION(lt); + + return (ret); +} + +/* + * __lock_promote -- + * + * Look through the waiters and holders lists and decide which (if any) + * locks can be promoted. Promote any that are eligible. + */ +static int +__lock_promote(lt, obj) + DB_LOCKTAB *lt; + DB_LOCKOBJ *obj; +{ + struct __db_lock *lp_w, *lp_h, *next_waiter; + int state_changed, waiter_is_txn; + + /* + * We need to do lock promotion. We also need to determine if + * we're going to need to run the deadlock detector again. If + * we release locks, and there are waiters, but no one gets promoted, + * then we haven't fundamentally changed the lockmgr state, so + * we may still have a deadlock and we have to run again. However, + * if there were no waiters, or we actually promoted someone, then + * we are OK and we don't have to run it immediately. 
+ * + * During promotion, we look for state changes so we can return + * this information to the caller. + */ + for (lp_w = SH_TAILQ_FIRST(&obj->waiters, __db_lock), + state_changed = lp_w == NULL; + lp_w != NULL; + lp_w = next_waiter) { + waiter_is_txn = TXN_IS_HOLDING(lp_w); + next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock); + for (lp_h = SH_TAILQ_FIRST(&obj->holders, __db_lock); + lp_h != NULL; + lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) { + if (CONFLICTS(lt, lp_h->mode, lp_w->mode) && + lp_h->holder != lp_w->holder && + !(waiter_is_txn && + TXN_IS_HOLDING(lp_h) && + __txn_is_ancestor(lt->dbenv->tx_info, + lp_h->txnoff, lp_w->txnoff))) + break; + } + if (lp_h != NULL) /* Found a conflict. */ + break; + + /* No conflict, promote the waiting lock. */ + SH_TAILQ_REMOVE(&obj->waiters, lp_w, links, __db_lock); + lp_w->status = DB_LSTAT_PENDING; + SH_TAILQ_INSERT_TAIL(&obj->holders, lp_w, links); + + /* Wake up waiter. */ + (void)__db_mutex_unlock(&lp_w->mutex, lt->reginfo.fd); + state_changed = 1; + } + + return (state_changed); +} + +static int +__lock_is_parent(locker, txn) + u_int32_t locker; + DB_TXN *txn; +{ + DB_TXN *t; + + if (txn == NULL) + return (0); + + for (t = txn->parent; t != NULL; t = t->parent) + if (t->txnid == locker) + return (1); + + return (0); +} diff --git a/db2/lock/lock_conflict.c b/db2/lock/lock_conflict.c index 870aa0dc17..4be858af7a 100644 --- a/db2/lock/lock_conflict.c +++ b/db2/lock/lock_conflict.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)lock_conflict.c 10.3 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)lock_conflict.c 10.4 (Sleepycat) 11/20/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -29,11 +29,11 @@ const u_int8_t db_rw_conflicts[] = { }; const u_int8_t db_riw_conflicts[] = { - /* N S X IS IX SIX */ + /* N S X IX IS SIX */ /* N */ 0, 0, 0, 0, 0, 0, - /* S */ 0, 0, 1, 0, 1, 1, + /* S */ 0, 0, 1, 1, 0, 1, /* X */ 1, 1, 1, 1, 1, 1, - /* IS */ 0, 0, 1, 0, 0, 0, /* 
IX */ 0, 1, 1, 0, 0, 0, + /* IS */ 0, 0, 1, 0, 0, 0, /* SIX */ 0, 1, 1, 0, 0, 0 }; diff --git a/db2/lock/lock_deadlock.c b/db2/lock/lock_deadlock.c index 4de492944e..8b2f91bc9e 100644 --- a/db2/lock/lock_deadlock.c +++ b/db2/lock/lock_deadlock.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)lock_deadlock.c 10.32 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)lock_deadlock.c 10.37 (Sleepycat) 10/4/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -69,6 +69,8 @@ lock_detect(lt, flags, atype) u_int32_t *bitmap, *deadlock, i, killid, nentries, nlockers; int do_pass, ret; + LOCK_PANIC_CHECK(lt); + /* Validate arguments. */ if ((ret = __db_fchk(lt->dbenv, "lock_detect", flags, DB_LOCK_CONFLICT)) != 0) @@ -176,8 +178,8 @@ lock_detect(lt, flags, atype) "warning: unable to abort locker %lx", (u_long)idmap[killid].id); } - __db_free(bitmap); - __db_free(idmap); + __os_free(bitmap, 0); + __os_free(idmap, 0); return (ret); } @@ -198,7 +200,7 @@ __dd_build(dbenv, bmp, nlockers, idmap) u_int8_t *pptr; locker_info *id_array; u_int32_t *bitmap, count, *entryp, i, id, nentries, *tmpmap; - int is_first; + int is_first, ret; lt = dbenv->lk_info; @@ -230,25 +232,20 @@ retry: count = lt->region->nlockers; * We can probably save the malloc's between iterations just * reallocing if necessary because count grew by too much. 
*/ - if ((bitmap = (u_int32_t *)__db_calloc((size_t)count, - sizeof(u_int32_t) * nentries)) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - return (ENOMEM); - } + if ((ret = __os_calloc((size_t)count, + sizeof(u_int32_t) * nentries, &bitmap)) != 0) + return (ret); - if ((tmpmap = - (u_int32_t *)__db_calloc(sizeof(u_int32_t), nentries)) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - __db_free(bitmap); - return (ENOMEM); + if ((ret = __os_calloc(sizeof(u_int32_t), nentries, &tmpmap)) != 0) { + __os_free(bitmap, sizeof(u_int32_t) * nentries); + return (ret); } - if ((id_array = (locker_info *)__db_calloc((size_t)count, - sizeof(locker_info))) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - __db_free(bitmap); - __db_free(tmpmap); - return (ENOMEM); + if ((ret = + __os_calloc((size_t)count, sizeof(locker_info), &id_array)) != 0) { + __os_free(bitmap, count * sizeof(u_int32_t) * nentries); + __os_free(tmpmap, sizeof(u_int32_t) * nentries); + return (ret); } /* @@ -256,9 +253,9 @@ retry: count = lt->region->nlockers; */ LOCK_LOCKREGION(lt); if (lt->region->nlockers > count) { - __db_free(bitmap); - __db_free(tmpmap); - __db_free(id_array); + __os_free(bitmap, count * sizeof(u_int32_t) * nentries); + __os_free(tmpmap, sizeof(u_int32_t) * nentries); + __os_free(id_array, count * sizeof(locker_info)); goto retry; } @@ -383,7 +380,7 @@ retry: count = lt->region->nlockers; *nlockers = id; *idmap = id_array; *bmp = bitmap; - __db_free(tmpmap); + __os_free(tmpmap, sizeof(u_int32_t) * nentries); return (0); } @@ -434,8 +431,21 @@ __dd_abort(dbenv, info) goto out; lockp = SH_LIST_FIRST(&lockerp->heldby, __db_lock); - if (LOCK_TO_OFFSET(lt, lockp) != info->last_lock || - lockp == NULL || lockp->status != DB_LSTAT_WAITING) + + /* + * It's possible that this locker was already aborted. + * If that's the case, make sure that we remove its + * locker from the hash table. 
+ */ + if (lockp == NULL) { + HASHREMOVE_EL(lt->hashtab, __db_lockobj, + links, lockerp, lt->region->table_size, __lock_lhash); + SH_TAILQ_INSERT_HEAD(&lt->region->free_objs, + lockerp, links, __db_lockobj); + lt->region->nlockers--; + goto out; + } else if (LOCK_TO_OFFSET(lt, lockp) != info->last_lock || + lockp->status != DB_LSTAT_WAITING) goto out; /* Abort lock, take it off list, and wake up this lock. */ @@ -460,17 +470,17 @@ __dd_debug(dbenv, idmap, bitmap, nlockers) u_int32_t *bitmap, nlockers; { u_int32_t i, j, *mymap, nentries; + int ret; char *msgbuf; __db_err(dbenv, "Waitsfor array"); __db_err(dbenv, "waiter\twaiting on"); - /* - * Allocate space to print 10 bytes per item waited on. - */ - if ((msgbuf = (char *)__db_malloc((nlockers + 1) * 10 + 64)) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); + + /* Allocate space to print 10 bytes per item waited on. */ +#undef MSGBUF_LEN +#define MSGBUF_LEN ((nlockers + 1) * 10 + 64) + if ((ret = __os_malloc(MSGBUF_LEN, NULL, &msgbuf)) != 0) return; - } nentries = ALIGN(nlockers, 32) / 32; for (mymap = bitmap, i = 0; i < nlockers; i++, mymap += nentries) { @@ -487,6 +497,6 @@ __dd_debug(dbenv, idmap, bitmap, nlockers) __db_err(dbenv, msgbuf); } - __db_free(msgbuf); + __os_free(msgbuf, MSGBUF_LEN); } #endif diff --git a/db2/lock/lock_region.c b/db2/lock/lock_region.c index b597560744..613a6cefb2 100644 --- a/db2/lock/lock_region.c +++ b/db2/lock/lock_region.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)lock_region.c 10.15 (Sleepycat) 6/2/98"; +static const char sccsid[] = "@(#)lock_region.c 10.21 (Sleepycat) 10/19/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -29,7 +29,8 @@ static u_int32_t __lock_count_locks __P((DB_LOCKREGION *)); static u_int32_t __lock_count_objs __P((DB_LOCKREGION *)); static void __lock_dump_locker __P((DB_LOCKTAB *, DB_LOCKOBJ *, FILE *)); static void __lock_dump_object __P((DB_LOCKTAB *, DB_LOCKOBJ *, FILE *)); -static const char
*__lock_dump_status __P((db_status_t)); +static const char * + __lock_dump_status __P((db_status_t)); static void __lock_reset_region __P((DB_LOCKTAB *)); static int __lock_tabinit __P((DB_ENV *, DB_LOCKREGION *)); @@ -55,10 +56,8 @@ lock_open(path, flags, mode, dbenv, ltp) return (ret); /* Create the lock table structure. */ - if ((lt = (DB_LOCKTAB *)__db_calloc(1, sizeof(DB_LOCKTAB))) == NULL) { - __db_err(dbenv, "%s", strerror(ENOMEM)); - return (ENOMEM); - } + if ((ret = __os_calloc(1, sizeof(DB_LOCKTAB), &lt)) != 0) + return (ret); lt->dbenv = dbenv; /* Grab the values that we need to compute the region size. */ @@ -82,7 +81,7 @@ lock_open(path, flags, mode, dbenv, ltp) if (path == NULL) lt->reginfo.path = NULL; else - if ((lt->reginfo.path = (char *)__db_strdup(path)) == NULL) + if ((ret = __os_strdup(path, &lt->reginfo.path)) != 0) goto err; lt->reginfo.file = DB_DEFAULT_LOCK_FILE; lt->reginfo.mode = mode; @@ -147,12 +146,27 @@ err: if (lt->reginfo.addr != NULL) { } if (lt->reginfo.path != NULL) - FREES(lt->reginfo.path); - FREE(lt, sizeof(*lt)); + __os_freestr(lt->reginfo.path); + __os_free(lt, sizeof(*lt)); return (ret); } /* + * __lock_panic -- + * Panic a lock region. + * + * PUBLIC: void __lock_panic __P((DB_ENV *)); + */ +void +__lock_panic(dbenv) + DB_ENV *dbenv; +{ + if (dbenv->lk_info != NULL) + dbenv->lk_info->region->hdr.panic = 1; +} + + +/* * __lock_tabinit -- * Initialize the lock region.
*/ @@ -254,12 +268,14 @@ lock_close(lt) { int ret; + LOCK_PANIC_CHECK(lt); + if ((ret = __db_rdetach(&lt->reginfo)) != 0) return (ret); if (lt->reginfo.path != NULL) - FREES(lt->reginfo.path); - FREE(lt, sizeof(*lt)); + __os_freestr(lt->reginfo.path); + __os_free(lt, sizeof(*lt)); return (0); } @@ -276,12 +292,12 @@ lock_unlink(path, force, dbenv) memset(&reginfo, 0, sizeof(reginfo)); reginfo.dbenv = dbenv; reginfo.appname = DB_APP_NONE; - if (path != NULL && (reginfo.path = (char *)__db_strdup(path)) == NULL) - return (ENOMEM); + if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0) + return (ret); reginfo.file = DB_DEFAULT_LOCK_FILE; ret = __db_runlink(&reginfo, force); if (reginfo.path != NULL) - FREES(reginfo.path); + __os_freestr(reginfo.path); return (ret); } @@ -463,13 +479,14 @@ lock_stat(lt, gspp, db_malloc) void *(*db_malloc) __P((size_t)); { DB_LOCKREGION *rp; + int ret; *gspp = NULL; - if ((*gspp = db_malloc == NULL ? - (DB_LOCK_STAT *)__db_malloc(sizeof(**gspp)) : - (DB_LOCK_STAT *)db_malloc(sizeof(**gspp))) == NULL) - return (ENOMEM); + LOCK_PANIC_CHECK(lt); + + if ((ret = __os_malloc(sizeof(**gspp), db_malloc, gspp)) != 0) + return (ret); /* Copy out the global statistics.
*/ LOCK_LOCKREGION(lt); @@ -632,15 +649,15 @@ __lock_dump_region(lt, area, fp) for (lp = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock); lp != NULL; lp = SH_TAILQ_NEXT(lp, links, __db_lock)) - fprintf(fp, "0x%x: %lu\t%lu\t%s\t0x%x\n", (u_int)lp, + fprintf(fp, "0x%lx: %lu\t%lu\t%s\t0x%lx\n", (u_long)lp, (u_long)lp->holder, (u_long)lp->mode, - __lock_dump_status(lp->status), (u_int)lp->obj); + __lock_dump_status(lp->status), (u_long)lp->obj); fprintf(fp, "%s\nObject free list\n", DB_LINE); for (op = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj); op != NULL; op = SH_TAILQ_NEXT(op, links, __db_lockobj)) - fprintf(fp, "0x%x\n", (u_int)op); + fprintf(fp, "0x%lx\n", (u_long)op); } if (LF_ISSET(LOCK_DUMP_MEM)) diff --git a/db2/lock/lock_util.c b/db2/lock/lock_util.c index 7274a50422..29da75b8a8 100644 --- a/db2/lock/lock_util.c +++ b/db2/lock/lock_util.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)lock_util.c 10.9 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)lock_util.c 10.10 (Sleepycat) 9/20/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -75,7 +75,7 @@ __lock_locker_cmp(locker, lock_obj) * fast path the case where we think we are doing a hash on a DB page/fileid * pair. If the size is right, then we do the fast hash. * - * We know that DB uses struct __db_ilocks for its lock objects. The first + * We know that DB uses DB_LOCK_ILOCK types for its lock objects. 
The first * four bytes are the 4-byte page number and the next DB_FILE_ID_LEN bytes * are a unique file id, where the first 4 bytes on UNIX systems are the file * inode number, and the first 4 bytes on Windows systems are the FileIndexLow @@ -107,7 +107,7 @@ u_int32_t __lock_ohash(dbt) const DBT *dbt; { - if (dbt->size == sizeof(struct __db_ilock)) + if (dbt->size == sizeof(DB_LOCK_ILOCK)) FAST_HASH(dbt->data); return (__ham_func5(dbt->data, dbt->size)); @@ -131,7 +131,7 @@ __lock_lhash(lock_obj) return (tmp); } - if (lock_obj->lockobj.size == sizeof(struct __db_ilock)) + if (lock_obj->lockobj.size == sizeof(DB_LOCK_ILOCK)) FAST_HASH(obj_data); return (__ham_func5(obj_data, lock_obj->lockobj.size)); diff --git a/db2/log/log.c b/db2/log/log.c index d642c9f9ef..ad15f16aef 100644 --- a/db2/log/log.c +++ b/db2/log/log.c @@ -7,13 +7,14 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log.c 10.54 (Sleepycat) 5/31/98"; +static const char sccsid[] = "@(#)log.c 10.63 (Sleepycat) 10/10/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <shqueue.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -23,6 +24,7 @@ static const char sccsid[] = "@(#)log.c 10.54 (Sleepycat) 5/31/98"; #include "shqueue.h" #include "log.h" #include "db_dispatch.h" +#include "txn.h" #include "txn_auto.h" #include "common_ext.h" @@ -54,13 +56,11 @@ log_open(path, flags, mode, dbenv, lpp) return (ret); /* Create and initialize the DB_LOG structure. 
*/ - if ((dblp = (DB_LOG *)__db_calloc(1, sizeof(DB_LOG))) == NULL) - return (ENOMEM); + if ((ret = __os_calloc(1, sizeof(DB_LOG), &dblp)) != 0) + return (ret); - if (path != NULL && (dblp->dir = __db_strdup(path)) == NULL) { - ret = ENOMEM; + if (path != NULL && (ret = __os_strdup(path, &dblp->dir)) != 0) goto err; - } dblp->dbenv = dbenv; dblp->lfd = -1; @@ -80,7 +80,7 @@ log_open(path, flags, mode, dbenv, lpp) if (path == NULL) dblp->reginfo.path = NULL; else - if ((dblp->reginfo.path = __db_strdup(path)) == NULL) + if ((ret = __os_strdup(path, &dblp->reginfo.path)) != 0) goto err; dblp->reginfo.file = DB_DEFAULT_LOG_FILE; dblp->reginfo.mode = mode; @@ -122,7 +122,7 @@ log_open(path, flags, mode, dbenv, lpp) if ((ret = __db_shalloc(dblp->addr, sizeof(db_mutex_t), MUTEX_ALIGNMENT, &dblp->mutexp)) != 0) goto err; - (void)__db_mutex_init(dblp->mutexp, -1); + (void)__db_mutex_init(dblp->mutexp, 0); } /* @@ -148,14 +148,28 @@ err: if (dblp->reginfo.addr != NULL) { } if (dblp->reginfo.path != NULL) - FREES(dblp->reginfo.path); + __os_freestr(dblp->reginfo.path); if (dblp->dir != NULL) - FREES(dblp->dir); - FREE(dblp, sizeof(*dblp)); + __os_freestr(dblp->dir); + __os_free(dblp, sizeof(*dblp)); return (ret); } /* + * __log_panic -- + * Panic a log. + * + * PUBLIC: void __log_panic __P((DB_ENV *)); + */ +void +__log_panic(dbenv) + DB_ENV *dbenv; +{ + if (dbenv->lg_info != NULL) + dbenv->lg_info->lp->rlayout.panic = 1; +} + +/* * __log_recover -- * Recover a log. */ @@ -212,12 +226,12 @@ __log_recover(dblp) } /* - * We know where the end of the log is. Since that record is on disk, - * it's also the last-synced LSN. + * We now know where the end of the log is. Set the first LSN that + * we want to return to an application and the LSN of the last known + * record on disk. */ - lp->lsn = lsn; + lp->lsn = lp->s_lsn = lsn; lp->lsn.offset += dblp->c_len; - lp->s_lsn = lp->lsn; /* Set up the current buffer information, too. 
*/ lp->len = dblp->c_len; @@ -250,13 +264,23 @@ __log_recover(dblp) } } } + /* + * Reset the cursor lsn to the beginning of the log, so that an + * initial call to DB_NEXT does the right thing. + */ + ZERO_LSN(dblp->c_lsn); /* If we never find a checkpoint, that's okay, just 0 it out. */ if (!found_checkpoint) ZERO_LSN(lp->chkpt_lsn); + /* + * !!! + * The test suite explicitly looks for this string -- don't change + * it here unless you also change it there. + */ __db_err(dblp->dbenv, - "Recovering the log: last valid LSN: file: %lu offset %lu", + "Finding last valid log LSN: file: %lu offset %lu", (u_long)lp->lsn.file, (u_long)lp->lsn.offset); return (0); @@ -275,14 +299,15 @@ __log_find(dblp, find_first, valp) DB_LOG *dblp; int find_first, *valp; { - int cnt, fcnt, logval, ret; + u_int32_t clv, logval; + int cnt, fcnt, ret; const char *dir; char **names, *p, *q; *valp = 0; /* Find the directory name. */ - if ((ret = __log_name(dblp, 1, &p)) != 0) + if ((ret = __log_name(dblp, 1, &p, NULL, 0)) != 0) return (ret); if ((q = __db_rpath(p)) == NULL) dir = PATH_DOT; @@ -292,8 +317,8 @@ __log_find(dblp, find_first, valp) } /* Get the list of file names. */ - ret = __db_dirlist(dir, &names, &fcnt); - FREES(p); + ret = __os_dirlist(dir, &names, &fcnt); + __os_freestr(p); if (ret != 0) { __db_err(dblp->dbenv, "%s: %s", dir, strerror(ret)); return (ret); @@ -302,29 +327,31 @@ __log_find(dblp, find_first, valp) /* * Search for a valid log file name, return a value of 0 on * failure. + * + * XXX + * Assumes that atoi(3) returns a 32-bit number. 
*/ - for (cnt = fcnt, logval = 0; --cnt >= 0;) - if (strncmp(names[cnt], "log.", sizeof("log.") - 1) == 0) { - logval = atoi(names[cnt] + 4); - if (logval != 0 && - __log_valid(dblp, dblp->lp, logval) == 0) - break; - } + for (cnt = fcnt, clv = logval = 0; --cnt >= 0;) { + if (strncmp(names[cnt], LFPREFIX, sizeof(LFPREFIX) - 1) != 0) + continue; + + clv = atoi(names[cnt] + (sizeof(LFPREFIX) - 1)); + if (find_first) { + if (logval != 0 && clv > logval) + continue; + } else + if (logval != 0 && clv < logval) + continue; + + if (__log_valid(dblp, clv, 1) == 0) + logval = clv; + } - /* Discard the list. */ - __db_dirfree(names, fcnt); - - /* We have a valid log file, find either the first or last one. */ - if (find_first) { - for (; logval > 0; --logval) - if (__log_valid(dblp, dblp->lp, logval - 1) != 0) - break; - } else - for (; logval < MAXLFNAME; ++logval) - if (__log_valid(dblp, dblp->lp, logval + 1) != 0) - break; *valp = logval; + /* Discard the list. */ + __os_dirfree(names, fcnt); + return (0); } @@ -332,62 +359,68 @@ __log_find(dblp, find_first, valp) * log_valid -- * Validate a log file. * - * PUBLIC: int __log_valid __P((DB_LOG *, LOG *, int)); + * PUBLIC: int __log_valid __P((DB_LOG *, u_int32_t, int)); */ int -__log_valid(dblp, lp, cnt) +__log_valid(dblp, number, set_persist) DB_LOG *dblp; - LOG *lp; - int cnt; + u_int32_t number; + int set_persist; { LOGP persist; ssize_t nw; + char *fname; int fd, ret; - char *p; - if ((ret = __log_name(dblp, cnt, &p)) != 0) + /* Try to open the log file. */ + if ((ret = __log_name(dblp, + number, &fname, &fd, DB_RDONLY | DB_SEQUENTIAL)) != 0) { + __os_freestr(fname); return (ret); + } - fd = -1; - if ((ret = __db_open(p, - DB_RDONLY | DB_SEQUENTIAL, - DB_RDONLY | DB_SEQUENTIAL, 0, &fd)) != 0 || - (ret = __db_seek(fd, 0, 0, sizeof(HDR), 0, SEEK_SET)) != 0 || - (ret = __db_read(fd, &persist, sizeof(LOGP), &nw)) != 0 || + /* Try to read the header. 
*/ + if ((ret = __os_seek(fd, 0, 0, sizeof(HDR), 0, SEEK_SET)) != 0 || + (ret = __os_read(fd, &persist, sizeof(LOGP), &nw)) != 0 || nw != sizeof(LOGP)) { if (ret == 0) ret = EIO; - if (fd != -1) { - (void)__db_close(fd); - __db_err(dblp->dbenv, - "Ignoring log file: %s: %s", p, strerror(ret)); - } + + (void)__os_close(fd); + + __db_err(dblp->dbenv, + "Ignoring log file: %s: %s", fname, strerror(ret)); goto err; } - (void)__db_close(fd); + (void)__os_close(fd); + /* Validate the header. */ if (persist.magic != DB_LOGMAGIC) { __db_err(dblp->dbenv, "Ignoring log file: %s: magic number %lx, not %lx", - p, (u_long)persist.magic, (u_long)DB_LOGMAGIC); + fname, (u_long)persist.magic, (u_long)DB_LOGMAGIC); ret = EINVAL; goto err; } if (persist.version < DB_LOGOLDVER || persist.version > DB_LOGVERSION) { __db_err(dblp->dbenv, "Ignoring log file: %s: unsupported log version %lu", - p, (u_long)persist.version); + fname, (u_long)persist.version); ret = EINVAL; goto err; } - if (lp != NULL) { - lp->persist.lg_max = persist.lg_max; - lp->persist.mode = persist.mode; + /* + * If we're going to use this log file, set the region's persistent + * information based on the headers. + */ + if (set_persist) { + dblp->lp->persist.lg_max = persist.lg_max; + dblp->lp->persist.mode = persist.mode; } ret = 0; -err: FREES(p); +err: __os_freestr(fname); return (ret); } @@ -401,6 +434,11 @@ log_close(dblp) { int ret, t_ret; + LOG_PANIC_CHECK(dblp); + + /* We may have opened files as part of XA; if so, close them. */ + __log_close_files(dblp); + /* Discard the per-thread pointer. */ if (dblp->mutexp != NULL) { LOCK_LOGREGION(dblp); @@ -412,21 +450,22 @@ log_close(dblp) ret = __db_rdetach(&dblp->reginfo); /* Close open files, release allocated memory. 
*/ - if (dblp->lfd != -1 && (t_ret = __db_close(dblp->lfd)) != 0 && ret == 0) + if (dblp->lfd != -1 && (t_ret = __os_close(dblp->lfd)) != 0 && ret == 0) ret = t_ret; if (dblp->c_dbt.data != NULL) - FREE(dblp->c_dbt.data, dblp->c_dbt.ulen); + __os_free(dblp->c_dbt.data, dblp->c_dbt.ulen); if (dblp->c_fd != -1 && - (t_ret = __db_close(dblp->c_fd)) != 0 && ret == 0) + (t_ret = __os_close(dblp->c_fd)) != 0 && ret == 0) ret = t_ret; if (dblp->dbentry != NULL) - FREE(dblp->dbentry, (dblp->dbentry_cnt * sizeof(DB_ENTRY))); + __os_free(dblp->dbentry, + (dblp->dbentry_cnt * sizeof(DB_ENTRY))); if (dblp->dir != NULL) - FREES(dblp->dir); + __os_freestr(dblp->dir); if (dblp->reginfo.path != NULL) - FREES(dblp->reginfo.path); - FREE(dblp, sizeof(*dblp)); + __os_freestr(dblp->reginfo.path); + __os_free(dblp, sizeof(*dblp)); return (ret); } @@ -447,12 +486,12 @@ log_unlink(path, force, dbenv) memset(&reginfo, 0, sizeof(reginfo)); reginfo.dbenv = dbenv; reginfo.appname = DB_APP_LOG; - if (path != NULL && (reginfo.path = __db_strdup(path)) == NULL) - return (ENOMEM); + if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0) + return (ret); reginfo.file = DB_DEFAULT_LOG_FILE; ret = __db_runlink(&reginfo, force); if (reginfo.path != NULL) - FREES(reginfo.path); + __os_freestr(reginfo.path); return (ret); } @@ -467,14 +506,15 @@ log_stat(dblp, gspp, db_malloc) void *(*db_malloc) __P((size_t)); { DB_LOG *lp; + int ret; *gspp = NULL; lp = dblp->lp; - if ((*gspp = db_malloc == NULL ? - (DB_LOG_STAT *)__db_malloc(sizeof(**gspp)) : - (DB_LOG_STAT *)db_malloc(sizeof(**gspp))) == NULL) - return (ENOMEM); + LOG_PANIC_CHECK(dblp); + + if ((ret = __os_malloc(sizeof(**gspp), db_malloc, gspp)) != 0) + return (ret); /* Copy out the global statistics.
*/ LOCK_LOGREGION(dblp); diff --git a/db2/log/log_archive.c b/db2/log/log_archive.c index 7db0cc3e36..9f3b24d8e3 100644 --- a/db2/log/log_archive.c +++ b/db2/log/log_archive.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log_archive.c 10.37 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)log_archive.c 10.44 (Sleepycat) 10/9/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -49,8 +49,11 @@ log_archive(dblp, listp, flags, db_malloc) int array_size, n, ret; char **array, **arrayp, *name, *p, *pref, buf[MAXPATHLEN]; + name = NULL; COMPQUIET(fnum, 0); + LOG_PANIC_CHECK(dblp); + #define OKFLAGS (DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG) if (flags != 0) { if ((ret = @@ -84,7 +87,7 @@ log_archive(dblp, listp, flags, db_malloc) if ((ret = log_get(dblp, &stable_lsn, &rec, DB_LAST)) != 0) return (ret); if (F_ISSET(dblp, DB_AM_THREAD)) - __db_free(rec.data); + __os_free(rec.data, rec.size); fnum = stable_lsn.file; break; case 0: @@ -106,40 +109,40 @@ log_archive(dblp, listp, flags, db_malloc) #define LIST_INCREMENT 64 /* Get some initial space. */ - if ((array = - (char **)__db_malloc(sizeof(char *) * (array_size = 10))) == NULL) - return (ENOMEM); + array_size = 10; + if ((ret = __os_malloc(sizeof(char *) * array_size, NULL, &array)) != 0) + return (ret); array[0] = NULL; /* Build an array of the file names. 
*/ for (n = 0; fnum > 0; --fnum) { - if ((ret = __log_name(dblp, fnum, &name)) != 0) + if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) goto err; - if (__db_exists(name, NULL) != 0) + if (__os_exists(name, NULL) != 0) { + __os_freestr(name); + name = NULL; break; + } if (n >= array_size - 1) { array_size += LIST_INCREMENT; - if ((array = (char **)__db_realloc(array, - sizeof(char *) * array_size)) == NULL) { - ret = ENOMEM; + if ((ret = __os_realloc(&array, + sizeof(char *) * array_size)) != 0) goto err; - } } if (LF_ISSET(DB_ARCH_ABS)) { if ((ret = __absname(pref, name, &array[n])) != 0) goto err; - FREES(name); + __os_freestr(name); } else if ((p = __db_rpath(name)) != NULL) { - if ((array[n] = (char *)__db_strdup(p + 1)) == NULL) { - ret = ENOMEM; + if ((ret = __os_strdup(p + 1, &array[n])) != 0) goto err; - } - FREES(name); + __os_freestr(name); } else array[n] = name; + name = NULL; array[++n] = NULL; } @@ -162,9 +165,11 @@ log_archive(dblp, listp, flags, db_malloc) err: if (array != NULL) { for (arrayp = array; *arrayp != NULL; ++arrayp) - FREES(*arrayp); - __db_free(array); + __os_freestr(*arrayp); + __os_free(array, sizeof(char *) * array_size); } + if (name != NULL) + __os_freestr(name); return (ret); } @@ -186,9 +191,9 @@ __build_data(dblp, pref, listp, db_malloc) char **array, **arrayp, *p, *real_name; /* Get some initial space. 
*/ - if ((array = - (char **)__db_malloc(sizeof(char *) * (array_size = 10))) == NULL) - return (ENOMEM); + array_size = 10; + if ((ret = __os_malloc(sizeof(char *) * array_size, NULL, &array)) != 0) + return (ret); array[0] = NULL; memset(&rec, 0, sizeof(rec)); @@ -205,7 +210,7 @@ __build_data(dblp, pref, listp, db_malloc) memcpy(&rectype, rec.data, sizeof(rectype)); if (rectype != DB_log_register) { if (F_ISSET(dblp, DB_AM_THREAD)) { - __db_free(rec.data); + __os_free(rec.data, rec.size); rec.data = NULL; } continue; @@ -219,25 +224,22 @@ __build_data(dblp, pref, listp, db_malloc) if (n >= array_size - 1) { array_size += LIST_INCREMENT; - if ((array = (char **)__db_realloc(array, - sizeof(char *) * array_size)) == NULL) { - ret = ENOMEM; + if ((ret = __os_realloc(&array, + sizeof(char *) * array_size)) != 0) goto lg_free; - } } - if ((array[n] = (char *)__db_strdup(argp->name.data)) == NULL) { - ret = ENOMEM; + if ((ret = __os_strdup(argp->name.data, &array[n])) != 0) { lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) - __db_free(rec.data); + __os_free(rec.data, rec.size); goto err1; } array[++n] = NULL; - __db_free(argp); + __os_free(argp, 0); if (F_ISSET(dblp, DB_AM_THREAD)) { - __db_free(rec.data); + __os_free(rec.data, rec.size); rec.data = NULL; } } @@ -268,7 +270,7 @@ lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) } for (++nxt; nxt < n && strcmp(array[last], array[nxt]) == 0; ++nxt) { - FREES(array[nxt]); + __os_freestr(array[nxt]); array[nxt] = NULL; } @@ -278,25 +280,25 @@ lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) goto err2; /* If the file doesn't exist, ignore it. */ - if (__db_exists(real_name, NULL) != 0) { - FREES(real_name); - FREES(array[last]); + if (__os_exists(real_name, NULL) != 0) { + __os_freestr(real_name); + __os_freestr(array[last]); array[last] = NULL; continue; } /* Rework the name as requested by the user. 
*/ - FREES(array[last]); + __os_freestr(array[last]); array[last] = NULL; if (pref != NULL) { ret = __absname(pref, real_name, &array[last]); - FREES(real_name); + __os_freestr(real_name); if (ret != 0) goto err2; } else if ((p = __db_rpath(real_name)) != NULL) { - array[last] = (char *)__db_strdup(p + 1); - FREES(real_name); - if (array[last] == NULL) + ret = __os_strdup(p + 1, &array[last]); + __os_freestr(real_name); + if (ret != 0) goto err2; } else array[last] = real_name; @@ -320,13 +322,13 @@ err2: /* */ if (array != NULL) for (; nxt < n; ++nxt) - FREES(array[nxt]); + __os_freestr(array[nxt]); /* FALLTHROUGH */ err1: if (array != NULL) { for (arrayp = array; *arrayp != NULL; ++arrayp) - FREES(*arrayp); - __db_free(array); + __os_freestr(*arrayp); + __os_free(array, array_size * sizeof(char *)); } return (ret); } @@ -340,17 +342,17 @@ __absname(pref, name, newnamep) char *pref, *name, **newnamep; { size_t l_pref, l_name; - int isabspath; + int isabspath, ret; char *newname; l_name = strlen(name); - isabspath = __db_abspath(name); + isabspath = __os_abspath(name); l_pref = isabspath ? 0 : strlen(pref); /* Malloc space for concatenating the two. */ - if ((*newnamep = - newname = (char *)__db_malloc(l_pref + l_name + 2)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(l_pref + l_name + 2, NULL, &newname)) != 0) + return (ret); + *newnamep = newname; /* Build the name. If `name' is an absolute path, ignore any prefix. */ if (!isabspath) { @@ -369,11 +371,12 @@ __absname(pref, name, newnamep) * If the user has their own malloc routine, use it. */ static int -__usermem(listp, cmpfunc) +__usermem(listp, db_malloc) char ***listp; - void *(*cmpfunc) __P((size_t)); + void *(*db_malloc) __P((size_t)); { size_t len; + int ret; char **array, **arrayp, **orig, *strp; /* Find out how much space we need. */ @@ -381,18 +384,10 @@ __usermem(listp, cmpfunc) len += sizeof(char *) + strlen(*orig) + 1; len += sizeof(char *); - /* - * Allocate it and set up the pointers. 
- * - * XXX - * Don't simplify this expression, SunOS compilers don't like it. - */ - if (cmpfunc == NULL) - array = (char **)__db_malloc(len); - else - array = (char **)cmpfunc(len); - if (array == NULL) - return (ENOMEM); + /* Allocate it and set up the pointers. */ + if ((ret = __os_malloc(len, db_malloc, &array)) != 0) + return (ret); + strp = (char *)(array + (orig - *listp) + 1); /* Copy the original information into the new memory. */ @@ -402,13 +397,13 @@ __usermem(listp, cmpfunc) *arrayp = strp; strp += len + 1; - FREES(*orig); + __os_freestr(*orig); } /* NULL-terminate the list. */ *arrayp = NULL; - __db_free(*listp); + __os_free(*listp, 0); *listp = array; return (0); diff --git a/db2/log/log_auto.c b/db2/log/log_auto.c index b17b1ffb2f..92e682661c 100644 --- a/db2/log/log_auto.c +++ b/db2/log/log_auto.c @@ -10,7 +10,6 @@ #endif #include "db_int.h" -#include "shqueue.h" #include "db_page.h" #include "db_dispatch.h" #include "log.h" @@ -43,8 +42,7 @@ int __log_register_log(logp, txnid, ret_lsnp, flags, rectype = DB_log_register; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; @@ -54,8 +52,8 @@ int __log_register_log(logp, txnid, ret_lsnp, flags, + sizeof(u_int32_t) + (uid == NULL ? 
0 : uid->size) + sizeof(id) + sizeof(ftype); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -97,7 +95,7 @@ int __log_register_log(logp, txnid, ret_lsnp, flags, ret = __log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -155,7 +153,7 @@ __log_register_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tid: %lu\n", (u_long)argp->id); printf("\tftype: 0x%lx\n", (u_long)argp->ftype); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -169,11 +167,12 @@ __log_register_read(recbuf, argpp) { __log_register_args *argp; u_int8_t *bp; + int ret; - argp = (__log_register_args *)__db_malloc(sizeof(__log_register_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__log_register_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); diff --git a/db2/log/log_findckp.c b/db2/log/log_findckp.c index 1f717b49e7..ab13c8380e 100644 --- a/db2/log/log_findckp.c +++ b/db2/log/log_findckp.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log_findckp.c 10.15 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)log_findckp.c 10.17 (Sleepycat) 9/17/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -28,7 +28,10 @@ static const char sccsid[] = "@(#)log_findckp.c 10.15 (Sleepycat) 4/26/98"; * __log_findckp -- * * Looks for the most recent checkpoint that occurs before the most recent - * checkpoint LSN. This is the point from which recovery can start and the + * checkpoint LSN, subject to the constraint that there must be at least two + * checkpoints. 
The reason you need two checkpoints is that you might have + * crashed during the most recent one and may not have a copy of all the + * open files. This is the point from which recovery can start and the * point up to which archival/truncation can take place. Checkpoints in * the log look like: * @@ -56,7 +59,7 @@ __log_findckp(lp, lsnp) DB_LSN *lsnp; { DBT data; - DB_LSN ckp_lsn, last_ckp, next_lsn; + DB_LSN ckp_lsn, final_ckp, last_ckp, next_lsn; __txn_ckp_args *ckp_args; int ret, verbose; @@ -77,16 +80,17 @@ __log_findckp(lp, lsnp) return (ret); } + final_ckp = last_ckp; next_lsn = last_ckp; do { if (F_ISSET(lp, DB_AM_THREAD)) - __db_free(data.data); + __os_free(data.data, data.size); if ((ret = log_get(lp, &next_lsn, &data, DB_SET)) != 0) return (ret); if ((ret = __txn_ckp_read(data.data, &ckp_args)) != 0) { if (F_ISSET(lp, DB_AM_THREAD)) - __db_free(data.data); + __os_free(data.data, data.size); return (ret); } if (IS_ZERO_LSN(ckp_lsn)) @@ -103,12 +107,19 @@ __log_findckp(lp, lsnp) } last_ckp = next_lsn; next_lsn = ckp_args->last_ckp; - __db_free(ckp_args); + __os_free(ckp_args, sizeof(*ckp_args)); + + /* + * Keep looping until either you 1) run out of checkpoints, + * 2) you've found a checkpoint before the most recent + * checkpoint's LSN and you have at least 2 checkpoints. + */ } while (!IS_ZERO_LSN(next_lsn) && - log_compare(&last_ckp, &ckp_lsn) > 0); + (log_compare(&last_ckp, &ckp_lsn) > 0 || + log_compare(&final_ckp, &last_ckp) == 0)); if (F_ISSET(lp, DB_AM_THREAD)) - __db_free(data.data); + __os_free(data.data, data.size); /* * At this point, either, next_lsn is ZERO or ckp_lsn is the @@ -117,11 +128,12 @@ __log_findckp(lp, lsnp) * next_lsn must be 0 and we need to roll forward from the * beginning of the log. 
*/ - if (log_compare(&last_ckp, &ckp_lsn) > 0) { + if (log_compare(&last_ckp, &ckp_lsn) > 0 || + log_compare(&final_ckp, &last_ckp) == 0) { get_first: if ((ret = log_get(lp, &last_ckp, &data, DB_FIRST)) != 0) return (ret); if (F_ISSET(lp, DB_AM_THREAD)) - __db_free(data.data); + __os_free(data.data, data.size); } *lsnp = last_ckp; diff --git a/db2/log/log_get.c b/db2/log/log_get.c index 84ddca1c73..de81519a7c 100644 --- a/db2/log/log_get.c +++ b/db2/log/log_get.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log_get.c 10.32 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)log_get.c 10.38 (Sleepycat) 10/3/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -38,26 +38,16 @@ log_get(dblp, alsn, dbt, flags) { int ret; + LOG_PANIC_CHECK(dblp); + /* Validate arguments. */ -#define OKFLAGS (DB_CHECKPOINT | \ - DB_CURRENT | DB_FIRST | DB_LAST | DB_NEXT | DB_PREV | DB_SET) - if ((ret = __db_fchk(dblp->dbenv, "log_get", flags, OKFLAGS)) != 0) - return (ret); - switch (flags) { - case DB_CHECKPOINT: - case DB_CURRENT: - case DB_FIRST: - case DB_LAST: - case DB_NEXT: - case DB_PREV: - case DB_SET: - break; - default: + if (flags != DB_CHECKPOINT && flags != DB_CURRENT && + flags != DB_FIRST && flags != DB_LAST && + flags != DB_NEXT && flags != DB_PREV && flags != DB_SET) return (__db_ferr(dblp->dbenv, "log_get", 1)); - } if (F_ISSET(dblp, DB_AM_THREAD)) { - if (LF_ISSET(DB_NEXT | DB_PREV | DB_CURRENT)) + if (flags == DB_NEXT || flags == DB_PREV || flags == DB_CURRENT) return (__db_ferr(dblp->dbenv, "log_get", 1)); if (!F_ISSET(dbt, DB_DBT_USERMEM | DB_DBT_MALLOC)) return (__db_ferr(dblp->dbenv, "threaded data", 1)); @@ -156,7 +146,7 @@ __log_get(dblp, alsn, dbt, flags, silent) /* If at start-of-file, move to the previous file. 
*/ if (nlsn.offset == 0) { if (nlsn.file == 1 || - __log_valid(dblp, NULL, nlsn.file - 1) != 0) + __log_valid(dblp, nlsn.file - 1, 0) != 0) return (DB_NOTFOUND); --nlsn.file; @@ -183,7 +173,7 @@ retry: /* If we've switched files, discard the current fd. */ if (dblp->c_lsn.file != nlsn.file && dblp->c_fd != -1) { - (void)__db_close(dblp->c_fd); + (void)__os_close(dblp->c_fd); dblp->c_fd = -1; } @@ -203,24 +193,22 @@ retry: /* Acquire a file descriptor. */ if (dblp->c_fd == -1) { - if ((ret = __log_name(dblp, nlsn.file, &np)) != 0) - goto err1; - if ((ret = __db_open(np, DB_RDONLY | DB_SEQUENTIAL, - DB_RDONLY | DB_SEQUENTIAL, 0, &dblp->c_fd)) != 0) { + if ((ret = __log_name(dblp, nlsn.file, + &np, &dblp->c_fd, DB_RDONLY | DB_SEQUENTIAL)) != 0) { fail = np; goto err1; } - __db_free(np); + __os_freestr(np); np = NULL; } /* Seek to the header offset and read the header. */ if ((ret = - __db_seek(dblp->c_fd, 0, 0, nlsn.offset, 0, SEEK_SET)) != 0) { + __os_seek(dblp->c_fd, 0, 0, nlsn.offset, 0, SEEK_SET)) != 0) { fail = "seek"; goto err1; } - if ((ret = __db_read(dblp->c_fd, &hdr, sizeof(HDR), &nr)) != 0) { + if ((ret = __os_read(dblp->c_fd, &hdr, sizeof(HDR), &nr)) != 0) { fail = "read"; goto err1; } @@ -276,10 +264,8 @@ retry: * We're calling malloc(3) with a region locked. This isn't * a good idea. */ - if ((tbuf = (char *)__db_malloc(len)) == NULL) { - ret = ENOMEM; + if ((ret = __os_malloc(len, NULL, &tbuf)) != 0) goto err1; - } /* * Read the record into the buffer. If read returns a short count, @@ -287,7 +273,7 @@ retry: * buffer. Note, the information may be garbage if we're in recovery, * so don't read past the end of the buffer's memory. 
*/ - if ((ret = __db_read(dblp->c_fd, tbuf, len, &nr)) != 0) { + if ((ret = __os_read(dblp->c_fd, tbuf, len, &nr)) != 0) { fail = "read"; goto err1; } @@ -305,7 +291,7 @@ retry: if ((ret = __db_retcopy(dbt, tbuf, len, &dblp->c_dbt.data, &dblp->c_dbt.ulen, NULL)) != 0) goto err1; - __db_free(tbuf); + __os_free(tbuf, 0); tbuf = NULL; cksum: if (hdr.cksum != __ham_func4(dbt->data, dbt->size)) { @@ -329,7 +315,7 @@ corrupt:/* ret = EIO; fail = "read"; - err1: if (!silent) { +err1: if (!silent) { if (fail == NULL) __db_err(dblp->dbenv, "log_get: %s", strerror(ret)); else @@ -337,8 +323,8 @@ corrupt:/* "log_get: %s: %s", fail, strerror(ret)); } err2: if (np != NULL) - __db_free(np); + __os_freestr(np); if (tbuf != NULL) - __db_free(tbuf); + __os_free(tbuf, 0); return (ret); } diff --git a/db2/log/log_put.c b/db2/log/log_put.c index 5ef2294af5..86de6b0d1d 100644 --- a/db2/log/log_put.c +++ b/db2/log/log_put.c @@ -7,13 +7,14 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log_put.c 10.35 (Sleepycat) 5/6/98"; +static const char sccsid[] = "@(#)log_put.c 10.44 (Sleepycat) 11/3/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <stdio.h> #include <string.h> #include <time.h> #include <unistd.h> @@ -24,6 +25,7 @@ static const char sccsid[] = "@(#)log_put.c 10.35 (Sleepycat) 5/6/98"; #include "db_page.h" #include "log.h" #include "hash.h" +#include "clib_ext.h" #include "common_ext.h" static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t)); @@ -45,22 +47,12 @@ log_put(dblp, lsn, dbt, flags) { int ret; + LOG_PANIC_CHECK(dblp); + /* Validate arguments. 
*/ -#define OKFLAGS (DB_CHECKPOINT | DB_FLUSH | DB_CURLSN) - if (flags != 0) { - if ((ret = - __db_fchk(dblp->dbenv, "log_put", flags, OKFLAGS)) != 0) - return (ret); - switch (flags) { - case DB_CHECKPOINT: - case DB_CURLSN: - case DB_FLUSH: - case 0: - break; - default: - return (__db_ferr(dblp->dbenv, "log_put", 1)); - } - } + if (flags != 0 && flags != DB_CHECKPOINT && + flags != DB_CURLSN && flags != DB_FLUSH) + return (__db_ferr(dblp->dbenv, "log_put", 0)); LOCK_LOGREGION(dblp); ret = __log_put(dblp, lsn, dbt, flags); @@ -95,7 +87,7 @@ __log_put(dblp, lsn, dbt, flags) * the information. Currently used by the transaction manager * to avoid writing TXN_begin records. */ - if (LF_ISSET(DB_CURLSN)) { + if (flags == DB_CURLSN) { lsn->file = lp->lsn.file; lsn->offset = lp->lsn.offset; return (0); @@ -165,6 +157,8 @@ __log_put(dblp, lsn, dbt, flags) for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { + if (fnp->ref == 0) /* Entry not in use. */ + continue; memset(&t, 0, sizeof(t)); t.data = R_ADDR(dblp, fnp->name_off); t.size = strlen(t.data) + 1; @@ -248,6 +242,8 @@ log_flush(dblp, lsn) { int ret; + LOG_PANIC_CHECK(dblp); + LOCK_LOGREGION(dblp); ret = __log_flush(dblp, lsn); UNLOCK_LOGREGION(dblp); @@ -304,8 +300,7 @@ __log_flush(dblp, lsn) * buffer's starting LSN. */ current = 0; - if (lp->b_off != 0 && - lsn->file >= lp->f_lsn.file && lsn->offset >= lp->f_lsn.offset) { + if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) { if ((ret = __log_write(dblp, lp->buf, lp->b_off)) != 0) return (ret); @@ -322,8 +317,10 @@ __log_flush(dblp, lsn) return (ret); /* Sync all writes to disk. 
*/ - if ((ret = __db_fsync(dblp->lfd)) != 0) + if ((ret = __os_fsync(dblp->lfd)) != 0) { + __db_panic(dblp->dbenv, ret); return (ret); + } ++lp->stat.st_scount; /* @@ -331,9 +328,16 @@ __log_flush(dblp, lsn) * the current buffer was flushed, we know the LSN of the first byte * of the buffer is on disk, otherwise, we only know that the LSN of * the record before the one beginning the current buffer is on disk. + * + * XXX + * Check to make sure that the saved lsn isn't 0 before we go making + * this change. If DB_CHECKPOINT was called before we actually wrote + * something, you can end up here without ever having written anything + * to a log file, and decrementing either s_lsn.file or s_lsn.offset + * will cause much sadness later on. */ lp->s_lsn = lp->f_lsn; - if (!current) { + if (!current && lp->s_lsn.file != 0) { if (lp->s_lsn.offset == 0) { --lp->s_lsn.file; lp->s_lsn.offset = lp->persist.lg_max; @@ -431,10 +435,11 @@ __log_write(dblp, addr, len) * Seek to the offset in the file (someone may have written it * since we last did). */ - if ((ret = __db_seek(dblp->lfd, 0, 0, lp->w_off, 0, SEEK_SET)) != 0) - return (ret); - if ((ret = __db_write(dblp->lfd, addr, len, &nw)) != 0) + if ((ret = __os_seek(dblp->lfd, 0, 0, lp->w_off, 0, SEEK_SET)) != 0 || + (ret = __os_write(dblp->lfd, addr, len, &nw)) != 0) { + __db_panic(dblp->dbenv, ret); return (ret); + } if (nw != (int32_t)len) return (EIO); @@ -467,21 +472,23 @@ log_file(dblp, lsn, namep, len) size_t len; { int ret; - char *p; + char *name; + + LOG_PANIC_CHECK(dblp); LOCK_LOGREGION(dblp); - ret = __log_name(dblp, lsn->file, &p); + ret = __log_name(dblp, lsn->file, &name, NULL, 0); UNLOCK_LOGREGION(dblp); if (ret != 0) return (ret); /* Check to make sure there's enough room and copy the name. 
*/ - if (len < strlen(p) + 1) { + if (len < strlen(name) + 1) { *namep = '\0'; return (ENOMEM); } - (void)strcpy(namep, p); - __db_free(p); + (void)strcpy(namep, name); + __os_freestr(name); return (0); } @@ -495,43 +502,102 @@ __log_newfd(dblp) DB_LOG *dblp; { int ret; - char *p; + char *name; /* Close any previous file descriptor. */ if (dblp->lfd != -1) { - (void)__db_close(dblp->lfd); + (void)__os_close(dblp->lfd); dblp->lfd = -1; } /* Get the path of the new file and open it. */ dblp->lfname = dblp->lp->lsn.file; - if ((ret = __log_name(dblp, dblp->lfname, &p)) != 0) - return (ret); - if ((ret = __db_open(p, - DB_CREATE | DB_SEQUENTIAL, - DB_CREATE | DB_SEQUENTIAL, - dblp->lp->persist.mode, &dblp->lfd)) != 0) - __db_err(dblp->dbenv, - "log_put: %s: %s", p, strerror(ret)); - FREES(p); + if ((ret = __log_name(dblp, + dblp->lfname, &name, &dblp->lfd, DB_CREATE | DB_SEQUENTIAL)) != 0) + __db_err(dblp->dbenv, "log_put: %s: %s", name, strerror(ret)); + + __os_freestr(name); return (ret); } /* * __log_name -- - * Return the log name for a particular file. + * Return the log name for a particular file, and optionally open it. * - * PUBLIC: int __log_name __P((DB_LOG *, int, char **)); + * PUBLIC: int __log_name __P((DB_LOG *, u_int32_t, char **, int *, u_int32_t)); */ int -__log_name(dblp, filenumber, namep) +__log_name(dblp, filenumber, namep, fdp, flags) DB_LOG *dblp; + u_int32_t filenumber, flags; char **namep; - int filenumber; + int *fdp; { - char name[sizeof(LFNAME) + 10]; + int ret; + char *oname; + char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20]; + + /* + * !!! + * The semantics of this routine are bizarre. + * + * The reason for all of this is that we need a place where we can + * intercept requests for log files, and, if appropriate, check for + * both the old-style and new-style log file names. 
The trick is + * that all callers of this routine that are opening the log file + * read-only want to use an old-style file name if they can't find + * a match using a new-style name. The only down-side is that some + * callers may check for the old-style when they really don't need + * to, but that shouldn't mess up anything, and we only check for + * the old-style name when we've already failed to find a new-style + * one. + * + * Create a new-style file name, and if we're not going to open the + * file, return regardless. + */ + (void)snprintf(new, sizeof(new), LFNAME, filenumber); + if ((ret = __db_appname(dblp->dbenv, + DB_APP_LOG, dblp->dir, new, 0, NULL, namep)) != 0 || fdp == NULL) + return (ret); - (void)snprintf(name, sizeof(name), LFNAME, filenumber); - return (__db_appname(dblp->dbenv, - DB_APP_LOG, dblp->dir, name, 0, NULL, namep)); + /* Open the new-style file -- if we succeed, we're done. */ + if ((ret = __db_open(*namep, + flags, flags, dblp->lp->persist.mode, fdp)) == 0) + return (0); + + /* + * The open failed... if the DB_RDONLY flag isn't set, we're done, + * the caller isn't interested in old-style files. + */ + if (!LF_ISSET(DB_RDONLY)) + return (ret); + + /* Create an old-style file name. */ + (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber); + if ((ret = __db_appname(dblp->dbenv, + DB_APP_LOG, dblp->dir, old, 0, NULL, &oname)) != 0) + goto err; + + /* + * Open the old-style file -- if we succeed, we're done. Free the + * space allocated for the new-style name and return the old-style + * name to the caller. + */ + if ((ret = __db_open(oname, + flags, flags, dblp->lp->persist.mode, fdp)) == 0) { + __os_freestr(*namep); + *namep = oname; + return (0); + } + + /* + * Couldn't find either style of name -- return the new-style name + * for the caller's error message. 
If it's an old-style name that's + * actually missing we're going to confuse the user with the error + * message, but that implies that not only were we looking for an + * old-style name, but we expected it to exist and we weren't just + * looking for any log file. That's not a likely error. + */ +err: __os_freestr(oname); + return (ret); } diff --git a/db2/log/log_rec.c b/db2/log/log_rec.c index 5deac46298..8895150be1 100644 --- a/db2/log/log_rec.c +++ b/db2/log/log_rec.c @@ -40,7 +40,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log_rec.c 10.20 (Sleepycat) 4/28/98"; +static const char sccsid[] = "@(#)log_rec.c 10.26 (Sleepycat) 10/21/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -56,8 +56,10 @@ static const char sccsid[] = "@(#)log_rec.c 10.20 (Sleepycat) 4/28/98"; #include "db_dispatch.h" #include "common_ext.h" -static int __log_open_file __P((DB_LOG *, +static int __log_do_open __P((DB_LOG *, u_int8_t *, char *, DBTYPE, u_int32_t)); +static int __log_lid_to_fname __P((DB_LOG *, u_int32_t, FNAME **)); +static int __log_open_file __P((DB_LOG *, __log_register_args *)); /* * PUBLIC: int __log_register_recover @@ -80,7 +82,7 @@ __log_register_recover(logp, dbtp, lsnp, redo, info) COMPQUIET(info, NULL); COMPQUIET(lsnp, NULL); - F_SET(logp, DB_AM_RECOVER); + F_SET(logp, DBC_RECOVER); if ((ret = __log_register_read(dbtp->data, &argp)) != 0) goto out; @@ -95,13 +97,11 @@ __log_register_recover(logp, dbtp, lsnp, redo, info) * If we are redoing an open or undoing a close, then we need * to open a file. 
*/ - ret = __log_open_file(logp, - argp->uid.data, argp->name.data, argp->ftype, argp->id); + ret = __log_open_file(logp, argp); if (ret == ENOENT) { if (redo == TXN_OPENFILES) - __db_err(logp->dbenv, - "warning: file %s not found", - argp->name.data); + __db_err(logp->dbenv, "warning: %s: %s", + argp->name.data, strerror(ENOENT)); ret = 0; } } else if (argp->opcode != LOG_CHECKPOINT) { @@ -109,26 +109,42 @@ __log_register_recover(logp, dbtp, lsnp, redo, info) * If we are redoing a close or undoing an open, then we need * to close the file. * - * If the file is deleted, then we can just ignore this close. - * Otherwise, we'd better have a valid dbp that we should either - * close or whose reference count should be decremented. + * If the file is deleted, then we can just ignore this close. + * Otherwise, we should usually have a valid dbp we should + * close or whose reference count should be decremented. + * However, if we shut down without closing a file, we + * may, in fact, not have the file open, and that's OK. */ LOCK_LOGTHREAD(logp); - if (logp->dbentry[argp->id].dbp == NULL) { - if (!logp->dbentry[argp->id].deleted) - ret = EINVAL; - } else if (--logp->dbentry[argp->id].refcount == 0) { - F_SET(logp->dbentry[argp->id].dbp, DB_AM_RECOVER); + if (logp->dbentry[argp->id].dbp != NULL && + --logp->dbentry[argp->id].refcount == 0) { ret = logp->dbentry[argp->id].dbp->close( logp->dbentry[argp->id].dbp, 0); logp->dbentry[argp->id].dbp = NULL; } UNLOCK_LOGTHREAD(logp); + } else if (redo == TXN_UNDO && + (argp->id >= logp->dbentry_cnt || + (!logp->dbentry[argp->id].deleted && + logp->dbentry[argp->id].dbp == NULL))) { + /* + * It's a checkpoint and we are rolling backward. It + * is possible that the system was shut down and thus + * ended with a stable checkpoint; this file was never + * closed and has therefore not been reopened yet. If + * so, we need to try to open it. 
+ */ + ret = __log_open_file(logp, argp); + if (ret == ENOENT) { + __db_err(logp->dbenv, "warning: %s: %s", + argp->name.data, strerror(ENOENT)); + ret = 0; + } } -out: F_CLR(logp, DB_AM_RECOVER); +out: F_CLR(logp, DBC_RECOVER); if (argp != NULL) - __db_free(argp); + __os_free(argp, 0); return (ret); } @@ -140,34 +156,49 @@ out: F_CLR(logp, DB_AM_RECOVER); * Returns 0 on success, non-zero on error. */ static int -__log_open_file(lp, uid, name, ftype, ndx) +__log_open_file(lp, argp) DB_LOG *lp; - u_int8_t *uid; - char *name; - DBTYPE ftype; - u_int32_t ndx; + __log_register_args *argp; { - DB *dbp; - int ret; - LOCK_LOGTHREAD(lp); - if (ndx < lp->dbentry_cnt && - (lp->dbentry[ndx].deleted == 1 || lp->dbentry[ndx].dbp != NULL)) { - lp->dbentry[ndx].refcount++; + if (argp->id < lp->dbentry_cnt && + (lp->dbentry[argp->id].deleted == 1 || + lp->dbentry[argp->id].dbp != NULL)) { + if (argp->opcode != LOG_CHECKPOINT) + lp->dbentry[argp->id].refcount++; UNLOCK_LOGTHREAD(lp); return (0); } UNLOCK_LOGTHREAD(lp); + return (__log_do_open(lp, + argp->uid.data, argp->name.data, argp->ftype, argp->id)); +} + +/* + * __log_do_open -- + * Open files referenced in the log. This is the part of the open that + * is not protected by the thread mutex. + */ + +static int +__log_do_open(lp, uid, name, ftype, ndx) + DB_LOG *lp; + u_int8_t *uid; + char *name; + DBTYPE ftype; + u_int32_t ndx; +{ + DB *dbp; + int ret; - /* Need to open file. */ dbp = NULL; if ((ret = db_open(name, ftype, 0, 0, lp->dbenv, NULL, &dbp)) == 0) { /* * Verify that we are opening the same file that we were * referring to when we wrote this log record. 
*/ - if (memcmp(uid, dbp->lock.fileid, DB_FILE_ID_LEN) != 0) { + if (memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) { (void)dbp->close(dbp, 0); dbp = NULL; ret = ENOENT; @@ -181,10 +212,9 @@ __log_open_file(lp, uid, name, ftype, ndx) } /* - * This function returns: - * 0 SUCCESS (the entry was not previously set and is now set or the - * entry was previously set and we just inced the ref count. - * >0 on system error (returns errno value). + * __log_add_logid -- + * Adds a DB entry to the log's DB entry table. + * * PUBLIC: int __log_add_logid __P((DB_LOG *, DB *, u_int32_t)); */ int @@ -193,43 +223,30 @@ __log_add_logid(logp, dbp, ndx) DB *dbp; u_int32_t ndx; { - DB_ENTRY *temp_entryp; u_int32_t i; int ret; ret = 0; LOCK_LOGTHREAD(logp); + /* - * Check if we need to grow the table. + * Check if we need to grow the table. Note, ndx is 0-based (the + * index into the DB entry table) an dbentry_cnt is 1-based, the + * number of available slots. */ if (logp->dbentry_cnt <= ndx) { - if (logp->dbentry_cnt == 0) { - logp->dbentry = (DB_ENTRY *) - __db_malloc(DB_GROW_SIZE * sizeof(DB_ENTRY)); - if (logp->dbentry == NULL) { - ret = ENOMEM; - goto err; - } - } else { - temp_entryp = (DB_ENTRY *)__db_realloc(logp->dbentry, - (DB_GROW_SIZE + logp->dbentry_cnt) * - sizeof(DB_ENTRY)); - if (temp_entryp == NULL) { - ret = ENOMEM; - goto err; - } - logp->dbentry = temp_entryp; + if ((ret = __os_realloc(&logp->dbentry, + (ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY))) != 0) + goto err; - } /* Initialize the new entries. 
*/ - for (i = logp->dbentry_cnt; - i < logp->dbentry_cnt + DB_GROW_SIZE; i++) { + for (i = logp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) { logp->dbentry[i].dbp = NULL; logp->dbentry[i].deleted = 0; } - logp->dbentry_cnt += DB_GROW_SIZE; + logp->dbentry_cnt = i; } if (logp->dbentry[ndx].deleted == 0 && logp->dbentry[ndx].dbp == NULL) { @@ -257,11 +274,47 @@ __db_fileid_to_db(logp, dbpp, ndx) u_int32_t ndx; { int ret; + char *name; + FNAME *fname; ret = 0; LOCK_LOGTHREAD(logp); /* + * Under XA, a process different than the one issuing DB + * operations may abort a transaction. In this case, + * recovery routines are run by a process that does not + * necessarily have the file open. In this case, we must + * open the file explicitly. + */ + if (ndx >= logp->dbentry_cnt || + (!logp->dbentry[ndx].deleted && logp->dbentry[ndx].dbp == NULL)) { + if (__log_lid_to_fname(logp, ndx, &fname) != 0) { + /* Couldn't find entry; this is a fatal error. */ + ret = EINVAL; + goto err; + } + name = R_ADDR(logp, fname->name_off); + /* + * __log_do_open is called without protection of the + * log thread lock. + */ + UNLOCK_LOGTHREAD(logp); + /* + * At this point, we are not holding the thread lock, so + * exit directly instead of going through the exit code + * at the bottom. If the __log_do_open succeeded, then + * we don't need to do any of the remaining error checking + * at the end of this routine. + */ + if ((ret = __log_do_open(logp, + fname->ufid, name, fname->s_type, ndx)) != 0) + return (ret); + *dbpp = logp->dbentry[ndx].dbp; + return (0); + } + + /* * Return DB_DELETED if the file has been deleted * (it's not an error). 
*/ @@ -294,8 +347,12 @@ __log_close_files(logp) LOCK_LOGTHREAD(logp); for (i = 0; i < logp->dbentry_cnt; i++) - if (logp->dbentry[i].dbp) + if (logp->dbentry[i].dbp) { logp->dbentry[i].dbp->close(logp->dbentry[i].dbp, 0); + logp->dbentry[i].dbp = NULL; + logp->dbentry[i].deleted = 0; + } + F_CLR(logp, DBC_RECOVER); UNLOCK_LOGTHREAD(logp); } @@ -314,3 +371,28 @@ __log_rem_logid(logp, ndx) } UNLOCK_LOGTHREAD(logp); } + +/* + * __log_lid_to_fname -- + * Traverse the shared-memory region looking for the entry that + * matches the passed log fileid. Returns 0 on success; -1 on error. + */ +static int +__log_lid_to_fname(dblp, lid, fnamep) + DB_LOG *dblp; + u_int32_t lid; + FNAME **fnamep; +{ + FNAME *fnp; + + for (fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); + fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { + if (fnp->ref == 0) /* Entry not in use. */ + continue; + if (fnp->id == lid) { + *fnamep = fnp; + return (0); + } + } + return (-1); +} diff --git a/db2/log/log_register.c b/db2/log/log_register.c index a6fc4c1b3b..22264e3291 100644 --- a/db2/log/log_register.c +++ b/db2/log/log_register.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)log_register.c 10.18 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)log_register.c 10.22 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -36,17 +36,18 @@ log_register(dblp, dbp, name, type, idp) { DBT fid_dbt, r_name; DB_LSN r_unused; - FNAME *fnp; + FNAME *fnp, *reuse_fnp; size_t len; - u_int32_t fid; + u_int32_t maxid; int inserted, ret; char *fullname; void *namep; - fid = 0; inserted = 0; fullname = NULL; - fnp = namep = NULL; + fnp = namep = reuse_fnp = NULL; + + LOG_PANIC_CHECK(dblp); /* Check the arguments. */ if (type != DB_BTREE && type != DB_HASH && type != DB_RECNO) { @@ -63,26 +64,37 @@ log_register(dblp, dbp, name, type, idp) /* * See if we've already got this file in the log, finding the - * next-to-lowest file id currently in use as we do it. 
+ * (maximum+1) in-use file id and some available file id (if we + * find an available fid, we'll use it, else we'll have to allocate + * one after the maximum that we found). */ - for (fid = 1, fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); + for (maxid = 0, fnp = SH_TAILQ_FIRST(&dblp->lp->fq, __fname); fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { - if (fid <= fnp->id) - fid = fnp->id + 1; - if (!memcmp(dbp->lock.fileid, fnp->ufid, DB_FILE_ID_LEN)) { + if (fnp->ref == 0) { /* Entry is not in use. */ + if (reuse_fnp == NULL) + reuse_fnp = fnp; + continue; + } + if (!memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN)) { ++fnp->ref; - fid = fnp->id; goto found; } + if (maxid <= fnp->id) + maxid = fnp->id + 1; } - /* Allocate a new file name structure. */ - if ((ret = __db_shalloc(dblp->addr, sizeof(FNAME), 0, &fnp)) != 0) + /* Fill in fnp structure. */ + + if (reuse_fnp != NULL) /* Reuse existing one. */ + fnp = reuse_fnp; + else if ((ret = __db_shalloc(dblp->addr, sizeof(FNAME), 0, &fnp)) != 0) goto err; + else /* Allocate a new one. */ + fnp->id = maxid; + fnp->ref = 1; - fnp->id = fid; fnp->s_type = type; - memcpy(fnp->ufid, dbp->lock.fileid, DB_FILE_ID_LEN); + memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN); len = strlen(name) + 1; if ((ret = __db_shalloc(dblp->addr, len, 0, &namep)) != 0) @@ -90,20 +102,22 @@ log_register(dblp, dbp, name, type, idp) fnp->name_off = R_OFFSET(dblp, namep); memcpy(namep, name, len); - SH_TAILQ_INSERT_HEAD(&dblp->lp->fq, fnp, q, __fname); + /* Only do the insert if we allocated a new fnp. */ + if (reuse_fnp == NULL) + SH_TAILQ_INSERT_HEAD(&dblp->lp->fq, fnp, q, __fname); inserted = 1; found: /* Log the registry. */ - if (!F_ISSET(dblp, DB_AM_RECOVER)) { + if (!F_ISSET(dblp, DBC_RECOVER)) { r_name.data = (void *)name; /* XXX: Yuck! 
*/ r_name.size = strlen(name) + 1; memset(&fid_dbt, 0, sizeof(fid_dbt)); - fid_dbt.data = dbp->lock.fileid; + fid_dbt.data = dbp->fileid; fid_dbt.size = DB_FILE_ID_LEN; if ((ret = __log_register_log(dblp, NULL, &r_unused, - 0, LOG_OPEN, &r_name, &fid_dbt, fid, type)) != 0) + 0, LOG_OPEN, &r_name, &fid_dbt, fnp->id, type)) != 0) goto err; - if ((ret = __log_add_logid(dblp, dbp, fid)) != 0) + if ((ret = __log_add_logid(dblp, dbp, fnp->id)) != 0) goto err; } @@ -120,13 +134,13 @@ err: /* __db_shalloc_free(dblp->addr, fnp); } + if (idp != NULL) + *idp = fnp->id; UNLOCK_LOGREGION(dblp); if (fullname != NULL) - FREES(fullname); + __os_freestr(fullname); - if (idp != NULL) - *idp = fid; return (ret); } @@ -144,6 +158,8 @@ log_unregister(dblp, fid) FNAME *fnp; int ret; + LOG_PANIC_CHECK(dblp); + ret = 0; LOCK_LOGREGION(dblp); @@ -159,7 +175,7 @@ log_unregister(dblp, fid) } /* Unlog the registry. */ - if (!F_ISSET(dblp, DB_AM_RECOVER)) { + if (!F_ISSET(dblp, DBC_RECOVER)) { memset(&r_name, 0, sizeof(r_name)); r_name.data = R_ADDR(dblp, fnp->name_off); r_name.size = strlen(r_name.data) + 1; @@ -173,22 +189,18 @@ log_unregister(dblp, fid) /* * If more than 1 reference, just decrement the reference and return. - * Otherwise, free the unique file information, name and structure. + * Otherwise, free the name. */ - if (fnp->ref > 1) - --fnp->ref; - else { + --fnp->ref; + if (fnp->ref == 0) __db_shalloc_free(dblp->addr, R_ADDR(dblp, fnp->name_off)); - SH_TAILQ_REMOVE(&dblp->lp->fq, fnp, q, __fname); - __db_shalloc_free(dblp->addr, fnp); - } /* * Remove from the process local table. If this operation is taking * place during recovery, then the logid was never added to the table, * so do not remove it. 
*/ - if (!F_ISSET(dblp, DB_AM_RECOVER)) + if (!F_ISSET(dblp, DBC_RECOVER)) __log_rem_logid(dblp, fid); ret1: UNLOCK_LOGREGION(dblp); diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c index d89f9c2ded..12c53417d9 100644 --- a/db2/mp/mp_bh.c +++ b/db2/mp/mp_bh.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_bh.c 10.38 (Sleepycat) 5/20/98"; +static const char sccsid[] = "@(#)mp_bh.c 10.45 (Sleepycat) 11/25/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -42,11 +42,13 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) { DB_MPOOLFILE *dbmfp; DB_MPREG *mpreg; + int incremented, ret; if (restartp != NULL) *restartp = 0; if (wrotep != NULL) *wrotep = 0; + incremented = 0; /* * Walk the process' DB_MPOOLFILE list and find a file descriptor for @@ -63,6 +65,13 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) UNLOCKHANDLE(dbmp, dbmp->mutexp); return (0); } + + /* + * Increment the reference count -- see the comment in + * memp_fclose(). + */ + ++dbmfp->ref; + incremented = 1; break; } UNLOCKHANDLE(dbmp, dbmp->mutexp); @@ -117,7 +126,15 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) return (0); -found: return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep)); +found: ret = __memp_pgwrite(dbmfp, bhp, restartp, wrotep); + + if (incremented) { + LOCKHANDLE(dbmp, dbmp->mutexp); + --dbmfp->ref; + UNLOCKHANDLE(dbmp, dbmp->mutexp); + } + + return (ret); } /* @@ -132,11 +149,12 @@ __memp_pgread(dbmfp, bhp, can_create) BH *bhp; int can_create; { + DB_IO db_io; DB_MPOOL *dbmp; MPOOLFILE *mfp; - size_t pagesize; + size_t len, pagesize; ssize_t nr; - int ret; + int created, ret; dbmp = dbmfp->dbmp; mfp = dbmfp->mfp; @@ -147,70 +165,63 @@ __memp_pgread(dbmfp, bhp, can_create) UNLOCKREGION(dbmp); /* - * Temporary files may not yet have been created. - * - * Seek to the page location. + * Temporary files may not yet have been created. 
We don't create + * them now, we create them when the pages have to be flushed. */ - ret = 0; - LOCKHANDLE(dbmp, dbmfp->mutexp); - if (dbmfp->fd == -1 || (ret = - __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, 0, SEEK_SET)) != 0) { - if (!can_create) { - if (dbmfp->fd == -1) - ret = EINVAL; - UNLOCKHANDLE(dbmp, dbmfp->mutexp); + nr = 0; + if (dbmfp->fd == -1) + ret = 0; + else { + /* + * Ignore read errors if we have permission to create the page. + * Assume that the page doesn't exist, and that we'll create it + * when we write it out. + */ + db_io.fd_io = dbmfp->fd; + db_io.fd_lock = dbmp->reginfo.fd; + db_io.mutexp = + F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL; + db_io.pagesize = db_io.bytes = pagesize; + db_io.pgno = bhp->pgno; + db_io.buf = bhp->buf; + + ret = __os_io(&db_io, DB_IO_READ, &nr); + } + + created = 0; + if (nr < (ssize_t)pagesize) { + if (can_create) + created = 1; + else { + /* If we had a short read, ret may be 0. */ + if (ret == 0) + ret = EIO; __db_err(dbmp->dbenv, "%s: page %lu doesn't exist, create flag not set", __memp_fn(dbmfp), (u_long)bhp->pgno); goto err; } - UNLOCKHANDLE(dbmp, dbmfp->mutexp); - - /* Clear the created page. */ - if (mfp->clear_len == 0) - memset(bhp->buf, 0, pagesize); - else { - memset(bhp->buf, 0, mfp->clear_len); -#ifdef DIAGNOSTIC - memset(bhp->buf + mfp->clear_len, - 0xff, pagesize - mfp->clear_len); -#endif - } - - goto pgin; } /* - * Read the page; short reads are treated like creates, although - * any valid data is preserved. + * Clear any bytes we didn't read that need to be cleared. If we're + * running in diagnostic mode, smash any bytes on the page that are + * unknown quantities for the caller. 
*/ - ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr); - UNLOCKHANDLE(dbmp, dbmfp->mutexp); - if (ret != 0) - goto err; - - if (nr == (ssize_t)pagesize) - can_create = 0; - else { - if (!can_create) { - ret = EINVAL; - goto err; - } - - /* - * If we didn't fail until we tried the read, don't clear the - * whole page, it wouldn't be insane for a filesystem to just - * always behave that way. Else, clear any uninitialized data. - */ - if (nr == 0) - memset(bhp->buf, 0, - mfp->clear_len == 0 ? pagesize : mfp->clear_len); - else - memset(bhp->buf + nr, 0, pagesize - nr); + if (nr != (ssize_t)pagesize) { + len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; + if (nr < (ssize_t)len) + memset(bhp->buf + nr, 0, len - nr); +#ifdef DIAGNOSTIC + if (nr > (ssize_t)len) + len = nr; + if (len < pagesize) + memset(bhp->buf + len, 0xdb, pagesize - len); +#endif } /* Call any pgin function. */ -pgin: ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); + ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); /* Unlock the buffer and reacquire the region lock. */ err: UNLOCKBUFFER(dbmp, bhp); @@ -225,7 +236,7 @@ err: UNLOCKBUFFER(dbmp, bhp); F_CLR(bhp, BH_TRASH); /* Update the statistics. */ - if (can_create) { + if (created) { ++dbmp->mp->stat.st_page_create; ++mfp->stat.st_page_create; } else { @@ -250,12 +261,12 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep) int *restartp, *wrotep; { DB_ENV *dbenv; + DB_IO db_io; DB_LOG *lg_info; DB_LSN lsn; DB_MPOOL *dbmp; MPOOL *mp; MPOOLFILE *mfp; - size_t pagesize; ssize_t nw; int callpgin, ret, syncfail; const char *fail; @@ -270,7 +281,6 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep) if (wrotep != NULL) *wrotep = 0; callpgin = 0; - pagesize = mfp->stat.st_pagesize; /* * Check the dirty bit -- this buffer may have been written since we @@ -326,34 +336,32 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep) } /* Temporary files may not yet have been created. 
*/ - LOCKHANDLE(dbmp, dbmfp->mutexp); - if (dbmfp->fd == -1 && - ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL, - DB_CREATE | DB_EXCL | DB_TEMPORARY, &dbmfp->fd, NULL)) != 0 || - dbmfp->fd == -1)) { + if (dbmfp->fd == -1) { + LOCKHANDLE(dbmp, dbmfp->mutexp); + if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, + DB_APP_TMP, NULL, NULL, DB_CREATE | DB_EXCL | DB_TEMPORARY, + &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) { + UNLOCKHANDLE(dbmp, dbmfp->mutexp); + __db_err(dbenv, + "unable to create temporary backing file"); + goto err; + } UNLOCKHANDLE(dbmp, dbmfp->mutexp); - __db_err(dbenv, "unable to create temporary backing file"); - goto err; } - /* - * Write the page out. - * - * XXX - * Shut the compiler up; it doesn't understand the correlation between - * the failing clauses to __db_lseek and __db_write and this ret != 0. - */ - COMPQUIET(fail, NULL); - if ((ret = - __db_seek(dbmfp->fd, pagesize, bhp->pgno, 0, 0, SEEK_SET)) != 0) - fail = "seek"; - else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0) + /* Write the page. */ + db_io.fd_io = dbmfp->fd; + db_io.fd_lock = dbmp->reginfo.fd; + db_io.mutexp = F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL; + db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; + db_io.pgno = bhp->pgno; + db_io.buf = bhp->buf; + if ((ret = __os_io(&db_io, DB_IO_WRITE, &nw)) != 0) { + __db_panic(dbenv, ret); fail = "write"; - UNLOCKHANDLE(dbmp, dbmfp->mutexp); - if (ret != 0) goto syserr; - - if (nw != (ssize_t)pagesize) { + } + if (nw != (ssize_t)mfp->stat.st_pagesize) { ret = EIO; fail = "write"; goto syserr; @@ -394,7 +402,7 @@ __memp_pgwrite(dbmfp, bhp, restartp, wrotep) if (F_ISSET(bhp, BH_WRITE)) { if (mfp->lsn_cnt == 1) { UNLOCKREGION(dbmp); - syncfail = __db_fsync(dbmfp->fd) != 0; + syncfail = __os_fsync(dbmfp->fd) != 0; LOCKREGION(dbmp); if (syncfail) F_SET(mp, MP_LSN_RETRY); @@ -574,11 +582,11 @@ __memp_upgrade(dbmp, dbmfp, mfp) ret = 1; } else { /* Swap the descriptors and set the upgrade flag. 
*/ - (void)__db_close(dbmfp->fd); + (void)__os_close(dbmfp->fd); dbmfp->fd = fd; F_SET(dbmfp, MP_UPGRADE); ret = 0; } - FREES(rpath); + __os_freestr(rpath); return (ret); } diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c index 0777aa7dc6..f159dc2d3e 100644 --- a/db2/mp/mp_fget.c +++ b/db2/mp/mp_fget.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_fget.c 10.48 (Sleepycat) 6/2/98"; +static const char sccsid[] = "@(#)mp_fget.c 10.53 (Sleepycat) 11/16/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -46,6 +46,8 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) mp = dbmp->mp; mfp = dbmfp->mfp; + MP_PANIC_CHECK(dbmp); + /* * Validate arguments. * @@ -79,12 +81,11 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) #ifdef DIAGNOSTIC /* * XXX - * We want to switch threads as often as possible. Sleep every time - * we get a new page to make it more likely. + * We want to switch threads as often as possible. Yield every time + * we get a new page to ensure contention. */ - if (DB_GLOBAL(db_pageyield) && - (__db_yield == NULL || __db_yield() != 0)) - __db_sleep(0, 1); + if (DB_GLOBAL(db_pageyield)) + __os_yield(1); #endif /* Initialize remaining local variables. */ @@ -205,8 +206,8 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) * up running to the end of our CPU quantum as we will * simply be swapping between the two locks. */ - if (!first && (__db_yield == NULL || __db_yield() != 0)) - __db_sleep(0, 1); + if (!first) + __os_yield(1); LOCKBUFFER(dbmp, bhp); /* Wait for I/O to finish... */ @@ -240,7 +241,7 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) } alloc: /* Allocate new buffer header and data space. */ - if ((ret = __memp_ralloc(dbmp, sizeof(BH) - + if ((ret = __memp_alloc(dbmp, sizeof(BH) - sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0) goto err; @@ -285,7 +286,7 @@ alloc: /* Allocate new buffer header and data space. 
*/ else { memset(bhp->buf, 0, mfp->clear_len); #ifdef DIAGNOSTIC - memset(bhp->buf + mfp->clear_len, 0xff, + memset(bhp->buf + mfp->clear_len, 0xdb, mfp->stat.st_pagesize - mfp->clear_len); #endif } @@ -335,11 +336,9 @@ done: /* Update the chain search statistics. */ mp->stat.st_hash_examined += st_hsearch; } - UNLOCKREGION(dbmp); - - LOCKHANDLE(dbmp, dbmfp->mutexp); ++dbmfp->pinref; - UNLOCKHANDLE(dbmp, dbmfp->mutexp); + + UNLOCKREGION(dbmp); return (0); diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c index a4cbac8d4e..dd02662fd8 100644 --- a/db2/mp/mp_fopen.c +++ b/db2/mp/mp_fopen.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_fopen.c 10.47 (Sleepycat) 5/4/98"; +static const char sccsid[] = "@(#)mp_fopen.c 10.60 (Sleepycat) 1/1/99"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -43,6 +43,8 @@ memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp) { int ret; + MP_PANIC_CHECK(dbmp); + /* Validate arguments. */ if ((ret = __db_fchk(dbmp->dbenv, "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0) @@ -53,6 +55,8 @@ memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp) __db_err(dbmp->dbenv, "memp_fopen: pagesize not specified"); return (EINVAL); } + if (finfop != NULL && finfop->clear_len > pagesize) + return (EINVAL); return (__memp_fopen(dbmp, NULL, path, flags, mode, pagesize, 1, finfop, retp)); @@ -80,7 +84,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) DB_MPOOLFILE *dbmfp; DB_MPOOL_FINFO finfo; db_pgno_t last_pgno; - size_t size; + size_t maxmap; u_int32_t mbytes, bytes; int ret; u_int8_t idbuf[DB_FILE_ID_LEN]; @@ -115,13 +119,11 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) } /* Allocate and initialize the per-process structure. 
*/ - if ((dbmfp = - (DB_MPOOLFILE *)__db_calloc(1, sizeof(DB_MPOOLFILE))) == NULL) { - __db_err(dbenv, "memp_fopen: %s", strerror(ENOMEM)); - return (ENOMEM); - } + if ((ret = __os_calloc(1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) + return (ret); dbmfp->dbmp = dbmp; dbmfp->fd = -1; + dbmfp->ref = 1; if (LF_ISSET(DB_RDONLY)) F_SET(dbmfp, MP_READONLY); @@ -132,7 +134,6 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) ret = EINVAL; goto err; } - size = 0; last_pgno = 0; } else { /* Get the real name for this file and open it. */ @@ -146,21 +147,40 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) goto err; } - /* Don't permit files that aren't a multiple of the pagesize. */ - if ((ret = __db_ioinfo(rpath, + /* + * Don't permit files that aren't a multiple of the pagesize, + * and find the number of the last page in the file, all the + * time being careful not to overflow 32 bits. + * + * !!! + * We can't use off_t's here, or in any code in the mainline + * library for that matter. (We have to use them in the os + * stubs, of course, as there are system calls that take them + * as arguments.) The reason is that some customers build in + * environments where an off_t is 32-bits, but still run where + * offsets are 64-bits, and they pay us a lot of money. + */ + if ((ret = __os_ioinfo(rpath, dbmfp->fd, &mbytes, &bytes, NULL)) != 0) { __db_err(dbenv, "%s: %s", rpath, strerror(ret)); goto err; } - if (bytes % pagesize) { + + /* Page sizes have to be a power-of-two, ignore mbytes. */ + if (bytes % pagesize != 0) { __db_err(dbenv, "%s: file size not a multiple of the pagesize", rpath); ret = EINVAL; goto err; } - size = mbytes * MEGABYTE + bytes; - last_pgno = size == 0 ? 0 : (size - 1) / pagesize; + + last_pgno = mbytes * (MEGABYTE / pagesize); + last_pgno += bytes / pagesize; + + /* Correction: page numbers are zero-based, not 1-based. 
*/ + if (last_pgno != 0) + --last_pgno; /* * Get the file id if we weren't given one. Generated file id's @@ -168,7 +188,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) * other process joining the party. */ if (finfop->fileid == NULL) { - if ((ret = __db_fileid(dbenv, rpath, 0, idbuf)) != 0) + if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) goto err; finfop->fileid = idbuf; } @@ -191,7 +211,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) } if (ret == 0 && F_ISSET(dbmp, MP_LOCKHANDLE) && (ret = - __memp_ralloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0) + __memp_alloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0) LOCKINIT(dbmp, dbmfp->mutexp); if (needlock) @@ -232,13 +252,15 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) F_CLR(mfp, MP_CAN_MMAP); if (LF_ISSET(DB_NOMMAP)) F_CLR(mfp, MP_CAN_MMAP); - if (size > (dbenv == NULL || dbenv->mp_mmapsize == 0 ? - DB_MAXMMAPSIZE : dbenv->mp_mmapsize)) + maxmap = dbenv == NULL || dbenv->mp_mmapsize == 0 ? + DB_MAXMMAPSIZE : dbenv->mp_mmapsize; + if (mbytes > maxmap / MEGABYTE || + (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) F_CLR(mfp, MP_CAN_MMAP); } dbmfp->addr = NULL; if (F_ISSET(mfp, MP_CAN_MMAP)) { - dbmfp->len = size; + dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; if (__db_mapfile(rpath, dbmfp->fd, dbmfp->len, 1, &dbmfp->addr) != 0) { dbmfp->addr = NULL; @@ -246,7 +268,7 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) } } if (rpath != NULL) - FREES(rpath); + __os_freestr(rpath); LOCKHANDLE(dbmp, dbmp->mutexp); TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); @@ -260,11 +282,11 @@ err: /* * never get to here after we have successfully allocated it. 
*/ if (rpath != NULL) - FREES(rpath); + __os_freestr(rpath); if (dbmfp->fd != -1) - (void)__db_close(dbmfp->fd); + (void)__os_close(dbmfp->fd); if (dbmfp != NULL) - FREE(dbmfp, sizeof(DB_MPOOLFILE)); + __os_free(dbmfp, sizeof(DB_MPOOLFILE)); return (ret); } @@ -315,7 +337,7 @@ __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp) } /* Allocate a new MPOOLFILE. */ - if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + if ((ret = __memp_alloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0) return (ret); *retp = mfp; @@ -334,21 +356,22 @@ __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp) mfp->stat.st_pagesize = pagesize; mfp->orig_last_pgno = mfp->last_pgno = last_pgno; - F_SET(mfp, MP_CAN_MMAP); if (ISTEMPORARY) F_SET(mfp, MP_TEMP); else { /* Copy the file path into shared memory. */ - if ((ret = __memp_ralloc(dbmp, + if ((ret = __memp_alloc(dbmp, strlen(path) + 1, &mfp->path_off, &p)) != 0) goto err; memcpy(p, path, strlen(path) + 1); /* Copy the file identification string into shared memory. */ - if ((ret = __memp_ralloc(dbmp, + if ((ret = __memp_alloc(dbmp, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) goto err; memcpy(p, finfop->fileid, DB_FILE_ID_LEN); + + F_SET(mfp, MP_CAN_MMAP); } /* Copy the page cookie into shared memory. */ @@ -356,7 +379,7 @@ __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp) mfp->pgcookie_len = 0; mfp->pgcookie_off = 0; } else { - if ((ret = __memp_ralloc(dbmp, + if ((ret = __memp_alloc(dbmp, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) goto err; memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size); @@ -394,16 +417,48 @@ memp_fclose(dbmfp) dbmp = dbmfp->dbmp; ret = 0; + MP_PANIC_CHECK(dbmp); + + for (;;) { + LOCKHANDLE(dbmp, dbmp->mutexp); + + /* + * We have to reference count DB_MPOOLFILE structures as other + * threads may be using them. The problem only happens if the + * application makes a bad design choice. Here's the path: + * + * Thread A opens a database. 
+ * Thread B uses thread A's DB_MPOOLFILE to write a buffer + * in order to free up memory in the mpool cache. + * Thread A closes the database while thread B is using the + * DB_MPOOLFILE structure. + * + * By opening all databases before creating the threads, and + * closing them after the threads have exited, applications + * get better performance and avoid the problem path entirely. + * + * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer + * is a short-term lock, even in worst case, since we better be + * the only thread of control using the DB_MPOOLFILE structure + * to read pages *into* the cache. Wait until we're the only + * reference holder and remove the DB_MPOOLFILE structure from + * the list, so nobody else can even find it. + */ + if (dbmfp->ref == 1) { + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + break; + } + UNLOCKHANDLE(dbmp, dbmp->mutexp); + + (void)__os_sleep(1, 0); + } + UNLOCKHANDLE(dbmp, dbmp->mutexp); + /* Complain if pinned blocks never returned. */ if (dbmfp->pinref != 0) __db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned", __memp_fn(dbmfp), (u_long)dbmfp->pinref); - /* Remove the DB_MPOOLFILE structure from the list. */ - LOCKHANDLE(dbmp, dbmp->mutexp); - TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); - UNLOCKHANDLE(dbmp, dbmp->mutexp); - /* Close the underlying MPOOLFILE. */ (void)__memp_mf_close(dbmp, dbmfp); @@ -414,7 +469,7 @@ memp_fclose(dbmfp) "%s: %s", __memp_fn(dbmfp), strerror(ret)); /* Close the file; temporary files may not yet have been created. */ - if (dbmfp->fd != -1 && (t_ret = __db_close(dbmfp->fd)) != 0) { + if (dbmfp->fd != -1 && (t_ret = __os_close(dbmfp->fd)) != 0) { __db_err(dbmp->dbenv, "%s: %s", __memp_fn(dbmfp), strerror(t_ret)); if (ret != 0) @@ -429,7 +484,7 @@ memp_fclose(dbmfp) } /* Discard the DB_MPOOLFILE structure. 
*/ - FREE(dbmfp, sizeof(DB_MPOOLFILE)); + __os_free(dbmfp, sizeof(DB_MPOOLFILE)); return (ret); } diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c index 48fdfc3b7f..c551f97380 100644 --- a/db2/mp/mp_fput.c +++ b/db2/mp/mp_fput.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_fput.c 10.22 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)mp_fput.c 10.24 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -40,6 +40,8 @@ memp_fput(dbmfp, pgaddr, flags) dbmp = dbmfp->dbmp; mp = dbmp->mp; + MP_PANIC_CHECK(dbmp); + /* Validate arguments. */ if (flags) { if ((ret = __db_fchk(dbmp->dbenv, "memp_fput", flags, @@ -57,15 +59,15 @@ memp_fput(dbmfp, pgaddr, flags) } } + LOCKREGION(dbmp); + /* Decrement the pinned reference count. */ - LOCKHANDLE(dbmp, dbmfp->mutexp); if (dbmfp->pinref == 0) __db_err(dbmp->dbenv, "%s: put: more blocks returned than retrieved", __memp_fn(dbmfp)); else --dbmfp->pinref; - UNLOCKHANDLE(dbmp, dbmfp->mutexp); /* * If we're mapping the file, there's nothing to do. Because we can @@ -74,14 +76,14 @@ memp_fput(dbmfp, pgaddr, flags) * region. */ if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && - (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) { + UNLOCKREGION(dbmp); return (0); + } /* Convert the page address to a buffer header. */ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - LOCKREGION(dbmp); - /* Set/clear the page bits. 
*/ if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { ++mp->stat.st_page_clean; diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c index 3b352aa553..1940d3b198 100644 --- a/db2/mp/mp_fset.c +++ b/db2/mp/mp_fset.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_fset.c 10.15 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)mp_fset.c 10.16 (Sleepycat) 9/27/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -40,6 +40,8 @@ memp_fset(dbmfp, pgaddr, flags) dbmp = dbmfp->dbmp; mp = dbmp->mp; + MP_PANIC_CHECK(dbmp); + /* Validate arguments. */ if (flags == 0) return (__db_ferr(dbmp->dbenv, "memp_fset", 1)); diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c index fc985bc521..4c90fc438f 100644 --- a/db2/mp/mp_open.c +++ b/db2/mp/mp_open.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_open.c 10.23 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)mp_open.c 10.27 (Sleepycat) 10/1/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -52,8 +52,8 @@ memp_open(path, flags, mode, dbenv, retp) cachesize = dbenv == NULL ? 0 : dbenv->mp_size; /* Create and initialize the DB_MPOOL structure. */ - if ((dbmp = (DB_MPOOL *)__db_calloc(1, sizeof(DB_MPOOL))) == NULL) - return (ENOMEM); + if ((ret = __os_calloc(1, sizeof(DB_MPOOL), &dbmp)) != 0) + return (ret); LIST_INIT(&dbmp->dbregq); TAILQ_INIT(&dbmp->dbmfq); @@ -83,7 +83,7 @@ memp_open(path, flags, mode, dbenv, retp) if (LF_ISSET(DB_THREAD)) { F_SET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION); LOCKREGION(dbmp); - ret = __memp_ralloc(dbmp, + ret = __memp_alloc(dbmp, sizeof(db_mutex_t), NULL, &dbmp->mutexp); UNLOCKREGION(dbmp); if (ret != 0) { @@ -97,7 +97,7 @@ memp_open(path, flags, mode, dbenv, retp) return (0); err: if (dbmp != NULL) - FREE(dbmp, sizeof(DB_MPOOL)); + __os_free(dbmp, sizeof(DB_MPOOL)); return (ret); } @@ -115,10 +115,12 @@ memp_close(dbmp) ret = 0; + MP_PANIC_CHECK(dbmp); + /* Discard DB_MPREGs. 
*/ while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { LIST_REMOVE(mpreg, q); - FREE(mpreg, sizeof(DB_MPREG)); + __os_free(mpreg, sizeof(DB_MPREG)); } /* Discard DB_MPOOLFILEs. */ @@ -138,13 +140,27 @@ memp_close(dbmp) ret = t_ret; if (dbmp->reginfo.path != NULL) - FREES(dbmp->reginfo.path); - FREE(dbmp, sizeof(DB_MPOOL)); + __os_freestr(dbmp->reginfo.path); + __os_free(dbmp, sizeof(DB_MPOOL)); return (ret); } /* + * __memp_panic -- + * Panic a memory pool. + * + * PUBLIC: void __memp_panic __P((DB_ENV *)); + */ +void +__memp_panic(dbenv) + DB_ENV *dbenv; +{ + if (dbenv->mp_info != NULL) + dbenv->mp_info->mp->rlayout.panic = 1; +} + +/* * memp_unlink -- * Exit a memory pool. */ @@ -160,12 +176,12 @@ memp_unlink(path, force, dbenv) memset(&reginfo, 0, sizeof(reginfo)); reginfo.dbenv = dbenv; reginfo.appname = DB_APP_NONE; - if (path != NULL && (reginfo.path = __db_strdup(path)) == NULL) - return (ENOMEM); + if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0) + return (ret); reginfo.file = DB_DEFAULT_MPOOL_FILE; ret = __db_runlink(&reginfo, force); if (reginfo.path != NULL) - FREES(reginfo.path); + __os_freestr(reginfo.path); return (ret); } @@ -181,9 +197,12 @@ memp_register(dbmp, ftype, pgin, pgout) int (*pgout) __P((db_pgno_t, void *, DBT *)); { DB_MPREG *mpr; + int ret; + + MP_PANIC_CHECK(dbmp); - if ((mpr = (DB_MPREG *)__db_malloc(sizeof(DB_MPREG))) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(sizeof(DB_MPREG), NULL, &mpr)) != 0) + return (ret); mpr->ftype = ftype; mpr->pgin = pgin; diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c index e83e0f44fa..84c782e781 100644 --- a/db2/mp/mp_pr.c +++ b/db2/mp/mp_pr.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_pr.c 10.26 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)mp_pr.c 10.30 (Sleepycat) 10/1/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -44,16 +44,17 @@ memp_stat(dbmp, gspp, fspp, db_malloc) DB_MPOOL_FSTAT **tfsp; MPOOLFILE *mfp; size_t len,
nlen; + int ret; char *name; + MP_PANIC_CHECK(dbmp); + /* Allocate space for the global statistics. */ if (gspp != NULL) { *gspp = NULL; - if ((*gspp = db_malloc == NULL ? - (DB_MPOOL_STAT *)__db_malloc(sizeof(**gspp)) : - (DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(sizeof(**gspp), db_malloc, gspp)) != 0) + return (ret); LOCKREGION(dbmp); @@ -89,10 +90,8 @@ memp_stat(dbmp, gspp, fspp, db_malloc) /* Allocate space for the pointers. */ len = (len + 1) * sizeof(DB_MPOOL_FSTAT *); - if ((*fspp = db_malloc == NULL ? - (DB_MPOOL_FSTAT **)__db_malloc(len) : - (DB_MPOOL_FSTAT **)db_malloc(len)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(len, db_malloc, fspp)) != 0) + return (ret); LOCKREGION(dbmp); @@ -104,10 +103,8 @@ memp_stat(dbmp, gspp, fspp, db_malloc) name = __memp_fns(dbmp, mfp); nlen = strlen(name); len = sizeof(DB_MPOOL_FSTAT) + nlen + 1; - if ((*tfsp = db_malloc == NULL ? - (DB_MPOOL_FSTAT *)__db_malloc(len) : - (DB_MPOOL_FSTAT *)db_malloc(len)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(len, db_malloc, tfsp)) != 0) + return (ret); **tfsp = mfp->stat; (*tfsp)->file_name = (char *) (u_int8_t *)*tfsp + sizeof(DB_MPOOL_FSTAT); @@ -212,8 +209,9 @@ __memp_dump_region(dbmp, area, fp) cnt = 0; for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) { - (void)fprintf(fp, "file #%d: %s: %lu references: %s\n", + (void)fprintf(fp, "file #%d: %s: refs %lu, type %ld, %s\n", cnt + 1, __memp_fns(dbmp, mfp), (u_long)mfp->ref, + (long)mfp->ftype, F_ISSET(mfp, MP_CAN_MMAP) ? 
"mmap" : "read/write"); if (cnt < FMAP_ENTRIES) fmap[cnt] = R_OFFSET(dbmp, mfp); diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c index b8a72286cd..b9c92f2e13 100644 --- a/db2/mp/mp_region.c +++ b/db2/mp/mp_region.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_region.c 10.30 (Sleepycat) 5/31/98"; +static const char sccsid[] = "@(#)mp_region.c 10.35 (Sleepycat) 12/11/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -24,13 +24,33 @@ static const char sccsid[] = "@(#)mp_region.c 10.30 (Sleepycat) 5/31/98"; #include "common_ext.h" /* - * __memp_ralloc -- + * __memp_reg_alloc -- + * Allocate some space in the mpool region, with locking. + * + * PUBLIC: int __memp_reg_alloc __P((DB_MPOOL *, size_t, size_t *, void *)); + */ +int +__memp_reg_alloc(dbmp, len, offsetp, retp) + DB_MPOOL *dbmp; + size_t len, *offsetp; + void *retp; +{ + int ret; + + LOCKREGION(dbmp); + ret = __memp_alloc(dbmp, len, offsetp, retp); + UNLOCKREGION(dbmp); + return (ret); +} + +/* + * __memp_alloc -- * Allocate some space in the mpool region. * - * PUBLIC: int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *)); + * PUBLIC: int __memp_alloc __P((DB_MPOOL *, size_t, size_t *, void *)); */ int -__memp_ralloc(dbmp, len, offsetp, retp) +__memp_alloc(dbmp, len, offsetp, retp) DB_MPOOL *dbmp; size_t len, *offsetp; void *retp; @@ -52,7 +72,9 @@ alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) { return (0); } if (nomore) { - __db_err(dbmp->dbenv, "%s", strerror(ret)); + __db_err(dbmp->dbenv, + "Unable to allocate %lu bytes from mpool shared region: %s\n", + (u_long)len, strerror(ret)); return (ret); } @@ -91,7 +113,7 @@ alloc: if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) { } retry: /* Find a buffer we can flush; pure LRU. 
*/ - total = 0; + restart = total = 0; for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) { nbhp = SH_TAILQ_NEXT(bhp, q, __bh); @@ -222,8 +244,8 @@ __memp_ropen(dbmp, path, cachesize, mode, is_private, flags) if (path == NULL) dbmp->reginfo.path = NULL; else - if ((dbmp->reginfo.path = __db_strdup(path)) == NULL) - return (ENOMEM); + if ((ret = __os_strdup(path, &dbmp->reginfo.path)) != 0) + return (ret); dbmp->reginfo.file = DB_DEFAULT_MPOOL_FILE; dbmp->reginfo.mode = mode; dbmp->reginfo.size = rlen; @@ -244,7 +266,7 @@ __memp_ropen(dbmp, path, cachesize, mode, is_private, flags) if ((ret = __db_rattach(&dbmp->reginfo)) != 0) { if (dbmp->reginfo.path != NULL) - FREES(dbmp->reginfo.path); + __os_freestr(dbmp->reginfo.path); return (ret); } @@ -303,6 +325,6 @@ err: UNLOCKREGION(dbmp); (void)memp_unlink(path, 1, dbmp->dbenv); if (dbmp->reginfo.path != NULL) - FREES(dbmp->reginfo.path); + __os_freestr(dbmp->reginfo.path); return (ret); } diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c index 33218eef1a..535348517c 100644 --- a/db2/mp/mp_sync.c +++ b/db2/mp/mp_sync.c @@ -7,7 +7,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mp_sync.c 10.25 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)mp_sync.c 10.31 (Sleepycat) 12/11/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -39,9 +39,12 @@ memp_sync(dbmp, lsnp) DB_ENV *dbenv; MPOOL *mp; MPOOLFILE *mfp; - int ar_cnt, cnt, nalloc, next, ret, wrote; + int ar_cnt, nalloc, next, maxpin, ret, wrote; + + MP_PANIC_CHECK(dbmp); dbenv = dbmp->dbenv; + mp = dbmp->mp; if (dbenv->lg_info == NULL) { __db_err(dbenv, "memp_sync: requires logging"); @@ -49,16 +52,19 @@ memp_sync(dbmp, lsnp) } /* - * We try and write the buffers in page order so that the underlying - * filesystem doesn't have to seek and can write contiguous blocks, - * plus, we don't want to hold the region lock while we write the - * buffers. Get memory to hold the buffer pointers. 
Get a good-size - * block, too, because we realloc while holding the region lock if we - * run out. + * We try and write the buffers in page order: it should reduce seeks + * by the underlying filesystem and possibly reduce the actual number + * of writes. We don't want to hold the region lock while we write + * the buffers, so only hold it lock while we create a list. Get a + * good-size block of memory to hold buffer pointers, we don't want + * to run out. */ - if ((bharray = - (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL) - return (ENOMEM); + LOCKREGION(dbmp); + nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10; + UNLOCKREGION(dbmp); + + if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0) + return (ret); LOCKREGION(dbmp); @@ -70,7 +76,6 @@ memp_sync(dbmp, lsnp) * we've already handled or are currently handling, then we return a * result based on the count for the larger LSN. */ - mp = dbmp->mp; if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { if (mp->lsn_cnt == 0) { *lsnp = mp->lsn; @@ -114,10 +119,15 @@ memp_sync(dbmp, lsnp) * finish. Since the application may have restarted the sync, clear * any BH_WRITE flags that appear to be left over from previous calls. * + * We don't want to pin down the entire buffer cache, otherwise we'll + * starve threads needing new pages. Don't pin down more than 80% of + * the cache. + * * Keep a count of the total number of buffers we need to write in * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count. */ ar_cnt = 0; + maxpin = ((mp->stat.st_page_dirty + mp->stat.st_page_clean) * 8) / 10; for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { @@ -130,19 +140,27 @@ memp_sync(dbmp, lsnp) /* * If the buffer isn't in use, we should be able to - * write it immediately, so save a reference to it. 
+ * write it immediately, so increment the reference + * count to lock it and its contents down, and then + * save a reference to it. + * + * If we've run out space to store buffer references, + * we're screwed. We don't want to realloc the array + * while holding a region lock, so we set the flag to + * force the checkpoint to be done again, from scratch, + * later. + * + * If we've pinned down too much of the cache stop, and + * set a flag to force the checkpoint to be tried again + * later. */ if (bhp->ref == 0) { - if (ar_cnt == nalloc) { - nalloc *= 2; - if ((bharray = - (BH **)__db_realloc(bharray, - nalloc * sizeof(BH *))) == NULL) { - ret = ENOMEM; - goto err; - } + ++bhp->ref; + bharray[ar_cnt] = bhp; + if (++ar_cnt >= nalloc || ar_cnt >= maxpin) { + F_SET(mp, MP_LSN_RETRY); + break; } - bharray[ar_cnt++] = bhp; } } else if (F_ISSET(bhp, BH_WRITE)) @@ -154,10 +172,6 @@ memp_sync(dbmp, lsnp) goto done; } - /* Lock down the buffers and their contents. */ - for (cnt = 0; cnt < ar_cnt; ++cnt) - ++bharray[cnt]->ref; - UNLOCKREGION(dbmp); /* Sort the buffers we're going to write. */ @@ -205,7 +219,8 @@ memp_sync(dbmp, lsnp) goto err; } } - ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; + ret = mp->lsn_cnt != 0 || + F_ISSET(mp, MP_LSN_RETRY) ? 
DB_INCOMPLETE : 0; done: if (0) { @@ -224,7 +239,7 @@ err: /* F_CLR(bhp, BH_WRITE); } UNLOCKREGION(dbmp); - __db_free(bharray); + __os_free(bharray, nalloc * sizeof(BH *)); return (ret); } @@ -241,6 +256,8 @@ memp_fsync(dbmfp) dbmp = dbmfp->dbmp; + MP_PANIC_CHECK(dbmp); + /* * If this handle doesn't have a file descriptor that's open for * writing, or if the file is a temporary, there's no reason to @@ -300,25 +317,29 @@ __memp_fsync(dbmfp) { BH *bhp, **bharray; DB_MPOOL *dbmp; + MPOOL *mp; size_t mf_offset; - int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote; + int ar_cnt, incomplete, nalloc, next, ret, wrote; ret = 0; dbmp = dbmfp->dbmp; + mp = dbmp->mp; mf_offset = R_OFFSET(dbmp, dbmfp->mfp); /* - * We try and write the buffers in page order so that the underlying - * filesystem doesn't have to seek and can write contiguous blocks, - * plus, we don't want to hold the region lock while we write the - * buffers. Get memory to hold the buffer pointers. Get a good-size - * block, too, because we realloc while holding the region lock if we - * run out. + * We try and write the buffers in page order: it should reduce seeks + * by the underlying filesystem and possibly reduce the actual number + * of writes. We don't want to hold the region lock while we write + * the buffers, so only hold it lock while we create a list. Get a + * good-size block of memory to hold buffer pointers, we don't want + * to run out. */ - nalloc = 1024; - if ((bharray = - (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL) - return (ENOMEM); + LOCKREGION(dbmp); + nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10; + UNLOCKREGION(dbmp); + + if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0) + return (ret); LOCKREGION(dbmp); @@ -326,36 +347,37 @@ __memp_fsync(dbmfp) * Walk the LRU list of buffer headers, and get a list of buffers to * write for this MPOOLFILE. 
*/ - ar_cnt = pincnt = 0; - for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh); + ar_cnt = incomplete = 0; + for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset) continue; if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { - ++pincnt; + incomplete = 1; continue; } - if (ar_cnt == nalloc) { - nalloc *= 2; - if ((bharray = (BH **)__db_realloc(bharray, - nalloc * sizeof(BH *))) == NULL) { - ret = ENOMEM; - goto err; - } - } + ++bhp->ref; + bharray[ar_cnt] = bhp; - bharray[ar_cnt++] = bhp; + /* + * If we've run out space to store buffer references, we're + * screwed, as we don't want to realloc the array holding a + * region lock. Set the incomplete flag -- the only way we + * can get here is if the file is active in the buffer cache, + * which is the same thing as finding pinned buffers. + */ + if (++ar_cnt >= nalloc) { + incomplete = 1; + break; + } } - /* Lock down the buffers and their contents. */ - for (cnt = 0; cnt < ar_cnt; ++cnt) - ++bharray[cnt]->ref; - UNLOCKREGION(dbmp); /* Sort the buffers we're going to write. */ - qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); + if (ar_cnt != 0) + qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); LOCKREGION(dbmp); @@ -365,11 +387,10 @@ __memp_fsync(dbmfp) * It's possible for a thread to have gotten the buffer since * we listed it for writing. If the reference count is still * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. + * If it's >1, then skip the buffer. */ if (bharray[next]->ref > 1) { - ++pincnt; + incomplete = 1; --bharray[next]->ref; continue; @@ -387,13 +408,18 @@ __memp_fsync(dbmfp) --bharray[next]->ref; goto err; } + + /* + * If we didn't write the buffer for some reason, don't return + * success. 
+ */ if (!wrote) - ++pincnt; + incomplete = 1; } err: UNLOCKREGION(dbmp); - __db_free(bharray); + __os_free(bharray, nalloc * sizeof(BH *)); /* * Sync the underlying file as the last thing we do, so that the OS @@ -404,7 +430,7 @@ err: UNLOCKREGION(dbmp); * issues. */ if (ret == 0) - return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE); + return (incomplete ? DB_INCOMPLETE : __os_fsync(dbmfp->fd)); return (ret); } @@ -423,6 +449,8 @@ memp_trickle(dbmp, pct, nwrotep) u_long total; int ret, wrote; + MP_PANIC_CHECK(dbmp); + mp = dbmp->mp; if (nwrotep != NULL) *nwrotep = 0; @@ -487,7 +515,7 @@ loop: total = mp->stat.st_page_clean + mp->stat.st_page_dirty; } /* No more buffers to write. */ - return (0); + ret = 0; err: UNLOCKREGION(dbmp); return (ret); @@ -508,6 +536,14 @@ __bhcmp(p1, p2) if (bhp1->mf_offset > bhp2->mf_offset) return (1); - /* Sort by page in file. */ - return (bhp1->pgno < bhp2->pgno ? -1 : 1); + /* + * !!! + * Defend against badly written quicksort code calling the comparison + * function with two identical pointers (e.g., WATCOM C++ (Power++)). + */ + if (bhp1->pgno < bhp2->pgno) + return (-1); + if (bhp1->pgno > bhp2->pgno) + return (1); + return (0); } diff --git a/db2/mutex/alpha.dec b/db2/mutex/alpha.dec deleted file mode 100644 index 83ed371136..0000000000 --- a/db2/mutex/alpha.dec +++ /dev/null @@ -1,25 +0,0 @@ -/* - * @(#)alpha.dec 8.3 (Sleepycat Software) 1/18/97 - * - * The DEC C asm acts as a pseudo-call. The first argument is the assembly - * code, and the remaining arguments are assigned as in a procedure call, to - * r16, r17, etc. (represented in asm as %a0, %a1, and so forth). - * - * From: Dave Butenhof. - */ - -#include <c_asm.h> - -#define TSL_SET(tsl) (asm ("mb; \ - 10: ldl_l %v0,(%a0) ; \ - bne %v0,30f ; \ - or %v0,1,%r1 ; \ - stl_c %r1,(%a0) ; \ - beq %r1,20f ; \ - mb ; \ - br %r31,30f ; \ - 20: br %r31,10b ; \ - 30: ", (tsl))) - -THIS WAS NOT CONVERTED TO TAKE A POINTER AS AN ARGUMENT... 
-#define TSL_UNSET(tsl) (asm ("mb"), *(tsl) = 0) diff --git a/db2/mutex/alpha.gcc b/db2/mutex/alpha.gcc deleted file mode 100644 index 247d04cf31..0000000000 --- a/db2/mutex/alpha.gcc +++ /dev/null @@ -1,52 +0,0 @@ -/* - * @(#)alpha.gcc 10.1 (Sleepycat) 4/12/97 - * - * The code appearing below is taken from Richard L. Sites, ed. "Alpha - * Architecture Reference Manual", Digital Press, 1992, page 5-7 and 5-8. - * There are 2 modifications: - * - * 1. The jump from blbs __r1,30f to !__r1, which is dictated by the way the - * TSL_SET macro is used. The code suggested in Sites includes the main loop - * of the spin lock, whereas in this code the rest the loop is specified in C. - * The generated code might be suboptimal if the compiler generates a forward - * branch for the usual case in which the mutex is uncontested. - * - * 2. At label 20, Sites suggests including code for testing for an excessive - * number of _processor_ lock conflicts. (The seq_c instruction stores its - * first argument provided that no other processor has written to a byte range - * including its memory-location argument.) Absent such checking the code - * below could conceivably stall silently on a multiprocessor alpha, depending - * on how often processor/processor conflicts occur in a particular byte range. - * - * Note that the mb ("memory-barrier") instruction in TSL_UNSET is critical to - * correct operation in a multiprocessor alpha (as is, of course, the mb in - * the TSL_SET macro). Without the mb, changes to shared memory that occurred - * inside the critical section (before the TSL_UNSET) might reach shared memory - * _after_ the change of tsl to 0, thereby permitting another processor to see - * an inconsistent view of the data protected by the mutex. - * - * For gcc/alpha, 0 is clear, 1 is set. 
- */ -#define TSL_SET(tsl) ({ \ - register tsl_t *__l = (tsl); \ - register tsl_t __r1, __r2; \ - __asm__ volatile(" \n\ - 10: ldq_l %0,(%2) \n\ - blbs %0,30f \n\ - or %0,1,%1 \n\ - stq_c %1,(%2) \n\ - beq %1,20f \n\ - mb \n\ - br 30f \n\ - 20: br 10b \n\ - 30: " \ - : "=&r" (__r1), "=&r" (__r2) \ - : "r" (__l)); \ - !__r1; \ -}) - -#define TSL_UNSET(tsl) ({ \ - register tsl_t *__l = (tsl); \ - __asm__ volatile("mb; stq $31,(%0);" : : "r" (__l)); \ -}) -#define TSL_INIT(tsl) TSL_UNSET(tsl) diff --git a/db2/mutex/mutex.c b/db2/mutex/mutex.c index de0d0e23fe..acc6aa07c9 100644 --- a/db2/mutex/mutex.c +++ b/db2/mutex/mutex.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)mutex.c 10.48 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)mutex.c 10.52 (Sleepycat) 11/8/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -37,9 +37,12 @@ static const char sccsid[] = "@(#)mutex.c 10.48 (Sleepycat) 5/23/98"; #if defined(HAVE_FUNC_MSEM) /* - * XXX - * Should we not use MSEM_IF_NOWAIT and let the system block for us? - * I've no idea if this will block all threads in the process or not. + * !!! + * Do not remove the MSEM_IF_NOWAIT flag. The problem is that if a single + * process makes two msem_lock() calls in a row, the second one returns an + * error. We depend on the fact that we can lock against ourselves in the + * locking subsystem, where we set up a mutex so that we can block ourselves. + * Tested on OSF1 v4.0. 
*/ #define TSL_INIT(x) (msem_init(x, MSEM_UNLOCKED) == NULL) #define TSL_INIT_ERROR 1 @@ -74,6 +77,17 @@ static const char sccsid[] = "@(#)mutex.c 10.48 (Sleepycat) 5/23/98"; #define TSL_UNSET(x) _lock_clear(x) #endif +#ifdef HAVE_FUNC_VMS +#include <builtins.h> +#ifdef __ALPHA +#define TSL_SET(tsl) (!__TESTBITSSI(tsl, 0)) +#else /* __VAX */ +#define TSL_SET(tsl) (!(int)_BBSSI(0, tsl)) +#endif +#define TSL_UNSET(tsl) (*(tsl) = 0) +#define TSL_INIT(tsl) TSL_UNSET(tsl) +#endif + #ifdef HAVE_ASSEM_PARISC_GCC #include "parisc.gcc" #endif @@ -181,7 +195,7 @@ __db_mutex_lock(mp, fd) #ifdef HAVE_SPINLOCKS COMPQUIET(fd, 0); - for (usecs = MS(10);;) { + for (usecs = MS(1);;) { /* Try and acquire the uncontested resource lock for N spins. */ for (nspins = mp->spins; nspins > 0; --nspins) if (TSL_SET(&mp->tsl_resource)) { @@ -193,19 +207,17 @@ __db_mutex_lock(mp, fd) } mp->pid = getpid(); #endif - if (usecs == MS(10)) + if (usecs == MS(1)) ++mp->mutex_set_nowait; else ++mp->mutex_set_wait; return (0); } - /* Yield the processor; wait 10ms initially, up to 1 second. */ - if (__db_yield == NULL || __db_yield() != 0) { - (void)__db_sleep(0, usecs); - if ((usecs <<= 1) > SECOND) - usecs = SECOND; - } + /* Yield the processor; wait 1ms initially, up to 1 second. */ + __os_yield(usecs); + if ((usecs <<= 1) > SECOND) + usecs = SECOND; } /* NOTREACHED */ @@ -218,15 +230,14 @@ __db_mutex_lock(mp, fd) for (locked = 0, mypid = getpid();;) { /* - * Wait for the lock to become available; wait 10ms initially, + * Wait for the lock to become available; wait 1ms initially, * up to 1 second. */ - for (usecs = MS(10); mp->pid != 0;) - if (__db_yield == NULL || __db_yield() != 0) { - (void)__db_sleep(0, usecs); - if ((usecs <<= 1) > SECOND) - usecs = SECOND; - } + for (usecs = MS(1); mp->pid != 0;) { + __os_yield(usecs); + if ((usecs <<= 1) > SECOND) + usecs = SECOND; + } /* Acquire an exclusive kernel lock. 
*/ k_lock.l_type = F_WRLCK; diff --git a/db2/mutex/parisc.hp b/db2/mutex/parisc.hp deleted file mode 100644 index bd0e37fc78..0000000000 --- a/db2/mutex/parisc.hp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * @(#)parisc.hp 8.6 (Sleepycat) 6/2/98 - * - * Copyright (c) 1996-1997, The University of Utah and the Computer Systems - * Laboratory at the University of Utah (CSL). All rights reserved. - * - * Permission to use, copy, modify and distribute this software is hereby - * granted provided that (1) source code retains these copyright, permission, - * and disclaimer notices, and (2) redistributions including binaries - * reproduce the notices in supporting documentation, and (3) all advertising - * materials mentioning features or use of this software display the following - * acknowledgement: ``This product includes software developed by the Computer - * Systems Laboratory at the University of Utah.'' - * - * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * CSL requests users of this software to return to csl-dist@cs.utah.edu any - * improvements that they make and grant CSL redistribution rights. - */ - -/* - * The PA-RISC has a "load and clear" instead of a "test and set" instruction. - * The 32-bit word used by that instruction must be 16-byte aligned hence we - * allocate 16 bytes for a tsl_t and use the word that is properly aligned. 
- */ -#define TSL_SET(tsl) tsl_set(tsl) -#define TSL_UNSET(tsl) tsl_unset(tsl) diff --git a/db2/mutex/uts4.cc.s b/db2/mutex/uts4_cc.s index ee5f4143bd..ee5f4143bd 100644 --- a/db2/mutex/uts4.cc.s +++ b/db2/mutex/uts4_cc.s diff --git a/db2/os/os_abs.c b/db2/os/os_abs.c index d9f4970467..547a6804b4 100644 --- a/db2/os/os_abs.c +++ b/db2/os/os_abs.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_abs.c 10.8 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)os_abs.c 10.9 (Sleepycat) 7/21/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,13 +18,13 @@ static const char sccsid[] = "@(#)os_abs.c 10.8 (Sleepycat) 4/10/98"; #include "db_int.h" /* - * __db_abspath -- + * __os_abspath -- * Return if a path is an absolute path. * - * PUBLIC: int __db_abspath __P((const char *)); + * PUBLIC: int __os_abspath __P((const char *)); */ int -__db_abspath(path) +__os_abspath(path) const char *path; { return (path[0] == '/'); diff --git a/db2/os/os_alloc.c b/db2/os/os_alloc.c index 35784476c0..0090eb14a7 100644 --- a/db2/os/os_alloc.c +++ b/db2/os/os_alloc.c @@ -8,40 +8,22 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_alloc.c 10.6 (Sleepycat) 5/2/98"; +static const char sccsid[] = "@(#)os_alloc.c 10.10 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#include <errno.h> #include <string.h> +#include <stdlib.h> #endif #include "db_int.h" +#include "os_jump.h" /* - * __db_strdup -- - * The strdup(3) function for DB. - * - * PUBLIC: char *__db_strdup __P((const char *)); - */ -char * -__db_strdup(str) - const char *str; -{ - size_t len; - char *copy; - - len = strlen(str) + 1; - if ((copy = __db_malloc(len)) == NULL) - return (NULL); - - memcpy(copy, str, len); - return (copy); -} - -/* - * XXX + * !!! * Correct for systems that return NULL when you allocate 0 bytes of memory. 
* There are several places in DB where we allocate the number of bytes held * by the key/data item, and it can be 0. Correct here so that malloc never @@ -49,59 +31,189 @@ __db_strdup(str) * could make these calls macros on non-Alpha architectures (that's where we * saw the problem), but it's probably not worth the autoconf complexity. * + * !!! + * Correct for systems that don't set errno when malloc and friends fail. + * * Out of memory. * We wish to hold the whole sky, * But we never will. */ + +/* + * __os_strdup -- + * The strdup(3) function for DB. + * + * PUBLIC: int __os_strdup __P((const char *, void *)); + */ +int +__os_strdup(str, storep) + const char *str; + void *storep; +{ + size_t size; + int ret; + void *p; + + *(void **)storep = NULL; + + size = strlen(str) + 1; + if ((ret = __os_malloc(size, NULL, &p)) != 0) + return (ret); + + memcpy(p, str, size); + + *(void **)storep = p; + return (0); +} + /* - * __db_calloc -- + * __os_calloc -- * The calloc(3) function for DB. * - * PUBLIC: void *__db_calloc __P((size_t, size_t)); + * PUBLIC: int __os_calloc __P((size_t, size_t, void *)); */ -void * -__db_calloc(num, size) +int +__os_calloc(num, size, storep) size_t num, size; + void *storep; { void *p; + int ret; size *= num; - if ((p = __db_jump.j_malloc(size == 0 ? 1 : size)) != NULL) - memset(p, 0, size); - return (p); + if ((ret = __os_malloc(size, NULL, &p)) != 0) + return (ret); + + memset(p, 0, size); + *(void **)storep = p; + + return (0); } /* - * __db_malloc -- + * __os_malloc -- * The malloc(3) function for DB. * - * PUBLIC: void *__db_malloc __P((size_t)); + * PUBLIC: int __os_malloc __P((size_t, void *(*)(size_t), void *)); */ -void * -__db_malloc(size) +int +__os_malloc(size, db_malloc, storep) size_t size; + void *(*db_malloc) __P((size_t)), *storep; { -#ifdef DIAGNOSTIC void *p; - p = __db_jump.j_malloc(size == 0 ? 1 : size); - memset(p, 0xff, size == 0 ? 1 : size); - return (p); -#else - return (__db_jump.j_malloc(size == 0 ? 
1 : size)); + *(void **)storep = NULL; + + /* Never allocate 0 bytes -- some C libraries don't like it. */ + if (size == 0) + ++size; + + /* Some C libraries don't correctly set errno when malloc(3) fails. */ + errno = 0; + if (db_malloc != NULL) + p = db_malloc(size); + else if (__db_jump.j_malloc != NULL) + p = __db_jump.j_malloc(size); + else + p = malloc(size); + if (p == NULL) { + if (errno == 0) + errno = ENOMEM; + return (errno); + } + +#ifdef DIAGNOSTIC + memset(p, 0xdb, size); #endif + *(void **)storep = p; + + return (0); } /* - * __db_realloc -- + * __os_realloc -- * The realloc(3) function for DB. * - * PUBLIC: void *__db_realloc __P((void *, size_t)); + * PUBLIC: int __os_realloc __P((void *, size_t)); + */ +int +__os_realloc(storep, size) + void *storep; + size_t size; +{ + void *p, *ptr; + + ptr = *(void **)storep; + + /* If we haven't yet allocated anything yet, simply call malloc. */ + if (ptr == NULL) + return (__os_malloc(size, NULL, storep)); + + /* Never allocate 0 bytes -- some C libraries don't like it. */ + if (size == 0) + ++size; + + /* + * Some C libraries don't correctly set errno when realloc(3) fails. + * + * Don't overwrite the original pointer, there are places in DB we + * try to continue after realloc fails. + */ + errno = 0; + if (__db_jump.j_realloc != NULL) + p = __db_jump.j_realloc(ptr, size); + else + p = realloc(ptr, size); + if (p == NULL) { + if (errno == 0) + errno = ENOMEM; + return (errno); + } + + *(void **)storep = p; + + return (0); +} + +/* + * __os_free -- + * The free(3) function for DB. + * + * PUBLIC: void __os_free __P((void *, size_t)); */ -void * -__db_realloc(ptr, size) +void +__os_free(ptr, size) void *ptr; size_t size; { - return (__db_jump.j_realloc(ptr, size == 0 ? 1 : size)); +#ifdef DIAGNOSTIC + if (size != 0) + memset(ptr, 0xdb, size); +#endif + + if (__db_jump.j_free != NULL) + __db_jump.j_free(ptr); + else + free(ptr); +} + +/* + * __os_freestr -- + * The free(3) function for DB, freeing a string. 
+ * + * PUBLIC: void __os_freestr __P((void *)); + */ +void +__os_freestr(ptr) + void *ptr; +{ +#ifdef DIAGNOSTIC + memset(ptr, 0xdb, strlen(ptr) + 1); +#endif + + if (__db_jump.j_free != NULL) + __db_jump.j_free(ptr); + else + free(ptr); } diff --git a/db2/os/os_config.c b/db2/os/os_config.c index 4150c843e4..71d379a387 100644 --- a/db2/os/os_config.c +++ b/db2/os/os_config.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_config.c 10.26 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)os_config.c 10.30 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,72 +18,18 @@ static const char sccsid[] = "@(#)os_config.c 10.26 (Sleepycat) 5/23/98"; #endif #include "db_int.h" +#include "os_jump.h" -/* - * XXX - * We provide our own extern declarations so that we don't collide with - * systems that get them wrong, e.g., SunOS. - */ -#ifdef _WIN32 -#define fsync _commit -#define imported __declspec(dllimport) -#else -#define imported -#endif - -/* - * XXX - * HP/UX MPE doesn't have fsync, but you can build one using FCONTROL. - */ -#ifdef __hp3000s900 -#define fsync __mpe_fsync -#endif - -imported extern int close __P((int)); -imported extern void free __P((void *)); -imported extern int fsync __P((int)); -imported extern void *malloc __P((size_t)); -imported extern int open __P((const char *, int, ...)); -imported extern ssize_t read __P((int, void *, size_t)); -imported extern void *realloc __P((void *, size_t)); -imported extern int unlink __P((const char *)); -imported extern ssize_t write __P((int, const void *, size_t)); - -/* - * __db_jump -- - * This list of interfaces that applications can replace. In some - * cases, the user is permitted to replace the standard ANSI C or - * POSIX 1003.1 call, e.g., malloc or read. In others, we provide - * a local interface to the functionality, e.g., __os_ioinfo. 
- */ -struct __db_jumptab __db_jump = { - close, /* DB_FUNC_CLOSE */ - __os_dirfree, /* DB_FUNC_DIRFREE */ - __os_dirlist, /* DB_FUNC_DIRLIST */ - __os_exists, /* DB_FUNC_EXISTS */ - free, /* DB_FUNC_FREE */ - fsync, /* DB_FUNC_FSYNC */ - __os_ioinfo, /* DB_FUNC_IOINFO */ - malloc, /* DB_FUNC_MALLOC */ - NULL, /* DB_FUNC_MAP */ - open, /* DB_FUNC_OPEN */ - read, /* DB_FUNC_READ */ - realloc, /* DB_FUNC_REALLOC */ - NULL, /* DB_FUNC_RUNLINK */ - __os_seek, /* DB_FUNC_SEEK */ - __os_sleep, /* DB_FUNC_SLEEP */ - unlink, /* DB_FUNC_UNLINK */ - NULL, /* DB_FUNC_UNMAP */ - write, /* DB_FUNC_WRITE */ - NULL /* DB_FUNC_YIELD */ -}; +struct __db_jumptab __db_jump; DB_GLOBALS __db_global_values = { 1, /* DB_MUTEXLOCKS */ + 0, /* DB_PAGEYIELD */ 0, /* DB_REGION_ANON, DB_REGION_NAME */ 0, /* DB_REGION_INIT */ 0, /* DB_TSL_SPINS */ - 0 /* DB_PAGEYIELD */ + {NULL, &__db_global_values.db_envq.tqh_first}, /* Environemnt queue */ + {NULL, &__db_global_values.db_nameq.tqh_first} /* Name queue */ }; /* diff --git a/db2/os/os_dir.c b/db2/os/os_dir.c index 14a10ad23f..f2ee128c1e 100644 --- a/db2/os/os_dir.c +++ b/db2/os/os_dir.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_dir.c 10.15 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)os_dir.c 10.19 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -35,6 +35,7 @@ static const char sccsid[] = "@(#)os_dir.c 10.15 (Sleepycat) 4/26/98"; #endif #include "db_int.h" +#include "os_jump.h" /* * __os_dirlist -- @@ -50,22 +51,23 @@ __os_dirlist(dir, namesp, cntp) { struct dirent *dp; DIR *dirp; - int arraysz, cnt; + int arraysz, cnt, ret; char **names; + if (__db_jump.j_dirlist != NULL) + return (__db_jump.j_dirlist(dir, namesp, cntp)); + if ((dirp = opendir(dir)) == NULL) return (errno); names = NULL; for (arraysz = cnt = 0; (dp = readdir(dirp)) != NULL; ++cnt) { if (cnt >= arraysz) { arraysz += 100; - names = (char **)(names == NULL ? 
- __db_malloc(arraysz * sizeof(names[0])) : - __db_realloc(names, arraysz * sizeof(names[0]))); - if (names == NULL) + if ((ret = __os_realloc(&names, + arraysz * sizeof(names[0]))) != 0) goto nomem; } - if ((names[cnt] = (char *)__db_strdup(dp->d_name)) == NULL) + if ((ret = __os_strdup(dp->d_name, &names[cnt])) != 0) goto nomem; } (void)closedir(dirp); @@ -76,7 +78,7 @@ __os_dirlist(dir, namesp, cntp) nomem: if (names != NULL) __os_dirfree(names, cnt); - return (ENOMEM); + return (ret); } /* @@ -90,7 +92,10 @@ __os_dirfree(names, cnt) char **names; int cnt; { + if (__db_jump.j_dirfree != NULL) + __db_jump.j_dirfree(names, cnt); + while (cnt > 0) - __db_free(names[--cnt]); - __db_free(names); + __os_free(names[--cnt], 0); + __os_free(names, 0); } diff --git a/db2/os/os_fid.c b/db2/os/os_fid.c index cf48c01bd8..62da590611 100644 --- a/db2/os/os_fid.c +++ b/db2/os/os_fid.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_fid.c 10.11 (Sleepycat) 4/26/98"; +static const char sccsid[] = "@(#)os_fid.c 10.12 (Sleepycat) 7/21/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -24,13 +24,13 @@ static const char sccsid[] = "@(#)os_fid.c 10.11 (Sleepycat) 4/26/98"; #include "common_ext.h" /* - * __db_fileid -- + * __os_fileid -- * Return a unique identifier for a file. 
* - * PUBLIC: int __db_fileid __P((DB_ENV *, const char *, int, u_int8_t *)); + * PUBLIC: int __os_fileid __P((DB_ENV *, const char *, int, u_int8_t *)); */ int -__db_fileid(dbenv, fname, timestamp, fidp) +__os_fileid(dbenv, fname, timestamp, fidp) DB_ENV *dbenv; const char *fname; int timestamp; diff --git a/db2/os/os_fsync.c b/db2/os/os_fsync.c index e1f271a75c..61a504f84d 100644 --- a/db2/os/os_fsync.c +++ b/db2/os/os_fsync.c @@ -8,34 +8,21 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_fsync.c 10.5 (Sleepycat) 4/19/98"; +static const char sccsid[] = "@(#)os_fsync.c 10.7 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <fcntl.h> /* XXX: Required by __hp3000s900 */ #include <unistd.h> #endif #include "db_int.h" - -/* - * __db_fsync -- - * Flush a file descriptor. - * - * PUBLIC: int __db_fsync __P((int)); - */ -int -__db_fsync(fd) - int fd; -{ - return (__os_fsync(fd) ? errno : 0); -} +#include "os_jump.h" #ifdef __hp3000s900 -#include <fcntl.h> - int __mpe_fsync(fd) int fd; @@ -47,3 +34,26 @@ __mpe_fsync(fd) return (0); } #endif + +#ifdef __hp3000s900 +#define fsync(fd) __mpe_fsync(fd); +#endif +#ifdef _WIN32 +#define fsync(fd) _commit(fd); +#endif + +/* + * __os_fsync -- + * Flush a file descriptor. + * + * PUBLIC: int __os_fsync __P((int)); + */ +int +__os_fsync(fd) + int fd; +{ + int ret; + + ret = __db_jump.j_fsync != NULL ? __db_jump.j_fsync(fd) : fsync(fd); + return (ret == 0 ? 
0 : errno); +} diff --git a/db2/os/os_map.c b/db2/os/os_map.c index 5f0fd790e6..5664a2edec 100644 --- a/db2/os/os_map.c +++ b/db2/os/os_map.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_map.c 10.19 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)os_map.c 10.24 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -27,13 +27,14 @@ static const char sccsid[] = "@(#)os_map.c 10.19 (Sleepycat) 5/3/98"; #endif #include "db_int.h" +#include "os_jump.h" #include "common_ext.h" #ifdef HAVE_MMAP static int __os_map __P((char *, int, size_t, int, int, int, void **)); #endif #ifdef HAVE_SHMGET -static int __os_shmget __P((char *, REGINFO *)); +static int __os_shmget __P((REGINFO *)); #endif /* @@ -165,7 +166,7 @@ __db_mapregion(path, infop) #ifdef HAVE_SHMGET if (!called) { called = 1; - ret = __os_shmget(path, infop); + ret = __os_shmget(infop); } #endif #ifdef HAVE_MMAP @@ -207,7 +208,7 @@ __db_mapregion(path, infop) #ifdef HAVE_SHMGET if (!called) { called = 1; - ret = __os_shmget(path, infop); + ret = __os_shmget(infop); } #endif } @@ -271,10 +272,9 @@ __db_unlinkregion(name, infop) called = 1; ret = shmctl(infop->segid, IPC_RMID, NULL) ? errno : 0; } -#else - COMPQUIET(infop, NULL); #endif #ifdef HAVE_MMAP + COMPQUIET(infop, NULL); if (!called) { called = 1; ret = 0; @@ -388,6 +388,23 @@ __os_map(path, fd, len, is_region, is_anonymous, is_rdonly, addr) prot = PROT_READ | (is_rdonly ? 0 : PROT_WRITE); +/* + * XXX + * Work around a bug in the VMS V7.1 mmap() implementation. To map a file + * into memory on VMS it needs to be opened in a certain way, originally. + * To get the file opened in that certain way, the VMS mmap() closes the + * file and re-opens it. When it does this, it doesn't flush any caches + * out to disk before closing. The problem this causes us is that when the + * memory cache doesn't get written out, the file isn't big enough to match + * the memory chunk and the mmap() call fails. 
This call to fsync() fixes + * the problem. DEC thinks this isn't a bug because of language in XPG5 + * discussing user responsibility for on-disk and in-memory synchronization. + */ +#ifdef VMS + if (__os_fsync(fd) == -1) + return(errno); +#endif + /* MAP_FAILED was not defined in early mmap implementations. */ #ifndef MAP_FAILED #define MAP_FAILED -1 @@ -407,47 +424,12 @@ __os_map(path, fd, len, is_region, is_anonymous, is_rdonly, addr) * Call the shmget(2) family of functions. */ static int -__os_shmget(path, infop) +__os_shmget(infop) REGINFO *infop; - char *path; { - key_t key; - int shmflg; - - if (F_ISSET(infop, REGION_CREATED)) { - /* - * The return key from ftok(3) is not guaranteed to be unique. - * The nice thing about the shmget(2) interface is that it - * allows you to name anonymous pieces of memory. The evil - * thing about it is that the name space is separate from the - * filesystem. - */ -#ifdef __hp3000s900 - {char mpe_path[MAXPATHLEN]; - /* - * MPE ftok() is broken as of 5.5pp4. If the file path does - * not start with '/' or '.', then ftok() tries to interpret - * the file path in MPE syntax instead of POSIX HFS syntax. - * The workaround is to prepend "./" to these paths. See HP - * SR 5003416081 for details. 
- */ - if (*path != '/' && *path != '.') { - if (strlen(path) + strlen("./") + 1 > sizeof(mpe_path)) - return (ENAMETOOLONG); - mpe_path[0] = '.'; - mpe_path[1] = '/'; - (void)strcpy(mpe_path + 2, path); - path = mpe_path; - } - } -#endif - if ((key = ftok(path, 1)) == (key_t)-1) - return (errno); - - shmflg = IPC_CREAT | 0600; - if ((infop->segid = shmget(key, infop->size, shmflg)) == -1) - return (errno); - } + if (F_ISSET(infop, REGION_CREATED) && + (infop->segid = shmget(0, infop->size, IPC_PRIVATE | 0600)) == -1) + return (errno); if ((infop->addr = shmat(infop->segid, NULL, 0)) == (void *)-1) { /* diff --git a/db2/os/os_oflags.c b/db2/os/os_oflags.c index 976b84d709..a4003dd5f0 100644 --- a/db2/os/os_oflags.c +++ b/db2/os/os_oflags.c @@ -44,7 +44,7 @@ __db_oflags(oflags) case O_RDWR: break; default: /* Bogus flags value from user. */ - /* XXX no way to return error from here */ + /* XXX no way to return error from here */ } if (oflags & O_CREAT) dbflags |= DB_CREATE; diff --git a/db2/os/os_open.c b/db2/os/os_open.c index e960377ebb..c54fd7365d 100644 --- a/db2/os/os_open.c +++ b/db2/os/os_open.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_open.c 10.26 (Sleepycat) 5/4/98"; +static const char sccsid[] = "@(#)os_open.c 10.33 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,10 +16,12 @@ static const char sccsid[] = "@(#)os_open.c 10.26 (Sleepycat) 5/4/98"; #include <errno.h> #include <fcntl.h> +#include <signal.h> #include <unistd.h> #endif #include "db_int.h" +#include "os_jump.h" /* * __db_open -- @@ -33,7 +35,10 @@ __db_open(name, arg_flags, ok_flags, mode, fdp) u_int32_t arg_flags, ok_flags; int mode, *fdp; { - int fd, flags; +#if !defined(_WIN32) && defined(HAVE_SIGFILLSET) + sigset_t set, oset; +#endif + int flags, ret; if (arg_flags & ~ok_flags) return (EINVAL); @@ -71,41 +76,77 @@ __db_open(name, arg_flags, ok_flags, mode, fdp) if (arg_flags & DB_TRUNCATE) flags |= O_TRUNC; +#if 
!defined(_WIN32) && defined(HAVE_SIGFILLSET) + /* + * We block every signal we can get our hands on so that the temporary + * file isn't left around if we're interrupted at the wrong time. Of + * course, if we drop core in-between the calls we'll hang forever, but + * that's probably okay. ;-) + */ + if (arg_flags & DB_TEMPORARY) { + (void)sigfillset(&set); + (void)sigprocmask(SIG_BLOCK, &set, &oset); + } +#endif + /* Open the file. */ - if ((fd = __os_open(name, flags, mode)) == -1) - return (errno); + if ((ret = __os_open(name, flags, mode, fdp)) != 0) + return (ret); -#ifndef _WIN32 +#if !defined(_WIN32) /* Delete any temporary file; done for Win32 by _O_TEMPORARY. */ - if (arg_flags & DB_TEMPORARY) + if (arg_flags & DB_TEMPORARY) { (void)__os_unlink(name); +#if defined(HAVE_SIGFILLSET) + (void)sigprocmask(SIG_SETMASK, &oset, NULL); +#endif + } #endif -#if !defined(_WIN32) && !defined(WIN16) +#if !defined(_WIN32) && !defined(WIN16) && !defined(VMS) /* - * Deny access to any child process; done for Win32 by O_NOINHERIT, - * MacOS has neither child processes nor fd inheritance. + * Deny access to any child process. + * VMS: does not have fd inheritance. + * Win32: done by O_NOINHERIT. */ - if (fcntl(fd, F_SETFD, 1) == -1) { - int ret = errno; + if (fcntl(*fdp, F_SETFD, 1) == -1) { + ret = errno; - (void)__os_close(fd); + (void)__os_close(*fdp); return (ret); } #endif - *fdp = fd; return (0); } /* - * __db_close -- + * __os_open -- + * Open a file. + * + * PUBLIC: int __os_open __P((const char *, int, int, int *)); + */ +int +__os_open(name, flags, mode, fdp) + const char *name; + int flags, mode, *fdp; +{ + *fdp = __db_jump.j_open != NULL ? + __db_jump.j_open(name, flags, mode) : open(name, flags, mode); + return (*fdp == -1 ? errno : 0); +} + +/* + * __os_close -- * Close a file descriptor. * - * PUBLIC: int __db_close __P((int)); + * PUBLIC: int __os_close __P((int)); */ int -__db_close(fd) +__os_close(fd) int fd; { - return (__os_close(fd) ? 
errno : 0); + int ret; + + ret = __db_jump.j_close != NULL ? __db_jump.j_close(fd) : close(fd); + return (ret == 0 ? 0 : errno); } diff --git a/db2/os/os_rw.c b/db2/os/os_rw.c index 7591041981..38f5b9473a 100644 --- a/db2/os/os_rw.c +++ b/db2/os/os_rw.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_rw.c 10.7 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)os_rw.c 10.11 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,15 +19,73 @@ static const char sccsid[] = "@(#)os_rw.c 10.7 (Sleepycat) 4/10/98"; #endif #include "db_int.h" +#include "os_jump.h" /* - * __db_read -- + * __os_io -- + * Do an I/O. + * + * PUBLIC: int __os_io __P((DB_IO *, int, ssize_t *)); + */ +int +__os_io(db_iop, op, niop) + DB_IO *db_iop; + int op; + ssize_t *niop; +{ + int ret; + +#ifdef HAVE_PREAD + switch (op) { + case DB_IO_READ: + if (__db_jump.j_read != NULL) + goto slow; + *niop = pread(db_iop->fd_io, db_iop->buf, + db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize); + break; + case DB_IO_WRITE: + if (__db_jump.j_write != NULL) + goto slow; + *niop = pwrite(db_iop->fd_io, db_iop->buf, + db_iop->bytes, (off_t)db_iop->pgno * db_iop->pagesize); + break; + } + if (*niop == db_iop->bytes) + return (0); +slow: +#endif + if (db_iop->mutexp != NULL) + (void)__db_mutex_lock(db_iop->mutexp, db_iop->fd_lock); + + if ((ret = __os_seek(db_iop->fd_io, + db_iop->pagesize, db_iop->pgno, 0, 0, SEEK_SET)) != 0) + goto err; + switch (op) { + case DB_IO_READ: + ret = + __os_read(db_iop->fd_io, db_iop->buf, db_iop->bytes, niop); + break; + case DB_IO_WRITE: + ret = + __os_write(db_iop->fd_io, db_iop->buf, db_iop->bytes, niop); + break; + } + +err: if (db_iop->mutexp != NULL) + (void)__db_mutex_unlock(db_iop->mutexp, db_iop->fd_lock); + + return (ret); + +} + +/* + * __os_read -- * Read from a file handle. 
* - * PUBLIC: int __db_read __P((int, void *, size_t, ssize_t *)); + * PUBLIC: int __os_read __P((int, void *, size_t, ssize_t *)); */ int -__db_read(fd, addr, len, nrp) +__os_read(fd, addr, len, nrp) int fd; void *addr; size_t len; @@ -39,7 +97,9 @@ __db_read(fd, addr, len, nrp) for (taddr = addr, offset = 0; offset < len; taddr += nr, offset += nr) { - if ((nr = __os_read(fd, taddr, len - offset)) < 0) + if ((nr = __db_jump.j_read != NULL ? + __db_jump.j_read(fd, taddr, len - offset) : + read(fd, taddr, len - offset)) < 0) return (errno); if (nr == 0) break; @@ -49,15 +109,15 @@ __db_read(fd, addr, len, nrp) } /* - * __db_write -- + * __os_write -- * Write to a file handle. * - * PUBLIC: int __db_write __P((int, void *, size_t, ssize_t *)); + * PUBLIC: int __os_write __P((int, void *, size_t, ssize_t *)); */ int -__db_write(fd, addr, len, nwp) +__os_write(fd, addr, len, nwp) int fd; - void *addr; + const void *addr; size_t len; ssize_t *nwp; { @@ -67,7 +127,9 @@ __db_write(fd, addr, len, nwp) for (taddr = addr, offset = 0; offset < len; taddr += nw, offset += nw) - if ((nw = __os_write(fd, taddr, len - offset)) < 0) + if ((nw = __db_jump.j_write != NULL ? 
+ __db_jump.j_write(fd, taddr, len - offset) : + write(fd, taddr, len - offset)) < 0) return (errno); *nwp = len; return (0); diff --git a/db2/os/os_seek.c b/db2/os/os_seek.c index 159425cc27..ae5272bd1c 100644 --- a/db2/os/os_seek.c +++ b/db2/os/os_seek.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_seek.c 10.9 (Sleepycat) 4/19/98"; +static const char sccsid[] = "@(#)os_seek.c 10.11 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)os_seek.c 10.9 (Sleepycat) 4/19/98"; #endif #include "db_int.h" +#include "os_jump.h" /* * __os_seek -- @@ -35,10 +36,17 @@ __os_seek(fd, pgsize, pageno, relative, isrewind, whence) int isrewind, whence; { off_t offset; - - offset = (off_t)pgsize * pageno + relative; - if (isrewind) - offset = -offset; - - return (lseek(fd, offset, whence) == -1 ? errno : 0); + int ret; + + if (__db_jump.j_seek != NULL) + ret = __db_jump.j_seek(fd, + pgsize, pageno, relative, isrewind, whence); + else { + offset = (off_t)pgsize * pageno + relative; + if (isrewind) + offset = -offset; + + ret = lseek(fd, offset, whence); + } + return (ret == -1 ? 
errno : 0); } diff --git a/db2/os/os_sleep.c b/db2/os/os_sleep.c index 6a5b91f5c4..5aa476352e 100644 --- a/db2/os/os_sleep.c +++ b/db2/os/os_sleep.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_sleep.c 10.10 (Sleepycat) 4/27/98"; +static const char sccsid[] = "@(#)os_sleep.c 10.12 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -28,6 +28,7 @@ static const char sccsid[] = "@(#)os_sleep.c 10.10 (Sleepycat) 4/27/98"; #endif #include "db_int.h" +#include "os_jump.h" /* * __os_sleep -- @@ -45,6 +46,9 @@ __os_sleep(secs, usecs) for (; usecs >= 1000000; ++secs, usecs -= 1000000) ; + if (__db_jump.j_sleep != NULL) + return (__db_jump.j_sleep(secs, usecs)); + /* * It's important that we yield the processor here so that other * processes or threads are permitted to run. diff --git a/db2/os/os_spin.c b/db2/os/os_spin.c index 2fd21d018b..cbde58894a 100644 --- a/db2/os/os_spin.c +++ b/db2/os/os_spin.c @@ -8,17 +8,50 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_spin.c 10.7 (Sleepycat) 5/20/98"; +static const char sccsid[] = "@(#)os_spin.c 10.10 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#if defined(HAVE_PSTAT_GETDYNAMIC) +#include <sys/pstat.h> +#endif #include <limits.h> #include <unistd.h> #endif #include "db_int.h" +#include "os_jump.h" + +#if defined(HAVE_PSTAT_GETDYNAMIC) +/* + * __os_pstat_getdynamic -- + * HP/UX. + */ +static int +__os_pstat_getdynamic() +{ + struct pst_dynamic psd; + + return (pstat_getdynamic(&psd, + sizeof(psd), (size_t)1, 0) == -1 ? 1 : psd.psd_proc_cnt); +} +#endif + +#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN) +/* + * __os_sysconf -- + * Solaris, Linux. + */ +static int +__os_sysconf(void) +{ + int nproc; + + return ((nproc = sysconf(_SC_NPROCESSORS_ONLN)) > 1 ? 
nproc : 1); +} +#endif /* * __os_spin -- @@ -29,33 +62,46 @@ static const char sccsid[] = "@(#)os_spin.c 10.7 (Sleepycat) 5/20/98"; int __os_spin() { - static long sys_val; - - /* If the application specified the spins, use its value. */ + /* + * If the application specified a value or we've already figured it + * out, return it. + * + * XXX + * We don't want to repeatedly call the underlying function because + * it can be expensive (e.g., requiring multiple filesystem accesses + * under Debian Linux). + */ if (DB_GLOBAL(db_tsl_spins) != 0) return (DB_GLOBAL(db_tsl_spins)); - /* If we've already figured this out, return the value. */ - if (sys_val != 0) - return (sys_val); + DB_GLOBAL(db_tsl_spins) = 1; +#if defined(HAVE_PSTAT_GETDYNAMIC) + DB_GLOBAL(db_tsl_spins) = __os_pstat_getdynamic(); +#endif +#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN) + DB_GLOBAL(db_tsl_spins) = __os_sysconf(); +#endif /* - * XXX - * Solaris and Linux use _SC_NPROCESSORS_ONLN to return the number of - * online processors. We don't want to repeatedly call sysconf because - * it's quite expensive (requiring multiple filesystem accesses) under - * Debian Linux. - * - * Spin 50 times per processor -- we have anecdotal evidence that this + * Spin 50 times per processor, we have anecdotal evidence that this * is a reasonable value. */ -#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_ONLN) - if ((sys_val = sysconf(_SC_NPROCESSORS_ONLN)) > 1) - sys_val *= 50; - else - sys_val = 1; -#else - sys_val = 1; -#endif - return (sys_val); + DB_GLOBAL(db_tsl_spins) *= 50; + + return (DB_GLOBAL(db_tsl_spins)); +} + +/* + * __os_yield -- + * Yield the processor. 
+ * + * PUBLIC: void __os_yield __P((u_long)); + */ +void +__os_yield(usecs) + u_long usecs; +{ + if (__db_jump.j_yield != NULL && __db_jump.j_yield() == 0) + return; + __os_sleep(0, usecs); } diff --git a/db2/os/os_stat.c b/db2/os/os_stat.c index e7d3f24174..65cba82efa 100644 --- a/db2/os/os_stat.c +++ b/db2/os/os_stat.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_stat.c 10.15 (Sleepycat) 4/27/98"; +static const char sccsid[] = "@(#)os_stat.c 10.18 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)os_stat.c 10.15 (Sleepycat) 4/27/98"; #endif #include "db_int.h" +#include "os_jump.h" /* * __os_exists -- @@ -33,6 +34,9 @@ __os_exists(path, isdirp) { struct stat sb; + if (__db_jump.j_exists != NULL) + return (__db_jump.j_exists(path, isdirp)); + if (stat(path, &sb) != 0) return (errno); @@ -65,7 +69,8 @@ __os_ioinfo(path, fd, mbytesp, bytesp, iosizep) { struct stat sb; - COMPQUIET(path, NULL); + if (__db_jump.j_ioinfo != NULL) + return (__db_jump.j_ioinfo(path, fd, mbytesp, bytesp, iosizep)); if (fstat(fd, &sb) == -1) return (errno); @@ -80,7 +85,7 @@ __os_ioinfo(path, fd, mbytesp, bytesp, iosizep) * Return the underlying filesystem blocksize, if available. * * XXX - * Check for a 0 size -- HP's MPE architecture has st_blksize, + * Check for a 0 size -- the HP MPE/iX architecture has st_blksize, * but it's always 0. */ #ifdef HAVE_ST_BLKSIZE diff --git a/db2/os/os_tmpdir.c b/db2/os/os_tmpdir.c new file mode 100644 index 0000000000..0b0bbc7c61 --- /dev/null +++ b/db2/os/os_tmpdir.c @@ -0,0 +1,113 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)os_tmpdir.c 10.3 (Sleepycat) 10/13/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdlib.h> +#endif + +#include "db_int.h" +#include "common_ext.h" + +#ifdef macintosh +#include <TFileSpec.h> +#endif + +/* + * __os_tmpdir -- + * Set the temporary directory path. + * + * The order of items in the list structure and the order of checks in + * the environment are documented. + * + * PUBLIC: int __os_tmpdir __P((DB_ENV *, u_int32_t)); + */ +int +__os_tmpdir(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + /* + * !!! + * Don't change this to: + * + * static const char * const list[] + * + * because it creates a text relocation in position independent code. + */ + static const char * list[] = { + "/var/tmp", + "/usr/tmp", + "/temp", /* Windows. */ + "/tmp", + "C:/temp", /* Windows. */ + "C:/tmp", /* Windows. */ + NULL + }; + const char * const *lp, *p; + + /* Use the environment if it's permitted and initialized. */ + p = NULL; +#ifdef HAVE_GETEUID + if (LF_ISSET(DB_USE_ENVIRON) || + (LF_ISSET(DB_USE_ENVIRON_ROOT) && getuid() == 0)) +#else + if (LF_ISSET(DB_USE_ENVIRON)) +#endif + { + if ((p = getenv("TMPDIR")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal TMPDIR environment variable"); + return (EINVAL); + } + /* Windows */ + if (p == NULL && (p = getenv("TEMP")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal TEMP environment variable"); + return (EINVAL); + } + /* Windows */ + if (p == NULL && (p = getenv("TMP")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal TMP environment variable"); + return (EINVAL); + } + /* Macintosh */ + if (p == NULL && + (p = getenv("TempFolder")) != NULL && p[0] == '\0') { + __db_err(dbenv, + "illegal TempFolder environment variable"); + return (EINVAL); + } + } + +#ifdef macintosh + /* Get the path to the temporary folder. 
*/ + if (p == NULL) { + FSSpec spec; + + if (!Special2FSSpec(kTemporaryFolderType, + kOnSystemDisk, 0, &spec)) + (void)__os_strdup(FSp2FullPath(&spec), &p); + } +#endif + + /* Step through the list looking for a possibility. */ + if (p == NULL) + for (lp = list; *lp != NULL; ++lp) + if (__os_exists(p = *lp, NULL) == 0) + break; + if (p == NULL) + return (0); + + return (__os_strdup(p, &dbenv->db_tmp_dir)); +} diff --git a/db2/os/os_unlink.c b/db2/os/os_unlink.c index 3a1fa3ff99..aa484de843 100644 --- a/db2/os/os_unlink.c +++ b/db2/os/os_unlink.c @@ -8,7 +8,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)os_unlink.c 10.5 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)os_unlink.c 10.7 (Sleepycat) 10/12/98"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,16 +19,21 @@ static const char sccsid[] = "@(#)os_unlink.c 10.5 (Sleepycat) 4/10/98"; #endif #include "db_int.h" +#include "os_jump.h" /* - * __db_unlink -- + * __os_unlink -- * Remove a file. * - * PUBLIC: int __db_unlink __P((const char *)); + * PUBLIC: int __os_unlink __P((const char *)); */ int -__db_unlink(path) +__os_unlink(path) const char *path; { - return (__os_unlink(path) == -1 ? errno : 0); + int ret; + + ret = __db_jump.j_unlink != NULL ? + __db_jump.j_unlink(path) : unlink(path); + return (ret == -1 ? errno : 0); } diff --git a/db2/progs/db_archive/db_archive.c b/db2/progs/db_archive/db_archive.c index 691824c2ab..ca489954f6 100644 --- a/db2/progs/db_archive/db_archive.c +++ b/db2/progs/db_archive/db_archive.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. 
All rights reserved.\n"; -static const char sccsid[] = "@(#)db_archive.c 10.17 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)db_archive.c 10.20 (Sleepycat) 10/3/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -33,12 +33,10 @@ static const char sccsid[] = "@(#)db_archive.c 10.17 (Sleepycat) 4/10/98"; #include "common_ext.h" DB_ENV *db_init __P((char *, int)); -void onint __P((int)); int main __P((int, char *[])); -void siginit __P((void)); +void nosig __P((void)); void usage __P((void)); -int interrupted; const char *progname = "db_archive"; /* Program name. */ @@ -83,13 +81,18 @@ main(argc, argv) if (argc != 0) usage(); - /* Initialize the environment. */ + /* + * Ignore signals -- we don't want to be interrupted because we're + * spending all of our time in the DB library. + */ + nosig(); dbenv = db_init(home, verbose); /* Get the list of names. */ if ((errno = log_archive(dbenv->lg_info, &list, flags, NULL)) != 0) { + warn(NULL); (void)db_appexit(dbenv); - err(1, "log_archive"); + return (1); } /* Print the names. */ @@ -97,7 +100,12 @@ main(argc, argv) for (; *list != NULL; ++list) printf("%s\n", *list); - return (db_appexit(dbenv) ? 1 : 0); + if ((errno = db_appexit(dbenv)) != 0) { + warn(NULL); + return (1); + } + + return (0); } /* @@ -123,40 +131,21 @@ db_init(home, verbose) DB_CREATE | DB_INIT_LOG | DB_INIT_TXN | DB_USE_ENVIRON)) != 0) err(1, "db_appinit"); - siginit(); - return (dbenv); } /* - * siginit -- - * Initialize the set of signals for which we want to clean up. - * Generally, we try not to leave the shared regions locked if - * we can. + * nosig -- + * We don't want to be interrupted. */ void -siginit() +nosig() { #ifdef SIGHUP - (void)signal(SIGHUP, onint); + (void)signal(SIGHUP, SIG_IGN); #endif - (void)signal(SIGINT, onint); -#ifdef SIGKILL - (void)signal(SIGKILL, onint); -#endif - (void)signal(SIGTERM, onint); -} - -/* - * oninit -- - * Interrupt signal handler. 
- */ -void -onint(signo) - int signo; -{ - if ((interrupted = signo) == 0) - interrupted = SIGINT; + (void)signal(SIGINT, SIG_IGN); + (void)signal(SIGTERM, SIG_IGN); } void diff --git a/db2/progs/db_checkpoint/db_checkpoint.c b/db2/progs/db_checkpoint/db_checkpoint.c index 74f95ccce2..f0fe48ab2e 100644 --- a/db2/progs/db_checkpoint/db_checkpoint.c +++ b/db2/progs/db_checkpoint/db_checkpoint.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. All rights reserved.\n"; -static const char sccsid[] = "@(#)db_checkpoint.c 10.17 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)db_checkpoint.c 10.21 (Sleepycat) 10/4/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -59,7 +59,7 @@ main(argc, argv) time_t now; long argval; u_int32_t kbytes, minutes, seconds; - int ch, eval, once, verbose; + int ch, once, ret, verbose; char *home, *logfile; /* @@ -70,7 +70,7 @@ main(argc, argv) #define MAX_UINT32_T 2147483647 kbytes = minutes = 0; - once = verbose = 0; + once = ret = verbose = 0; home = logfile = NULL; while ((ch = getopt(argc, argv, "1h:k:L:p:v")) != EOF) switch (ch) { @@ -110,6 +110,7 @@ main(argc, argv) } /* Initialize the environment. */ + siginit(); dbenv = db_init(home); if (logfile != NULL && logpid(logfile, 1)) { @@ -122,37 +123,40 @@ main(argc, argv) * to wake up when a checkpoint is necessary. If we have a "kbytes" * field set, then we'll check every 30 seconds. */ - eval = 0; seconds = kbytes != 0 ? 
30 : minutes * 60; while (!interrupted) { if (verbose) { (void)time(&now); - printf("checkpoint: %s", ctime(&now)); + warnx("checkpoint: %s", ctime(&now)); } - errno = txn_checkpoint(dbenv->tx_info, kbytes, minutes); + errno = txn_checkpoint(dbenv->tx_info, kbytes, minutes); while (errno == DB_INCOMPLETE) { if (verbose) - __db_err(dbenv, - "checkpoint did not finish, retrying"); - (void)__db_sleep(2, 0); + warnx("checkpoint did not finish, retrying\n"); + (void)sleep(2); errno = txn_checkpoint(dbenv->tx_info, 0, 0); } if (errno != 0) { - eval = 1; - __db_err(dbenv, "checkpoint: %s", strerror(errno)); + ret = 1; + warn(NULL); break; } if (once) break; - (void)__db_sleep(seconds, 0); + (void)sleep(seconds); } if (logfile != NULL && logpid(logfile, 0)) - eval = 1; + ret = 1; + + if ((errno = db_appexit(dbenv)) != 0) { + ret = 1; + warn(NULL); + } if (interrupted) { (void)signal(interrupted, SIG_DFL); @@ -160,7 +164,7 @@ main(argc, argv) /* NOTREACHED */ } - return (db_appexit(dbenv) || eval ? 1 : 0); + return (ret); } /* @@ -193,8 +197,6 @@ db_init(home) "db_appinit: failed to register access method functions"); } - siginit(); - return (dbenv); } @@ -237,14 +239,11 @@ siginit() (void)signal(SIGHUP, onint); #endif (void)signal(SIGINT, onint); -#ifdef SIGKILL - (void)signal(SIGKILL, onint); -#endif (void)signal(SIGTERM, onint); } /* - * oninit -- + * onint -- * Interrupt signal handler. */ void diff --git a/db2/progs/db_deadlock/db_deadlock.c b/db2/progs/db_deadlock/db_deadlock.c index 49a52416dd..bc5039e95f 100644 --- a/db2/progs/db_deadlock/db_deadlock.c +++ b/db2/progs/db_deadlock/db_deadlock.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. 
All rights reserved.\n"; -static const char sccsid[] = "@(#)db_deadlock.c 10.19 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)db_deadlock.c 10.23 (Sleepycat) 10/4/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -55,14 +55,14 @@ main(argc, argv) time_t now; long usecs; u_int32_t flags; - int ch, verbose; + int ch, ret, verbose; char *home, *logfile; atype = DB_LOCK_DEFAULT; home = logfile = NULL; usecs = 0; flags = 0; - verbose = 0; + ret = verbose = 0; while ((ch = getopt(argc, argv, "a:h:L:t:vw")) != EOF) switch (ch) { case 'a': @@ -119,6 +119,7 @@ main(argc, argv) usecs = 100000; /* Initialize the deadlock detector by opening the lock manager. */ + siginit(); dbenv = db_init(home, verbose); if (logfile != NULL && logpid(logfile, 1)) { @@ -129,18 +130,26 @@ main(argc, argv) while (!interrupted) { if (dbenv->db_verbose != 0) { time(&now); - __db_err(dbenv, "Running at %.24s", ctime(&now)); + warnx("Running at %.24s", ctime(&now)); } - if ((errno = lock_detect(dbenv->lk_info, flags, atype)) != 0) + if ((errno = lock_detect(dbenv->lk_info, flags, atype)) != 0) { + ret = 1; + warnx(NULL); break; + } /* Make a pass every "usecs" usecs. */ - (void)__db_sleep(0, usecs); + (void)usleep(usecs); } - if (logfile != NULL) - (void)logpid(logfile, 0); + if (logfile != NULL && logpid(logfile, 0)) + ret = 1; + + if ((errno = db_appexit(dbenv)) != 0) { + ret = 1; + warn(NULL); + } if (interrupted) { (void)signal(interrupted, SIG_DFL); @@ -148,7 +157,7 @@ main(argc, argv) /* NOTREACHED */ } - return (db_appexit(dbenv)); + return (ret); } DB_ENV * @@ -170,8 +179,6 @@ db_init(home, verbose) NULL, dbenv, DB_INIT_LOCK | DB_USE_ENVIRON)) != 0) err(1, "db_appinit"); - siginit(); - return (dbenv); } @@ -214,14 +221,11 @@ siginit() (void)signal(SIGHUP, onint); #endif (void)signal(SIGINT, onint); -#ifdef SIGKILL - (void)signal(SIGKILL, onint); -#endif (void)signal(SIGTERM, onint); } /* - * oninit -- + * onint -- * Interrupt signal handler. 
*/ void diff --git a/db2/progs/db_dump/db_dump.c b/db2/progs/db_dump/db_dump.c index f532bc2779..0f34ddc789 100644 --- a/db2/progs/db_dump/db_dump.c +++ b/db2/progs/db_dump/db_dump.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. All rights reserved.\n"; -static const char sccsid[] = "@(#)db_dump.c 10.19 (Sleepycat) 5/23/98"; +static const char sccsid[] = "@(#)db_dump.c 10.24 (Sleepycat) 11/22/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -25,14 +25,14 @@ static const char sccsid[] = "@(#)db_dump.c 10.19 (Sleepycat) 5/23/98"; #include <unistd.h> #endif +#undef stat + #include "db_int.h" #include "db_page.h" #include "btree.h" #include "hash.h" #include "clib_ext.h" -#undef stat - void configure __P((char *)); DB_ENV *db_init __P((char *)); int main __P((int, char *[])); @@ -58,7 +58,7 @@ main(argc, argv) home = NULL; checkprint = dflag = 0; - while ((ch = getopt(argc, argv, "df:h:p")) != EOF) + while ((ch = getopt(argc, argv, "df:h:Np")) != EOF) switch (ch) { case 'd': dflag = 1; @@ -70,6 +70,9 @@ main(argc, argv) case 'h': home = optarg; break; + case 'N': + (void)db_value_set(0, DB_MUTEXLOCKS); + break; case 'p': checkprint = 1; break; @@ -83,16 +86,11 @@ main(argc, argv) if (argc != 1) usage(); - if (dflag) { - if (home != NULL) - errx(1, - "the -d and -h options may not both be specified"); - if (checkprint) - errx(1, - "the -d and -p options may not both be specified"); - } + if (dflag && checkprint) + errx(1, "the -d and -p options may not both be specified"); + /* Initialize the environment. */ - dbenv = dflag ? NULL : db_init(home); + dbenv = db_init(home); /* Open the DB file. */ if ((errno = @@ -108,7 +106,7 @@ main(argc, argv) } /* Get a cursor and step through the database. 
*/ - if ((errno = dbp->cursor(dbp, NULL, &dbcp)) != 0) { + if ((errno = dbp->cursor(dbp, NULL, &dbcp, 0)) != 0) { (void)dbp->close(dbp, 0); err(1, "cursor"); } @@ -145,16 +143,35 @@ db_init(home) { DB_ENV *dbenv; - if ((dbenv = (DB_ENV *)calloc(sizeof(DB_ENV), 1)) == NULL) { + if ((dbenv = (DB_ENV *)calloc(1, sizeof(DB_ENV))) == NULL) { errno = ENOMEM; err(1, NULL); } + + /* + * Try and use the shared mpool region so that we get pages that + * haven't been flushed to disk (mostly useful for debugging). + * If that fails, try again, without the DB_INIT_MPOOL flag. + * + * If it works, set the error output options so that future errors + * are correctly reported. + */ + if ((errno = db_appinit(home, + NULL, dbenv, DB_USE_ENVIRON | DB_INIT_MPOOL)) == 0) { + dbenv->db_errfile = stderr; + dbenv->db_errpfx = progname; + return (dbenv); + } + + /* Set the error output options -- this time we want a message. */ + memset(dbenv, 0, sizeof(*dbenv)); dbenv->db_errfile = stderr; dbenv->db_errpfx = progname; - if ((errno = - db_appinit(home, NULL, dbenv, DB_CREATE | DB_USE_ENVIRON)) != 0) + /* Try again, and it's fatal if we fail. */ + if ((errno = db_appinit(home, NULL, dbenv, DB_USE_ENVIRON)) != 0) err(1, "db_appinit"); + return (dbenv); } @@ -167,10 +184,10 @@ pheader(dbp, pflag) DB *dbp; int pflag; { + DBC *dbc; DB_BTREE_STAT *btsp; - HTAB *hashp; - HASHHDR *hdr; - db_pgno_t pgno; + HASH_CURSOR *hcp; + int ret; printf("format=%s\n", pflag ? 
"print" : "bytevalue"); switch (dbp->type) { @@ -187,18 +204,25 @@ pheader(dbp, pflag) break; case DB_HASH: printf("type=hash\n"); - hashp = dbp->internal; - pgno = PGNO_METADATA; - if (memp_fget(dbp->mpf, &pgno, 0, &hdr) == 0) { - if (hdr->ffactor != 0) - printf("h_ffactor=%lu\n", (u_long)hdr->ffactor); - if (hdr->nelem != 0) - printf("h_nelem=%lu\n", (u_long)hdr->nelem); - (void)memp_fput(dbp->mpf, hdr, 0); + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + break; + hcp = (HASH_CURSOR *)dbc->internal; + GET_META(dbp, hcp, ret); + if (ret == 0) { + if (hcp->hdr->ffactor != 0) + printf("h_ffactor=%lu\n", + (u_long)hcp->hdr->ffactor); + if (hcp->hdr->nelem != 0) + printf("h_nelem=%lu\n", + (u_long)hcp->hdr->nelem); + RELEASE_META(dbp, hcp); } + (void)dbc->c_close(dbc); break; case DB_RECNO: printf("type=recno\n"); + if ((errno = dbp->stat(dbp, &btsp, NULL, 0)) != 0) + err(1, "dbp->stat"); if (F_ISSET(dbp, DB_RE_RENUMBER)) printf("renumber=1\n"); if (F_ISSET(dbp, DB_RE_FIXEDLEN)) @@ -231,6 +255,6 @@ void usage() { (void)fprintf(stderr, - "usage: db_dump [-dp] [-f file] [-h home] db_file\n"); + "usage: db_dump [-dNp] [-f file] [-h home] db_file\n"); exit(1); } diff --git a/db2/progs/db_load/db_load.c b/db2/progs/db_load/db_load.c index 84cfb36775..ca30cef342 100644 --- a/db2/progs/db_load/db_load.c +++ b/db2/progs/db_load/db_load.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. 
All rights reserved.\n"; -static const char sccsid[] = "@(#)db_load.c 10.20 (Sleepycat) 6/2/98"; +static const char sccsid[] = "@(#)db_load.c 10.23 (Sleepycat) 10/4/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)db_load.c 10.20 (Sleepycat) 6/2/98"; #include <errno.h> #include <limits.h> +#include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -37,9 +38,12 @@ int dbt_rdump __P((DBT *)); int dbt_rprint __P((DBT *)); int digitize __P((int)); int main __P((int, char *[])); +void onint __P((int)); void rheader __P((DBTYPE *, int *, DB_INFO *)); +void siginit __P((void)); void usage __P((void)); +int interrupted; const char *progname = "db_load"; /* Program name. */ @@ -57,16 +61,17 @@ main(argc, argv) DB_INFO dbinfo; db_recno_t recno; u_int32_t db_nooverwrite; - int ch, checkprint, existed, no_header; + int ch, checkprint, existed, no_header, ret; char **clist, **clp, *home; /* Allocate enough room for configuration arguments. */ if ((clp = clist = (char **)calloc(argc + 1, sizeof(char *))) == NULL) err(1, NULL); + dbp = NULL; home = NULL; db_nooverwrite = 0; - existed = checkprint = no_header = 0; + checkprint = existed = no_header = ret = 0; argtype = dbtype = DB_UNKNOWN; while ((ch = getopt(argc, argv, "c:f:h:nTt:")) != EOF) switch (ch) { @@ -111,9 +116,6 @@ main(argc, argv) if (argc != 1) usage(); - /* Initialize the environment if the user specified one. */ - dbenv = home == NULL ? NULL : db_init(home); - /* * Read the header. If there isn't any header, we're expecting flat * text, set the checkprint flag appropriately. @@ -128,21 +130,17 @@ main(argc, argv) if ((dbtype == DB_RECNO && argtype != DB_RECNO) || (argtype == DB_RECNO && dbtype != DB_RECNO)) errx(1, - "databases of type recno may not be converted"); + "databases of type recno may not be converted"); dbtype = argtype; } } + if (dbtype == DB_UNKNOWN) errx(1, "no database type specified"); /* Apply command-line configuration changes. 
*/ configure(&dbinfo, clist); - /* Open the DB file. */ - if ((errno = db_open(argv[0], dbtype, DB_CREATE, - __db_omode("rwrwrw"), dbenv, &dbinfo, &dbp)) != 0) - err(1, "%s", argv[0]); - /* Initialize the key/data pair. */ memset(&key, 0, sizeof(DBT)); if (dbtype == DB_RECNO) { @@ -159,9 +157,20 @@ main(argc, argv) err(1, NULL); } + /* Initialize the environment if the user specified one. */ + siginit(); + dbenv = home == NULL ? NULL : db_init(home); + + /* Open the DB file. */ + if ((errno = db_open(argv[0], dbtype, DB_CREATE, + __db_omode("rwrwrw"), dbenv, &dbinfo, &dbp)) != 0) { + warn("%s", argv[0]); + goto err; + } + /* Get each key/data pair and add them to the database. */ - for (recno = 1;; ++recno) { - if (dbtype == DB_RECNO) { + for (recno = 1; !interrupted; ++recno) { + if (dbtype == DB_RECNO) if (checkprint) { if (dbt_rprint(&data)) break; @@ -169,7 +178,7 @@ main(argc, argv) if (dbt_rdump(&data)) break; } - } else + else if (checkprint) { if (dbt_rprint(&key)) break; @@ -178,8 +187,10 @@ main(argc, argv) } else { if (dbt_rdump(&key)) break; - if (dbt_rdump(&data)) -fmt: err(1, "odd number of key/data pairs"); + if (dbt_rdump(&data)) { +fmt: warnx("odd number of key/data pairs"); + goto err; + } } switch (errno = dbp->put(dbp, NULL, &key, &data, db_nooverwrite)) { @@ -190,17 +201,36 @@ fmt: err(1, "odd number of key/data pairs"); warnx("%s: line %d: key already exists, not loaded:", argv[0], dbtype == DB_RECNO ? recno : recno * 2 - 1); + (void)__db_prdbt(&key, checkprint, stderr); break; default: - err(1, "%s", argv[0]); - /* NOTREACHED */ + warn(NULL); + goto err; } } - if ((errno = dbp->close(dbp, 0)) != 0) - err(1, "%s", argv[0]); - return (existed ? 
1 : 0); + if (0) { +err: ret = 1; + } + if (dbp != NULL && (errno = dbp->close(dbp, 0)) != 0) { + ret = 1; + warn(NULL); + } + + if (dbenv != NULL && (errno = db_appexit(dbenv)) != 0) { + ret = 1; + warn(NULL); + } + + if (interrupted) { + (void)signal(interrupted, SIG_DFL); + (void)raise(interrupted); + /* NOTREACHED */ + } + + /* Return 0 on success, 1 if keys existed already, and 2 on failure. */ + return (ret == 0 ? (existed == 0 ? 0 : 1) : 2); } /* @@ -499,6 +529,34 @@ badnum() } /* + * siginit -- + * Initialize the set of signals for which we want to clean up. + * Generally, we try not to leave the shared regions locked if + * we can. + */ +void +siginit() +{ +#ifdef SIGHUP + (void)signal(SIGHUP, onint); +#endif + (void)signal(SIGINT, onint); + (void)signal(SIGTERM, onint); +} + +/* + * onint -- + * Interrupt signal handler. + */ +void +onint(signo) + int signo; +{ + if ((interrupted = signo) == 0) + interrupted = SIGINT; +} + +/* * usage -- * Display the usage message. */ diff --git a/db2/progs/db_printlog/README b/db2/progs/db_printlog/README new file mode 100644 index 0000000000..05051f33cd --- /dev/null +++ b/db2/progs/db_printlog/README @@ -0,0 +1,22 @@ +# @(#)README 10.3 (Sleepycat) 11/1/98 + +Berkeley DB log dump utility. This utility dumps out a DB log in human +readable form, a record at a time, to assist in recovery and transaction +abort debugging. + +=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +commit.awk Output transaction ID of committed transactions. + +count.awk Print out the number of log records for transactions + that we encountered. + +pgno.awk Take a comma-separated list of page numbers and spit + out all the log records that affect those page numbers. + +range.awk Print out a range of the log. + +status.awk Read through db_printlog output and list the transactions + encountered, and whether they committed or aborted. + +txn.awk Print out all the records for a comma-separated list of + transaction IDs. 
diff --git a/db2/progs/db_printlog/commit.awk b/db2/progs/db_printlog/commit.awk new file mode 100644 index 0000000000..711064bb00 --- /dev/null +++ b/db2/progs/db_printlog/commit.awk @@ -0,0 +1,7 @@ +# @(#)commit.awk 10.1 (Sleepycat) 11/1/98 +# +# Output tid of committed transactions. + +/txn_regop/ { + print $5 +} diff --git a/db2/progs/db_printlog/count.awk b/db2/progs/db_printlog/count.awk new file mode 100644 index 0000000000..a0b214a6ff --- /dev/null +++ b/db2/progs/db_printlog/count.awk @@ -0,0 +1,9 @@ +# @(#)count.awk 10.1 (Sleepycat) 11/1/98 +# +# Print out the number of log records for transactions that we +# encountered. + +/^\[/{ + if ($5 != 0) + print $5 +} diff --git a/db2/progs/db_printlog/db_printlog.c b/db2/progs/db_printlog/db_printlog.c index 3b48ad9643..5a0c2ebd9f 100644 --- a/db2/progs/db_printlog/db_printlog.c +++ b/db2/progs/db_printlog/db_printlog.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. All rights reserved.\n"; -static const char sccsid[] = "@(#)db_printlog.c 10.12 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)db_printlog.c 10.17 (Sleepycat) 11/1/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -19,6 +19,7 @@ static const char sccsid[] = "@(#)db_printlog.c 10.12 (Sleepycat) 4/10/98"; #include <errno.h> #include <signal.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -37,6 +38,7 @@ static const char sccsid[] = "@(#)db_printlog.c 10.12 (Sleepycat) 4/10/98"; DB_ENV *db_init __P((char *)); int main __P((int, char *[])); void onint __P((int)); +void siginit __P((void)); void usage __P((void)); int interrupted; @@ -53,15 +55,19 @@ main(argc, argv) DB_ENV *dbenv; DBT data; DB_LSN key; - int ch, eval; + int ch, ret; char *home; + ret = 0; home = NULL; - while ((ch = getopt(argc, argv, "h:")) != EOF) + while ((ch = getopt(argc, argv, "h:N")) != EOF) switch (ch) { case 'h': home = optarg; break; + case 'N': + (void)db_value_set(0, 
DB_MUTEXLOCKS); + break; case '?': default: usage(); @@ -69,54 +75,62 @@ main(argc, argv) argc -= optind; argv += optind; - if ((home != NULL && argc > 0) || argc > 1) + if (argc > 0) usage(); - /* XXX: backward compatibility, first argument is home. */ - if (argc == 1) - home = argv[0]; - + /* Initialize the environment. */ + siginit(); dbenv = db_init(home); - eval = 0; if ((errno = __bam_init_print(dbenv)) != 0 || (errno = __db_init_print(dbenv)) != 0 || (errno = __ham_init_print(dbenv)) != 0 || (errno = __log_init_print(dbenv)) != 0 || (errno = __txn_init_print(dbenv)) != 0) { warn("initialization"); - eval = 1; (void)db_appexit(dbenv); + return (1); } - (void)signal(SIGINT, onint); - memset(&data, 0, sizeof(data)); while (!interrupted) { if ((errno = log_get(dbenv->lg_info, &key, &data, DB_NEXT)) != 0) { if (errno == DB_NOTFOUND) break; - eval = 1; warn("log_get"); - break; + goto err; } - if ((errno = - __db_dispatch(dbenv->lg_info, &data, &key, 0, NULL)) != 0) { - eval = 1; + if (dbenv->tx_recover != NULL) + errno = dbenv->tx_recover(dbenv->lg_info, + &data, &key, 0, NULL); + else + errno = __db_dispatch(dbenv->lg_info, + &data, &key, 0, NULL); + + fflush(stdout); + if (errno != 0) { warn("dispatch"); - break; + goto err; } } - (void)db_appexit(dbenv); + if (0) { +err: ret = 1; + } + + if (dbenv != NULL && (errno = db_appexit(dbenv)) != 0) { + ret = 1; + warn(NULL); + } if (interrupted) { - (void)signal(SIGINT, SIG_DFL); - (void)raise(SIGINT); + (void)signal(interrupted, SIG_DFL); + (void)raise(interrupted); /* NOTREACHED */ } - return (eval); + + return (ret); } /* @@ -143,21 +157,36 @@ db_init(home) } /* - * oninit -- + * siginit -- + * Initialize the set of signals for which we want to clean up. + * Generally, we try not to leave the shared regions locked if + * we can. 
+ */ +void +siginit() +{ +#ifdef SIGHUP + (void)signal(SIGHUP, onint); +#endif + (void)signal(SIGINT, onint); + (void)signal(SIGTERM, onint); +} + +/* + * onint -- * Interrupt signal handler. */ void onint(signo) int signo; { - COMPQUIET(signo, 0); - - interrupted = 1; + if ((interrupted = signo) == 0) + interrupted = SIGINT; } void usage() { - fprintf(stderr, "usage: db_printlog [-h home]\n"); + fprintf(stderr, "usage: db_printlog [-N] [-h home]\n"); exit (1); } diff --git a/db2/progs/db_printlog/pgno.awk b/db2/progs/db_printlog/pgno.awk new file mode 100644 index 0000000000..99aa38f2b9 --- /dev/null +++ b/db2/progs/db_printlog/pgno.awk @@ -0,0 +1,43 @@ +# @(#)pgno.awk 10.1 (Sleepycat) 11/1/98 +# +# Take a comma-separated list of page numbers and spit out all the +# log records that affect those page numbers. + +{ + if (NR == 1) { + npages = 0 + while ((ndx = index(PGNO, ",")) != 0) { + pgno[npages] = substr(PGNO, 1, ndx - 1); + PGNO = substr(PGNO, ndx + 1, length(PGNO) - ndx); + npages++ + } + pgno[npages] = PGNO; + } +} +/^\[/{ + if (printme == 1) { + printf("%s\n", rec); + printme = 0 + } + rec = ""; + + rec = $0 +} +/^ /{ + rec = sprintf("%s\n%s", rec, $0); +} +/pgno/{ + for (i = 0; i <= npages; i++) + if ($2 == pgno[i]) + printme = 1 +} +/right/{ + for (i = 0; i <= npages; i++) + if ($2 == pgno[i]) + printme = 1 +} +/left/{ + for (i = 0; i <= npages; i++) + if ($2 == pgno[i]) + printme = 1 +} diff --git a/db2/progs/db_printlog/range.awk b/db2/progs/db_printlog/range.awk new file mode 100644 index 0000000000..89c56eae52 --- /dev/null +++ b/db2/progs/db_printlog/range.awk @@ -0,0 +1,27 @@ +# @(#)range.awk 10.1 (Sleepycat) 11/1/98 +# +# Print out a range of the log + +/^\[/{ + l = length($1) - 1; + i = index($1, "]"); + file = substr($1, 2, i - 2); + file += 0; + start = i + 2; + offset = substr($1, start, l - start + 1); + i = index(offset, "]"); + offset = substr($1, start, i - 1); + offset += 0; + + if ((file == START_FILE && offset >= START_OFFSET || file > 
START_FILE)\ + && (file < END_FILE || (file == END_FILE && offset < END_OFFSET))) + printme = 1 + else if (file == END_FILE && offset > END_OFFSET || file > END_FILE) + exit + else + printme = 0 +} +{ + if (printme == 1) + print $0 +} diff --git a/db2/progs/db_printlog/status.awk b/db2/progs/db_printlog/status.awk new file mode 100644 index 0000000000..d97e9357b7 --- /dev/null +++ b/db2/progs/db_printlog/status.awk @@ -0,0 +1,26 @@ +# @(#)status.awk 10.1 (Sleepycat) 11/1/98 +# +# Read through db_printlog output and list all the transactions encountered +# and whether they committed or aborted. +# +# 1 = started +# 2 = committed +BEGIN { + cur_txn = 0 +} +/^\[/{ + if (status[$5] == 0) { + status[$5] = 1; + txns[cur_txn] = $5; + cur_txn++; + } +} +/txn_regop/ { + status[$5] = 2 +} +END { + for (i = 0; i < cur_txn; i++) { + printf("%s\t%s\n", + txns[i], status[txns[i]] == 1 ? "ABORT" : "COMMIT"); + } +} diff --git a/db2/progs/db_printlog/txn.awk b/db2/progs/db_printlog/txn.awk new file mode 100644 index 0000000000..c8d3bd36c8 --- /dev/null +++ b/db2/progs/db_printlog/txn.awk @@ -0,0 +1,30 @@ +# @(#)txn.awk 10.1 (Sleepycat) 11/1/98 +# +# Print out all the records for a comma-separated list of transaction ids. +{ + if (NR == 1) { + ntxns = 0 + while ((ndx = index(TXN, ",")) != 0) { + txn[ntxns] = substr(TXN, 1, ndx - 1); + TXN = substr(TXN, ndx + 1, length(TXN) - ndx); + ntxns++ + } + txn[ntxns] = TXN; + } +} +/^\[/{ + if (printme == 1) { + printf("%s\n", rec); + printme = 0 + } + rec = ""; + + for (i = 0; i <= ntxns; i++) + if (txn[i] == $5) { + rec = $0 + printme = 1 + } +} +/^ /{ + rec = sprintf("%s\n%s", rec, $0); +} diff --git a/db2/progs/db_recover/db_recover.c b/db2/progs/db_recover/db_recover.c index a2845725b8..d946ca15ee 100644 --- a/db2/progs/db_recover/db_recover.c +++ b/db2/progs/db_recover/db_recover.c @@ -11,13 +11,14 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. 
All rights reserved.\n"; -static const char sccsid[] = "@(#)db_recover.c 10.19 (Sleepycat) 4/10/98"; +static const char sccsid[] = "@(#)db_recover.c 10.23 (Sleepycat) 10/5/98"; #endif #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #include <errno.h> +#include <signal.h> #include <stdlib.h> #include <time.h> #include <unistd.h> @@ -31,6 +32,7 @@ static const char sccsid[] = "@(#)db_recover.c 10.19 (Sleepycat) 4/10/98"; DB_ENV *db_init __P((char *, u_int32_t, int)); int main __P((int, char *[])); +void nosig __P((void)); void usage __P((void)); const char @@ -72,10 +74,15 @@ main(argc, argv) if (argc != 0) usage(); + /* + * Ignore signals -- we don't want to be interrupted because we're + * spending all of our time in the DB library. + */ + nosig(); dbenv = db_init(home, flags, verbose); if (verbose) { __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); - __db_err(dbenv, "%s %lu %s [%lu][%lu]", + __db_err(dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction id", (u_long)dbenv->tx_info->region->last_txnid, "Recovery checkpoint", @@ -118,6 +125,20 @@ db_init(home, flags, verbose) return (dbenv); } +/* + * nosig -- + * We don't want to be interrupted. + */ +void +nosig() +{ +#ifdef SIGHUP + (void)signal(SIGHUP, SIG_IGN); +#endif + (void)signal(SIGINT, SIG_IGN); + (void)signal(SIGTERM, SIG_IGN); +} + void usage() { diff --git a/db2/progs/db_stat/db_stat.c b/db2/progs/db_stat/db_stat.c index f2551805b0..cef645da00 100644 --- a/db2/progs/db_stat/db_stat.c +++ b/db2/progs/db_stat/db_stat.c @@ -11,7 +11,7 @@ static const char copyright[] = "@(#) Copyright (c) 1996, 1997, 1998\n\ Sleepycat Software Inc. 
All rights reserved.\n"; -static const char sccsid[] = "@(#)db_stat.c 8.38 (Sleepycat) 5/30/98"; +static const char sccsid[] = "@(#)db_stat.c 8.41 (Sleepycat) 10/3/98"; #endif #ifndef NO_SYSTEM_INCLUDES @@ -26,6 +26,8 @@ static const char sccsid[] = "@(#)db_stat.c 8.38 (Sleepycat) 5/30/98"; #include <unistd.h> #endif +#undef stat + #include "db_int.h" #include "shqueue.h" #include "db_shash.h" @@ -33,8 +35,6 @@ static const char sccsid[] = "@(#)db_stat.c 8.38 (Sleepycat) 5/30/98"; #include "mp.h" #include "clib_ext.h" -#undef stat - typedef enum { T_NOTSET, T_DB, T_LOCK, T_LOG, T_MPOOL, T_TXN } test_t; int argcheck __P((char *, const char *)); @@ -48,13 +48,12 @@ void log_stats __P((DB_ENV *)); int main __P((int, char *[])); int mpool_ok __P((char *)); void mpool_stats __P((DB_ENV *)); -void onint __P((int)); +void nosig __P((void)); void prflags __P((u_int32_t, const FN *)); int txn_compare __P((const void *, const void *)); void txn_stats __P((DB_ENV *)); void usage __P((void)); -int interrupted; char *internal; const char *progname = "db_stat"; /* Program name. */ @@ -118,15 +117,20 @@ main(argc, argv) if (argc != 0 || ttype == T_NOTSET) usage(); + /* + * Ignore signals -- we don't want to be interrupted because we're + * spending all of our time in the DB library. 
+ */ + nosig(); dbenv = db_init(home, ttype); - (void)signal(SIGINT, onint); - switch (ttype) { case T_DB: if ((errno = db_open(db, DB_UNKNOWN, - DB_RDONLY, 0, dbenv, NULL, &dbp)) != 0) + DB_RDONLY, 0, dbenv, NULL, &dbp)) != 0) { + warn("%s", db); return (1); + } switch (dbp->type) { case DB_BTREE: case DB_RECNO: @@ -158,12 +162,9 @@ main(argc, argv) /* NOTREACHED */ } - (void)db_appexit(dbenv); - - if (interrupted) { - (void)signal(SIGINT, SIG_DFL); - (void)raise(SIGINT); - /* NOTREACHED */ + if ((errno = db_appexit(dbenv)) != 0) { + warn(NULL); + return (1); } return (0); } @@ -218,7 +219,6 @@ btree_stats(dbp) dl("Number of tree duplicate pages.\n", (u_long)sp->bt_dup_pg); dl("Number of tree overflow pages.\n", (u_long)sp->bt_over_pg); dl("Number of pages on the free list.\n", (u_long)sp->bt_free); - dl("Number of pages freed for reuse.\n", (u_long)sp->bt_freed); dl("Number of bytes free in tree internal pages", (u_long)sp->bt_int_pgfree); printf(" (%.0f%% ff).\n", PCT(sp->bt_int_pgfree, sp->bt_int_pg)); @@ -231,17 +231,6 @@ btree_stats(dbp) dl("Number of bytes free in tree overflow pages", (u_long)sp->bt_over_pgfree); printf(" (%.0f%% ff).\n", PCT(sp->bt_over_pgfree, sp->bt_over_pg)); - dl("Number of bytes saved by prefix compression.\n", - (u_long)sp->bt_pfxsaved); - dl("Total number of tree page splits.\n", (u_long)sp->bt_split); - dl("Number of root page splits.\n", (u_long)sp->bt_rootsplit); - dl("Number of fast splits.\n", (u_long)sp->bt_fastsplit); - dl("Number of hits in tree fast-insert code.\n", - (u_long)sp->bt_cache_hit); - dl("Number of misses in tree fast-insert code.\n", - (u_long)sp->bt_cache_miss); - dl("Number of keys added.\n", (u_long)sp->bt_added); - dl("Number of keys deleted.\n", (u_long)sp->bt_deleted); } /* @@ -610,16 +599,17 @@ argcheck(arg, ok_args) } /* - * oninit -- - * Interrupt signal handler. + * nosig -- + * We don't want to be interrupted. 
*/ void -onint(signo) - int signo; +nosig() { - COMPQUIET(signo, 0); - - interrupted = 1; +#ifdef SIGHUP + (void)signal(SIGHUP, SIG_IGN); +#endif + (void)signal(SIGINT, SIG_IGN); + (void)signal(SIGTERM, SIG_IGN); } void diff --git a/db2/txn/txn.c b/db2/txn/txn.c index 4f3ffd8ed2..aa0b3652ce 100644 --- a/db2/txn/txn.c +++ b/db2/txn/txn.c @@ -43,7 +43,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)txn.c 10.58 (Sleepycat) 5/31/98"; +static const char sccsid[] = "@(#)txn.c 10.66 (Sleepycat) 1/3/99"; #endif /* not lint */ @@ -66,12 +66,14 @@ static const char sccsid[] = "@(#)txn.c 10.58 (Sleepycat) 5/31/98"; #include "db_am.h" #include "common_ext.h" -static int __txn_check_running __P((const DB_TXN *)); -static int __txn_end __P((DB_TXN *, int)); -static int __txn_grow_region __P((DB_TXNMGR *)); -static int __txn_init __P((DB_TXNREGION *)); -static int __txn_undo __P((DB_TXN *)); -static int __txn_validate_region __P((DB_TXNMGR *)); +static int __txn_begin __P((DB_TXN *)); +static int __txn_check_running __P((const DB_TXN *, TXN_DETAIL **)); +static int __txn_end __P((DB_TXN *, int)); +static void __txn_freekids __P((DB_TXN *)); +static int __txn_grow_region __P((DB_TXNMGR *)); +static int __txn_init __P((DB_TXNREGION *)); +static int __txn_undo __P((DB_TXN *)); +static int __txn_validate_region __P((DB_TXNMGR *)); /* * This file contains the top level routines of the transaction library. @@ -93,7 +95,10 @@ __txn_init(txn_region) txn_region->magic = DB_TXNMAGIC; txn_region->version = DB_TXNVERSION; txn_region->last_txnid = TXN_MINIMUM; - /* XXX If we ever do more types of locking and logging, this changes. */ + /* + * XXX + * If we ever do more types of locking and logging, this changes. + */ txn_region->logtype = 0; txn_region->locktype = 0; txn_region->time_ckp = now; @@ -132,10 +137,8 @@ txn_open(path, flags, mode, dbenv, mgrpp) maxtxns = dbenv->tx_max != 0 ? 
dbenv->tx_max : 20; /* Now, create the transaction manager structure and set its fields. */ - if ((tmgrp = (DB_TXNMGR *)__db_calloc(1, sizeof(DB_TXNMGR))) == NULL) { - __db_err(dbenv, "txn_open: %s", strerror(ENOMEM)); - return (ENOMEM); - } + if ((ret = __os_calloc(1, sizeof(DB_TXNMGR), &tmgrp)) != 0) + return (ret); /* Initialize the transaction manager structure. */ tmgrp->mutexp = NULL; @@ -151,7 +154,7 @@ txn_open(path, flags, mode, dbenv, mgrpp) if (path == NULL) tmgrp->reginfo.path = NULL; else - if ((tmgrp->reginfo.path = (char *)__db_strdup(path)) == NULL) + if ((ret = __os_strdup(path, &tmgrp->reginfo.path)) != 0) goto err; tmgrp->reginfo.file = DEFAULT_TXN_FILE; tmgrp->reginfo.mode = mode; @@ -207,36 +210,96 @@ err: if (tmgrp->reginfo.addr != NULL) { } if (tmgrp->reginfo.path != NULL) - FREES(tmgrp->reginfo.path); - FREE(tmgrp, sizeof(*tmgrp)); + __os_freestr(tmgrp->reginfo.path); + __os_free(tmgrp, sizeof(*tmgrp)); return (ret); } /* - * Internally, we use TXN_DETAIL structures, but we allocate and return - * DB_TXN structures that provide access to the transaction ID and the - * offset in the transaction region of the TXN_DETAIL structure. + * __txn_panic -- + * Panic a transaction region. + * + * PUBLIC: void __txn_panic __P((DB_ENV *)); + */ +void +__txn_panic(dbenv) + DB_ENV *dbenv; +{ + if (dbenv->tx_info != NULL) + dbenv->tx_info->region->hdr.panic = 1; +} + +/* + * txn_begin -- + * This is a wrapper to the actual begin process. Normal txn_begin() + * allocates a DB_TXN structure for the caller, while txn_xa_begin() does + * not. Other than that, both call into the common __txn_begin code(). + * + * Internally, we use TXN_DETAIL structures, but the DB_TXN structure + * provides access to the transaction ID and the offset in the transaction + * region of the TXN_DETAIL structure. 
*/ int txn_begin(tmgrp, parent, txnpp) DB_TXNMGR *tmgrp; - DB_TXN *parent; - DB_TXN **txnpp; + DB_TXN *parent, **txnpp; { - DB_LSN begin_lsn; - DB_TXN *retp; - TXN_DETAIL *txnp; - size_t off; - u_int32_t id; + DB_TXN *txn; int ret; - txnp = NULL; - *txnpp = NULL; + TXN_PANIC_CHECK(tmgrp); - if ((retp = (DB_TXN *)__db_malloc(sizeof(DB_TXN))) == NULL) { - __db_err(tmgrp->dbenv, "txn_begin : %s", strerror(ENOMEM)); - return (ENOMEM); + if ((ret = __os_calloc(1, sizeof(DB_TXN), &txn)) != 0) + return (ret); + + txn->parent = parent; + TAILQ_INIT(&txn->kids); + txn->mgrp = tmgrp; + txn->flags = TXN_MALLOC; + if ((ret = __txn_begin(txn)) != 0) { + __os_free(txn, sizeof(DB_TXN)); + txn = NULL; } + if (txn != NULL && parent != NULL) + TAILQ_INSERT_HEAD(&parent->kids, txn, klinks); + *txnpp = txn; + return (ret); +} + +/* + * __txn_xa_begin -- + * XA version of txn_begin. + * + * PUBLIC: int __txn_xa_begin __P((DB_ENV *, DB_TXN *)); + */ +int +__txn_xa_begin(dbenv, txn) + DB_ENV *dbenv; + DB_TXN *txn; +{ + TXN_PANIC_CHECK(dbenv->tx_info); + + memset(txn, 0, sizeof(DB_TXN)); + + txn->mgrp = dbenv->tx_info; + + return (__txn_begin(txn)); +} + +/* + * __txn_begin -- + * Normal DB version of txn_begin. + */ +static int +__txn_begin(txn) + DB_TXN *txn; +{ + DB_LSN begin_lsn; + DB_TXNMGR *mgr; + TXN_DETAIL *td; + size_t off; + u_int32_t id; + int ret; /* * We do not have to write begin records (and if we do not, then we @@ -244,65 +307,67 @@ txn_begin(tmgrp, parent, txnpp) * we do need to find the current LSN so that we can store it in the * transaction structure, so we can know where to take checkpoints. */ - if (tmgrp->dbenv->lg_info != NULL && (ret = - log_put(tmgrp->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0) + mgr = txn->mgrp; + if (mgr->dbenv->lg_info != NULL && (ret = + log_put(mgr->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0) goto err2; - LOCK_TXNREGION(tmgrp); + LOCK_TXNREGION(mgr); /* Make sure that last_txnid is not going to wrap around. 
*/ - if (tmgrp->region->last_txnid == TXN_INVALID) { - __db_err(tmgrp->dbenv, "txn_begin: %s %s", + if (mgr->region->last_txnid == TXN_INVALID) { + __db_err(mgr->dbenv, "txn_begin: %s %s", "Transaction ID wrapping.", "Snapshot your database and start a new log."); ret = EINVAL; goto err1; } - if ((ret = __txn_validate_region(tmgrp)) != 0) + if ((ret = __txn_validate_region(mgr)) != 0) goto err1; /* Allocate a new transaction detail structure. */ - if ((ret = __db_shalloc(tmgrp->mem, sizeof(TXN_DETAIL), 0, &txnp)) != 0 - && ret == ENOMEM && (ret = __txn_grow_region(tmgrp)) == 0) - ret = __db_shalloc(tmgrp->mem, sizeof(TXN_DETAIL), 0, &txnp); + if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0 + && ret == ENOMEM && (ret = __txn_grow_region(mgr)) == 0) + ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td); if (ret != 0) goto err1; /* Place transaction on active transaction list. */ - SH_TAILQ_INSERT_HEAD(&tmgrp->region->active_txn, - txnp, links, __txn_detail); - - id = ++tmgrp->region->last_txnid; - tmgrp->region->nbegins++; - - txnp->txnid = id; - txnp->begin_lsn = begin_lsn; - ZERO_LSN(txnp->last_lsn); - txnp->last_lock = 0; - txnp->status = TXN_RUNNING; - off = (u_int8_t *)txnp - (u_int8_t *)tmgrp->region; - UNLOCK_TXNREGION(tmgrp); + SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail); + + id = ++mgr->region->last_txnid; + ++mgr->region->nbegins; + + td->txnid = id; + td->begin_lsn = begin_lsn; + ZERO_LSN(td->last_lsn); + td->last_lock = 0; + td->status = TXN_RUNNING; + if (txn->parent != NULL) + td->parent = txn->parent->off; + else + td->parent = 0; - ZERO_LSN(retp->last_lsn); - retp->txnid = id; - retp->parent = parent; - retp->mgrp = tmgrp; - retp->off = off; + off = (u_int8_t *)td - (u_int8_t *)mgr->region; + UNLOCK_TXNREGION(mgr); + + ZERO_LSN(txn->last_lsn); + txn->txnid = id; + txn->off = off; - LOCK_TXNTHREAD(tmgrp); - TAILQ_INSERT_TAIL(&tmgrp->txn_chain, retp, links); - UNLOCK_TXNTHREAD(tmgrp); + if 
(F_ISSET(txn, TXN_MALLOC)) { + LOCK_TXNTHREAD(mgr); + TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links); + UNLOCK_TXNTHREAD(mgr); + } - *txnpp = retp; return (0); -err1: UNLOCK_TXNREGION(tmgrp); +err1: UNLOCK_TXNREGION(mgr); -err2: __db_free(retp); - return (ret); +err2: return (ret); } - /* * txn_commit -- * Commit a transaction. @@ -312,21 +377,43 @@ txn_commit(txnp) DB_TXN *txnp; { DB_LOG *logp; + DB_TXNMGR *mgr; int ret; - if ((ret = __txn_check_running(txnp)) != 0) + mgr = txnp->mgrp; + + TXN_PANIC_CHECK(mgr); + if ((ret = __txn_check_running(txnp, NULL)) != 0) return (ret); /* * If there are any log records, write a log record and sync - * the log, else do no log writes. + * the log, else do no log writes. If the commit is for a child + * transaction, we do not need to commit the child synchronously + * since if its parent aborts, it will abort too and its parent + * (or ultimate ancestor) will write synchronously. */ - if ((logp = txnp->mgrp->dbenv->lg_info) != NULL && - !IS_ZERO_LSN(txnp->last_lsn) && - (ret = __txn_regop_log(logp, txnp, &txnp->last_lsn, - F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, - TXN_COMMIT)) != 0) - return (ret); + if ((logp = mgr->dbenv->lg_info) != NULL && + !IS_ZERO_LSN(txnp->last_lsn)) { + if (txnp->parent == NULL) + ret = __txn_regop_log(logp, txnp, &txnp->last_lsn, + F_ISSET(mgr, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, + TXN_COMMIT); + else + ret = __txn_child_log(logp, txnp, &txnp->last_lsn, 0, + TXN_COMMIT, txnp->parent->txnid); + if (ret != 0) + return (ret); + } + + /* + * If this is the senior ancestor (i.e., it has no children), then we + * can release all the child transactions since everyone is committing. + * Then we can release this transaction. If this is not the ultimate + * ancestor, then we can neither free it or its children. 
+ */ + if (txnp->parent == NULL) + __txn_freekids(txnp); return (__txn_end(txnp, 1)); } @@ -340,10 +427,17 @@ txn_abort(txnp) DB_TXN *txnp; { int ret; + DB_TXN *kids; - if ((ret = __txn_check_running(txnp)) != 0) + TXN_PANIC_CHECK(txnp->mgrp); + if ((ret = __txn_check_running(txnp, NULL)) != 0) return (ret); + for (kids = TAILQ_FIRST(&txnp->kids); + kids != NULL; + kids = TAILQ_FIRST(&txnp->kids)) + txn_abort(kids); + if ((ret = __txn_undo(txnp)) != 0) { __db_err(txnp->mgrp->dbenv, "txn_abort: Log undo failed %s", strerror(ret)); @@ -353,30 +447,45 @@ txn_abort(txnp) } /* - * Flush the log so a future commit is guaranteed to succeed. + * txn_prepare -- + * Flush the log so a future commit is guaranteed to succeed. */ int txn_prepare(txnp) DB_TXN *txnp; { - TXN_DETAIL *tp; + DBT xid; + DB_ENV *dbenv; + TXN_DETAIL *td; int ret; - if ((ret = __txn_check_running(txnp)) != 0) + if ((ret = __txn_check_running(txnp, &td)) != 0) return (ret); - if (txnp->mgrp->dbenv->lg_info != NULL) { - if ((ret = log_flush(txnp->mgrp->dbenv->lg_info, - &txnp->last_lsn)) != 0) - __db_err(txnp->mgrp->dbenv, - "txn_prepare: log_flush failed %s\n", - strerror(ret)); + dbenv = txnp->mgrp->dbenv; + memset(&xid, 0, sizeof(xid)); + xid.data = td->xid; + /* + * We indicate that a transaction is an XA transaction by putting + * a valid size in the xid.size field. XA requires that the transaction + * be either ENDED or SUSPENDED when prepare is called, so we know + * that if the xa_status isn't in one of those states, but we are + * calling prepare that we are not an XA transaction. + */ + xid.size = + td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED ? + 0 : sizeof(td->xid); + if (dbenv->lg_info != NULL && + (ret = __txn_xa_regop_log(dbenv->lg_info, txnp, &txnp->last_lsn, + F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 
0 : DB_FLUSH, TXN_PREPARE, + &xid, td->format, td->gtrid, td->bqual, &td->begin_lsn)) != 0) { + __db_err(dbenv, + "txn_prepare: log_write failed %s\n", strerror(ret)); return (ret); } LOCK_TXNTHREAD(txnp->mgrp); - tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off); - tp->status = TXN_PREPARED; + td->status = TXN_PREPARED; UNLOCK_TXNTHREAD(txnp->mgrp); return (ret); } @@ -402,6 +511,8 @@ txn_close(tmgrp) DB_TXN *txnp; int ret, t_ret; + TXN_PANIC_CHECK(tmgrp); + ret = 0; /* @@ -431,8 +542,8 @@ txn_close(tmgrp) ret = t_ret; if (tmgrp->reginfo.path != NULL) - FREES(tmgrp->reginfo.path); - FREE(tmgrp, sizeof(*tmgrp)); + __os_freestr(tmgrp->reginfo.path); + __os_free(tmgrp, sizeof(*tmgrp)); return (ret); } @@ -453,12 +564,12 @@ txn_unlink(path, force, dbenv) memset(&reginfo, 0, sizeof(reginfo)); reginfo.dbenv = dbenv; reginfo.appname = DB_APP_NONE; - if (path != NULL && (reginfo.path = (char *)__db_strdup(path)) == NULL) - return (ENOMEM); + if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0) + return (ret); reginfo.file = DEFAULT_TXN_FILE; ret = __db_runlink(&reginfo, force); if (reginfo.path != NULL) - FREES(reginfo.path); + __os_freestr(reginfo.path); return (ret); } @@ -468,16 +579,23 @@ txn_unlink(path, force, dbenv) * Return 0 if the txnp is reasonable, otherwise returns EINVAL. */ static int -__txn_check_running(txnp) +__txn_check_running(txnp, tdp) const DB_TXN *txnp; + TXN_DETAIL **tdp; { TXN_DETAIL *tp; tp = NULL; if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) { tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off); - if (tp->status != TXN_RUNNING) + /* + * Child transactions could be marked committed which is OK. + */ + if (tp->status != TXN_RUNNING && + tp->status != TXN_PREPARED && tp->status != TXN_COMMITTED) tp = NULL; + if (tdp != NULL) + *tdp = tp; } return (tp == NULL ? 
EINVAL : 0); @@ -488,25 +606,22 @@ __txn_end(txnp, is_commit) DB_TXN *txnp; int is_commit; { + DB_LOCKREQ request; DB_TXNMGR *mgr; TXN_DETAIL *tp; - DB_LOCKREQ request; - int ret; u_int32_t locker; + int ret; mgr = txnp->mgrp; - LOCK_TXNTHREAD(mgr); - TAILQ_REMOVE(&mgr->txn_chain, txnp, links); - UNLOCK_TXNTHREAD(mgr); - /* Release the locks. */ locker = txnp->txnid; - request.op = DB_LOCK_PUT_ALL; + request.op = txnp->parent == NULL || + is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT; if (mgr->dbenv->lk_info) { - ret = lock_vec(mgr->dbenv->lk_info, locker, 0, - &request, 1, NULL); + ret = + lock_tvec(mgr->dbenv->lk_info, txnp, 0, &request, 1, NULL); if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) { __db_err(mgr->dbenv, "%s: release locks failed %s", is_commit ? "txn_commit" : "txn_abort", @@ -517,16 +632,44 @@ __txn_end(txnp, is_commit) /* End the transaction. */ LOCK_TXNREGION(mgr); + + /* + * Child transactions that are committing cannot be released until + * the parent commits, since the parent may abort, causing the child + * to abort as well. + */ tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off); - SH_TAILQ_REMOVE(&mgr->region->active_txn, tp, links, __txn_detail); - __db_shalloc_free(mgr->mem, tp); + if (txnp->parent == NULL || !is_commit) { + SH_TAILQ_REMOVE(&mgr->region->active_txn, + tp, links, __txn_detail); + + __db_shalloc_free(mgr->mem, tp); + } else + tp->status = is_commit ? TXN_COMMITTED : TXN_ABORTED; + if (is_commit) mgr->region->ncommits++; else mgr->region->naborts++; + UNLOCK_TXNREGION(mgr); - FREE(txnp, sizeof(*txnp)); + /* + * If the transaction aborted, we can remove it from its parent links. + * If it committed, then we need to leave it on, since the parent can + * still abort. + */ + if (txnp->parent != NULL && !is_commit) + TAILQ_REMOVE(&txnp->parent->kids, txnp, klinks); + + /* Free the space. 
*/ + if (F_ISSET(txnp, TXN_MALLOC) && (txnp->parent == NULL || !is_commit)) { + LOCK_TXNTHREAD(mgr); + TAILQ_REMOVE(&mgr->txn_chain, txnp, links); + UNLOCK_TXNTHREAD(mgr); + + __os_free(txnp, sizeof(*txnp)); + } return (0); } @@ -571,7 +714,7 @@ __txn_undo(txnp) ret = mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL); if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) { - __db_free(rdbt.data); + __os_free(rdbt.data, rdbt.size); rdbt.data = NULL; } } @@ -597,13 +740,15 @@ txn_checkpoint(mgr, kbytes, minutes) const DB_TXNMGR *mgr; u_int32_t kbytes, minutes; { - TXN_DETAIL *txnp; - DB_LSN ckp_lsn, last_ckp; DB_LOG *dblp; - u_int32_t kbytes_written; + DB_LSN ckp_lsn, sync_lsn, last_ckp; + TXN_DETAIL *txnp; time_t last_ckp_time, now; + u_int32_t kbytes_written; int ret; + TXN_PANIC_CHECK(mgr); + /* * Check if we need to run recovery. */ @@ -672,8 +817,13 @@ do_ckp: mgr->region->pending_ckp = ckp_lsn; UNLOCK_TXNREGION(mgr); + /* + * memp_sync may change the lsn you pass it, so don't pass it + * the actual ckp_lsn, pass it a temp instead. + */ + sync_lsn = ckp_lsn; if (mgr->dbenv->mp_info != NULL && - (ret = memp_sync(mgr->dbenv->mp_info, &ckp_lsn)) != 0) { + (ret = memp_sync(mgr->dbenv->mp_info, &sync_lsn)) != 0) { /* * ret == DB_INCOMPLETE means that there are still buffers to * flush, the checkpoint is not complete. Wait and try again. @@ -776,6 +926,9 @@ txn_stat(mgr, statp, db_malloc) TXN_DETAIL *txnp; size_t nbytes; u_int32_t nactive, ndx; + int ret; + + TXN_PANIC_CHECK(mgr); LOCK_TXNREGION(mgr); nactive = mgr->region->nbegins - @@ -787,13 +940,8 @@ txn_stat(mgr, statp, db_malloc) * that have been created since we unlocked the region. 
*/ nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200); - if (db_malloc == NULL) - stats = (DB_TXN_STAT *)__db_malloc(nbytes); - else - stats = (DB_TXN_STAT *)db_malloc(nbytes); - - if (stats == NULL) - return (ENOMEM); + if ((ret = __os_malloc(nbytes, db_malloc, &stats)) != 0) + return (ret); LOCK_TXNREGION(mgr); stats->st_last_txnid = mgr->region->last_txnid; @@ -831,3 +979,68 @@ txn_stat(mgr, statp, db_malloc) *statp = stats; return (0); } + +static void +__txn_freekids(txnp) + DB_TXN *txnp; +{ + DB_TXNMGR *mgr; + TXN_DETAIL *tp; + DB_TXN *kids; + + mgr = txnp->mgrp; + + for (kids = TAILQ_FIRST(&txnp->kids); + kids != NULL; + kids = TAILQ_FIRST(&txnp->kids)) { + /* Free any children of this transaction. */ + __txn_freekids(kids); + + /* Free the transaction detail in the region. */ + LOCK_TXNREGION(mgr); + tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + kids->off); + SH_TAILQ_REMOVE(&mgr->region->active_txn, + tp, links, __txn_detail); + + __db_shalloc_free(mgr->mem, tp); + UNLOCK_TXNREGION(mgr); + + /* Now remove from its parent. */ + TAILQ_REMOVE(&txnp->kids, kids, klinks); + if (F_ISSET(txnp, TXN_MALLOC)) { + LOCK_TXNTHREAD(mgr); + TAILQ_REMOVE(&mgr->txn_chain, kids, links); + UNLOCK_TXNTHREAD(mgr); + __os_free(kids, sizeof(*kids)); + } + } +} + +/* + * __txn_is_ancestor -- + * Determine if a transaction is an ancestor of another transaction. + * This is used during lock promotion when we do not have the per-process + * data structures that link parents together. Instead, we'll have to + * follow the links in the transaction region. 
+ * + * PUBLIC: int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t)); + */ +int +__txn_is_ancestor(mgr, hold_off, req_off) + DB_TXNMGR *mgr; + size_t hold_off, req_off; +{ + TXN_DETAIL *hold_tp, *req_tp; + + hold_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + hold_off); + req_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + req_off); + + while (req_tp->parent != 0) { + req_tp = + (TXN_DETAIL *)((u_int8_t *)mgr->region + req_tp->parent); + if (req_tp->txnid == hold_tp->txnid) + return (1); + } + + return (0); +} diff --git a/db2/txn/txn.src b/db2/txn/txn.src index 04809b69d6..c9614f6d6b 100644 --- a/db2/txn/txn.src +++ b/db2/txn/txn.src @@ -4,26 +4,52 @@ * Copyright (c) 1996, 1997, 1998 * Sleepycat Software. All rights reserved. * - * @(#)txn.src 10.3 (Sleepycat) 4/10/98 + * @(#)txn.src 10.6 (Sleepycat) 1/3/99 */ PREFIX txn /* - * Everything except for checkpointing takes the same logging routine. + * This is the standard log operation for commit. */ BEGIN regop ARG opcode u_int32_t lu END /* - * This is the checkpoint record. It contains the lsn that the checkpoint - * guarantees and a pointer to the last checkpoint so that we can walk - * backwards by checkpoint. + * This is the checkpoint record. It contains the lsn that the checkpoint + * guarantees and a pointer to the last checkpoint so we can walk backwards + * by checkpoint. + * * ckp_lsn: + * The lsn in the log of the most recent point at which all begun + * transactions have been aborted. This is the point for which + * the checkpoint is relevant. * last_ckp: + * The previous checkpoint. */ BEGIN ckp POINTER ckp_lsn DB_LSN * lu POINTER last_ckp DB_LSN * lu END + +/* + * This is the standard log operation for prepare (since right now + * we only use prepare in an XA environment). + */ +BEGIN xa_regop +ARG opcode u_int32_t lu +DBT xid DBT s +ARG formatID int32_t ld +ARG gtrid u_int32_t u +ARG bqual u_int32_t u +POINTER begin_lsn DB_LSN * lu +END + +/* + * This is the log operation for a child commit. 
+ */ +BEGIN child +ARG opcode u_int32_t lu +ARG parent u_int32_t lu +END diff --git a/db2/txn/txn_auto.c b/db2/txn/txn_auto.c index f03a52991f..e6d431f089 100644 --- a/db2/txn/txn_auto.c +++ b/db2/txn/txn_auto.c @@ -10,7 +10,6 @@ #endif #include "db_int.h" -#include "shqueue.h" #include "db_page.h" #include "db_dispatch.h" #include "txn.h" @@ -37,15 +36,14 @@ int __txn_regop_log(logp, txnid, ret_lsnp, flags, rectype = DB_txn_regop; txn_num = txnid == NULL ? 0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + sizeof(opcode); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -63,7 +61,7 @@ int __txn_regop_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -101,7 +99,7 @@ __txn_regop_print(notused1, dbtp, lsnp, notused2, notused3) (u_long)argp->prev_lsn.offset); printf("\topcode: %lu\n", (u_long)argp->opcode); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -115,11 +113,12 @@ __txn_regop_read(recbuf, argpp) { __txn_regop_args *argp; u_int8_t *bp; + int ret; - argp = (__txn_regop_args *)__db_malloc(sizeof(__txn_regop_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__txn_regop_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -157,16 +156,15 @@ int __txn_ckp_log(logp, txnid, ret_lsnp, flags, rectype = DB_txn_ckp; txn_num = txnid == NULL ? 
0 : txnid->txnid; if (txnid == NULL) { - null_lsn.file = 0; - null_lsn.offset = 0; + ZERO_LSN(null_lsn); lsnp = &null_lsn; } else lsnp = &txnid->last_lsn; logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + sizeof(*ckp_lsn) + sizeof(*last_ckp); - if ((logrec.data = (void *)__db_malloc(logrec.size)) == NULL) - return (ENOMEM); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); bp = logrec.data; memcpy(bp, &rectype, sizeof(rectype)); @@ -192,7 +190,7 @@ int __txn_ckp_log(logp, txnid, ret_lsnp, flags, ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); if (txnid != NULL) txnid->last_lsn = *ret_lsnp; - __db_free(logrec.data); + __os_free(logrec.data, 0); return (ret); } @@ -233,7 +231,7 @@ __txn_ckp_print(notused1, dbtp, lsnp, notused2, notused3) printf("\tlast_ckp: [%lu][%lu]\n", (u_long)argp->last_ckp.file, (u_long)argp->last_ckp.offset); printf("\n"); - __db_free(argp); + __os_free(argp, 0); return (0); } @@ -247,11 +245,12 @@ __txn_ckp_read(recbuf, argpp) { __txn_ckp_args *argp; u_int8_t *bp; + int ret; - argp = (__txn_ckp_args *)__db_malloc(sizeof(__txn_ckp_args) + - sizeof(DB_TXN)); - if (argp == NULL) - return (ENOMEM); + ret = __os_malloc(sizeof(__txn_ckp_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); argp->txnid = (DB_TXN *)&argp[1]; bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); @@ -269,6 +268,310 @@ __txn_ckp_read(recbuf, argpp) } /* + * PUBLIC: int __txn_xa_regop_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, const DBT *, int32_t, u_int32_t, + * PUBLIC: u_int32_t, DB_LSN *)); + */ +int __txn_xa_regop_log(logp, txnid, ret_lsnp, flags, + opcode, xid, formatID, gtrid, bqual, begin_lsn) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + const DBT *xid; + int32_t formatID; + u_int32_t gtrid; + u_int32_t bqual; + DB_LSN * begin_lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + 
u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_txn_xa_regop; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(u_int32_t) + (xid == NULL ? 0 : xid->size) + + sizeof(formatID) + + sizeof(gtrid) + + sizeof(bqual) + + sizeof(*begin_lsn); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + if (xid == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &xid->size, sizeof(xid->size)); + bp += sizeof(xid->size); + memcpy(bp, xid->data, xid->size); + bp += xid->size; + } + memcpy(bp, &formatID, sizeof(formatID)); + bp += sizeof(formatID); + memcpy(bp, &gtrid, sizeof(gtrid)); + bp += sizeof(gtrid); + memcpy(bp, &bqual, sizeof(bqual)); + bp += sizeof(bqual); + if (begin_lsn != NULL) + memcpy(bp, begin_lsn, sizeof(*begin_lsn)); + else + memset(bp, 0, sizeof(*begin_lsn)); + bp += sizeof(*begin_lsn); +#ifdef DIAGNOSTIC + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, 0); + return (ret); +} + +/* + * PUBLIC: int __txn_xa_regop_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__txn_xa_regop_print(notused1, dbtp, lsnp, notused2, notused3) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused2; + void *notused3; +{ + __txn_xa_regop_args *argp; + u_int32_t i; + u_int ch; + 
int ret; + + i = 0; + ch = 0; + notused1 = NULL; + notused2 = 0; + notused3 = NULL; + + if ((ret = __txn_xa_regop_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]txn_xa_regop: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\txid: "); + for (i = 0; i < argp->xid.size; i++) { + ch = ((u_int8_t *)argp->xid.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tformatID: %ld\n", (long)argp->formatID); + printf("\tgtrid: %u\n", argp->gtrid); + printf("\tbqual: %u\n", argp->bqual); + printf("\tbegin_lsn: [%lu][%lu]\n", + (u_long)argp->begin_lsn.file, (u_long)argp->begin_lsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +/* + * PUBLIC: int __txn_xa_regop_read __P((void *, __txn_xa_regop_args **)); + */ +int +__txn_xa_regop_read(recbuf, argpp) + void *recbuf; + __txn_xa_regop_args **argpp; +{ + __txn_xa_regop_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(sizeof(__txn_xa_regop_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->xid.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->xid.data = bp; + bp += argp->xid.size; + memcpy(&argp->formatID, bp, sizeof(argp->formatID)); + bp += sizeof(argp->formatID); + memcpy(&argp->gtrid, bp, sizeof(argp->gtrid)); + bp += sizeof(argp->gtrid); + memcpy(&argp->bqual, bp, 
sizeof(argp->bqual)); + bp += sizeof(argp->bqual); + memcpy(&argp->begin_lsn, bp, sizeof(argp->begin_lsn)); + bp += sizeof(argp->begin_lsn); + *argpp = argp; + return (0); +} + +/* + * PUBLIC: int __txn_child_log + * PUBLIC: __P((DB_LOG *, DB_TXN *, DB_LSN *, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t)); + */ +int __txn_child_log(logp, txnid, ret_lsnp, flags, + opcode, parent) + DB_LOG *logp; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + u_int32_t parent; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_txn_child; + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(parent); + if ((ret = __os_malloc(logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &parent, sizeof(parent)); + bp += sizeof(parent); +#ifdef DIAGNOSTIC + if ((u_int32_t)(bp - (u_int8_t *)logrec.data) != logrec.size) + fprintf(stderr, "Error in log record length"); +#endif + ret = log_put(logp, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, 0); + return (ret); +} + +/* + * PUBLIC: int __txn_child_print + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__txn_child_print(notused1, dbtp, lsnp, notused2, notused3) + DB_LOG *notused1; + DBT *dbtp; + DB_LSN *lsnp; + int notused2; + void *notused3; +{ + __txn_child_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused1 = NULL; + notused2 = 0; + notused3 = NULL; + + if ((ret 
= __txn_child_read(dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]txn_child: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tparent: %lu\n", (u_long)argp->parent); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +/* + * PUBLIC: int __txn_child_read __P((void *, __txn_child_args **)); + */ +int +__txn_child_read(recbuf, argpp) + void *recbuf; + __txn_child_args **argpp; +{ + __txn_child_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(sizeof(__txn_child_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->parent, bp, sizeof(argp->parent)); + bp += sizeof(argp->parent); + *argpp = argp; + return (0); +} + +/* * PUBLIC: int __txn_init_print __P((DB_ENV *)); */ int @@ -283,6 +586,12 @@ __txn_init_print(dbenv) if ((ret = __db_add_recovery(dbenv, __txn_ckp_print, DB_txn_ckp)) != 0) return (ret); + if ((ret = __db_add_recovery(dbenv, + __txn_xa_regop_print, DB_txn_xa_regop)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __txn_child_print, DB_txn_child)) != 0) + return (ret); return (0); } @@ -301,6 +610,12 @@ __txn_init_recover(dbenv) if ((ret = __db_add_recovery(dbenv, __txn_ckp_recover, DB_txn_ckp)) != 0) return (ret); + if ((ret = __db_add_recovery(dbenv, + __txn_xa_regop_recover, DB_txn_xa_regop)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __txn_child_recover, DB_txn_child)) != 
0) + return (ret); return (0); } diff --git a/db2/txn/txn_rec.c b/db2/txn/txn_rec.c index e53dc5f3b7..f21a0f92c8 100644 --- a/db2/txn/txn_rec.c +++ b/db2/txn/txn_rec.c @@ -40,7 +40,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "@(#)txn_rec.c 10.11 (Sleepycat) 5/3/98"; +static const char sccsid[] = "@(#)txn_rec.c 10.15 (Sleepycat) 1/3/99"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -54,10 +54,18 @@ static const char sccsid[] = "@(#)txn_rec.c 10.11 (Sleepycat) 5/3/98"; #include "shqueue.h" #include "txn.h" #include "db_am.h" +#include "log.h" +#include "common_ext.h" +static int __txn_restore_txn __P((DB_ENV *, DB_LSN *, __txn_xa_regop_args *)); + +#define IS_XA_TXN(R) (R->xid.size != 0) + /* * PUBLIC: int __txn_regop_recover - * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + * + * These records are only ever written for commits. */ int __txn_regop_recover(logp, dbtp, lsnp, redo, info) @@ -79,24 +87,80 @@ __txn_regop_recover(logp, dbtp, lsnp, redo, info) if ((ret = __txn_regop_read(dbtp->data, &argp)) != 0) return (ret); - switch (argp->opcode) { - case TXN_COMMIT: - if (__db_txnlist_find(info, - argp->txnid->txnid) == DB_NOTFOUND) - __db_txnlist_add(info, argp->txnid->txnid); - break; - case TXN_PREPARE: /* Nothing to do. */ - /* Call __db_txnlist_find so that we update the maxid. */ - (void)__db_txnlist_find(info, argp->txnid->txnid); - break; - default: + if (argp->opcode != TXN_COMMIT) + ret = EINVAL; + else + if (__db_txnlist_find(info, argp->txnid->txnid) == DB_NOTFOUND) + ret = __db_txnlist_add(info, argp->txnid->txnid); + + if (ret == 0) + *lsnp = argp->prev_lsn; + __os_free(argp, 0); + + return (ret); +} + +/* + * PUBLIC: int __txn_xa_regop_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + * + * These records are only ever written for prepares. 
+ */ +int +__txn_xa_regop_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __txn_xa_regop_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + (void)__txn_xa_regop_print(logp, dbtp, lsnp, redo, info); +#endif + COMPQUIET(redo, 0); + COMPQUIET(logp, NULL); + + if ((ret = __txn_xa_regop_read(dbtp->data, &argp)) != 0) + return (ret); + + if (argp->opcode != TXN_PREPARE) ret = EINVAL; - break; + else { + /* + * Whether we are in XA or not, we need to call + * __db_txnlist_find so that we update the maxid. + * If this is an XA transaction, then we treat + * prepares like commits so that we roll forward to + * a point where we can handle commit/abort calls + * from the TMS. If this isn't XA, then a prepare + * is treated like a No-op; we only care about the + * commit. + */ + ret = __db_txnlist_find(info, argp->txnid->txnid); + if (IS_XA_TXN(argp) && ret == DB_NOTFOUND) { + /* + * This is an XA prepared, but not yet committed + * transaction. We need to add it to the + * transaction list, so that it gets rolled + * forward. We also have to add it to the region's + * internal state so it can be properly aborted + * or recovered. + */ + ret = __db_txnlist_add(info, argp->txnid->txnid); + if (ret == 0) + ret = __txn_restore_txn(logp->dbenv, + lsnp, argp); + } } - *lsnp = argp->prev_lsn; - __db_free(argp); - return (0); + if (ret == 0) + *lsnp = argp->prev_lsn; + __os_free(argp, 0); + + return (ret); } /* @@ -130,7 +194,103 @@ __txn_ckp_recover(logp, dbtp, lsnp, redo, info) if (argp->ckp_lsn.file == lsnp->file && argp->ckp_lsn.offset == lsnp->offset) __db_txnlist_gen(info, redo ? -1 : 1); + *lsnp = argp->last_ckp; - __db_free(argp); + __os_free(argp, 0); return (DB_TXN_CKP); } + +/* + * __txn_child_recover + * Recover a commit record for a child transaction. 
+ * + * PUBLIC: int __txn_child_recover + * PUBLIC: __P((DB_LOG *, DBT *, DB_LSN *, int, void *)); + */ +int +__txn_child_recover(logp, dbtp, lsnp, redo, info) + DB_LOG *logp; + DBT *dbtp; + DB_LSN *lsnp; + int redo; + void *info; +{ + __txn_child_args *argp; + int ret; + +#ifdef DEBUG_RECOVER + (void)__txn_child_print(logp, dbtp, lsnp, redo, info); +#endif + COMPQUIET(redo, 0); + COMPQUIET(logp, NULL); + + if ((ret = __txn_child_read(dbtp->data, &argp)) != 0) + return (ret); + + /* + * We count the child as committed only if its parent committed. + * So, if we are not yet in the transaction list, but our parent + * is, then we should go ahead and commit. + */ + if (argp->opcode != TXN_COMMIT) + ret = EINVAL; + else + if (__db_txnlist_find(info, argp->parent) == 0 && + __db_txnlist_find(info, argp->txnid->txnid) == DB_NOTFOUND) + ret = __db_txnlist_add(info, argp->txnid->txnid); + + if (ret == 0) + *lsnp = argp->prev_lsn; + __os_free(argp, 0); + + return (ret); +} + +/* + * __txn_restore_txn -- + * Using only during XA recovery. If we find any transactions that are + * prepared, but not yet committed, then we need to restore the transaction's + * state into the shared region, because the TM is going to issue a txn_abort + * or txn_commit and we need to respond correctly. + * + * lsnp is the LSN of the returned LSN + * argp is the perpare record (in an appropriate structure) + */ +static int +__txn_restore_txn(dbenv, lsnp, argp) + DB_ENV *dbenv; + DB_LSN *lsnp; + __txn_xa_regop_args *argp; +{ + DB_TXNMGR *mgr; + TXN_DETAIL *td; + int ret; + + if (argp->xid.size == 0) + return(0); + + mgr = dbenv->tx_info; + LOCK_TXNREGION(mgr); + + /* Allocate a new transaction detail structure. */ + if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0) + return (ret); + + /* Place transaction on active transaction list. 
*/ + SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail); + + td->txnid = argp->txnid->txnid; + td->begin_lsn = argp->begin_lsn; + td->last_lsn = *lsnp; + td->last_lock = 0; + td->parent = 0; + td->status = TXN_PREPARED; + td->xa_status = TXN_XA_PREPARED; + memcpy(td->xid, argp->xid.data, argp->xid.size); + td->bqual = argp->bqual; + td->gtrid = argp->gtrid; + td->format = argp->formatID; + + UNLOCK_TXNREGION(mgr); + return (0); +} diff --git a/db2/xa/xa.c b/db2/xa/xa.c new file mode 100644 index 0000000000..94a96e7e09 --- /dev/null +++ b/db2/xa/xa.c @@ -0,0 +1,682 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. All rights reserved. + */ + +/* XXX Remove the global transaction and hang it off the environment. */ +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)xa.c 10.4 (Sleepycat) 10/11/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "shqueue.h" +#include "log.h" +#include "txn.h" +#include "db_auto.h" +#include "db_ext.h" +#include "db_dispatch.h" + +static int __db_xa_close __P((char *, int, long)); +static int __db_xa_commit __P((XID *, int, long)); +static int __db_xa_complete __P((int *, int *, int, long)); +static int __db_xa_end __P((XID *, int, long)); +static int __db_xa_forget __P((XID *, int, long)); +static int __db_xa_open __P((char *, int, long)); +static int __db_xa_prepare __P((XID *, int, long)); +static int __db_xa_recover __P((XID *, long, int, long)); +static int __db_xa_rollback __P((XID *, int, long)); +static int __db_xa_start __P((XID *, int, long)); +static void __xa_txn_end __P((DB_ENV *)); +static void __xa_txn_init __P((DB_ENV *, TXN_DETAIL *, size_t)); + +/* + * Possible flag values: + * Dynamic registration 0 => no dynamic registration + * TMREGISTER => dynamic 
registration + * Asynchronous operation 0 => no support for asynchrony + * TMUSEASYNC => async support + * Migration support 0 => migration of transactions across + * threads is possible + * TMNOMIGRATE => no migration across threads + */ +const struct xa_switch_t db_xa_switch = { + "Berkeley DB", /* name[RMNAMESZ] */ + TMNOMIGRATE, /* flags */ + 0, /* version */ + __db_xa_open, /* xa_open_entry */ + __db_xa_close, /* xa_close_entry */ + __db_xa_start, /* xa_start_entry */ + __db_xa_end, /* xa_end_entry */ + __db_xa_rollback, /* xa_rollback_entry */ + __db_xa_prepare, /* xa_prepare_entry */ + __db_xa_commit, /* xa_commit_entry */ + __db_xa_recover, /* xa_recover_entry */ + __db_xa_forget, /* xa_forget_entry */ + __db_xa_complete /* xa_complete_entry */ +}; + +/* + * __db_xa_open -- + * The open call in the XA protocol. The rmid field is an id number + * that the TM assigned us and will pass us on every xa call. We need to + * map that rmid number into a dbenv structure that we create during + * initialization. Since this id number is thread specific, we do not + * need to store it in shared memory. The file xa_map.c implements all + * such xa->db mappings. + * The xa_info field is instance specific information. We require + * that the value of DB_HOME be passed in xa_info. Since xa_info is the + * only thing that we get to pass to db_appinit, any config information + * will have to be done via a config file instead of via the db_appinit + * call. + */ +static int +__db_xa_open(xa_info, rmid, flags) + char *xa_info; + int rmid; + long flags; +{ + DB_ENV *env; + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); + if (flags != TMNOFLAGS) + return (XAER_INVAL); + + /* Verify if we already have this environment open. */ + if (__db_rmid_to_env(rmid, &env, 0) == 0) + return (XA_OK); + + /* + * Since we cannot tell whether the environment is OK or not, + * we can't actually do the db_appinit in xa_open. Instead, + * we save the mapping between the rmid and the xa_info. 
If + * we next get a call to __xa_recover, we do the db_appinit + * with DB_RECOVER set. If we get any other call, then we + * do the db_appinit. + */ + return (__db_map_rmid_name(rmid, xa_info)); +} + +/* + * __db_xa_close -- + * The close call of the XA protocol. The only trickiness here + * is that if there are any active transactions, we must fail. It is + * *not* an error to call close on an environment that has already been + * closed (I am interpreting that to mean it's OK to call close on an + * environment that has never been opened). + */ +static int +__db_xa_close(xa_info, rmid, flags) + char *xa_info; + int rmid; + long flags; +{ + DB_ENV *env; + int ret, t_ret; + + COMPQUIET(xa_info, NULL); + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); + if (flags != TMNOFLAGS) + return (XAER_INVAL); + + /* If the environment is closed, then we're done. */ + if (__db_rmid_to_env(rmid, &env, 0) != 0) + return (XA_OK); + + /* Check if there are any pending transactions. */ + if (env->xa_txn != NULL && env->xa_txn->txnid != TXN_INVALID) + return (XAER_PROTO); + + /* Now, destroy the mapping and close the environment. */ + ret = __db_unmap_rmid(rmid); + if ((t_ret = db_appexit(env)) != 0 && ret == 0) + ret = t_ret; + + __os_free(env, sizeof(DB_ENV)); + + return (ret == 0 ? XA_OK : XAER_RMERR); +} + +/* + * __db_xa_start -- + * Begin a transaction for the current resource manager. 
+ */ +static int +__db_xa_start(xid, rmid, flags) + XID *xid; + int rmid; + long flags; +{ + DB_ENV *env; + TXN_DETAIL *td; + size_t off; + int is_known; + +#define OK_FLAGS (TMJOIN | TMRESUME | TMNOWAIT | TMASYNC | TMNOFLAGS) + if (LF_ISSET(~OK_FLAGS)) + return (XAER_INVAL); + + if (LF_ISSET(TMJOIN) && LF_ISSET(TMRESUME)) + return (XAER_INVAL); + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); + + if (__db_rmid_to_env(rmid, &env, 1) != 0) + return (XAER_PROTO); + + is_known = __db_xid_to_txn(env, xid, &off) == 0; + + if (is_known && !LF_ISSET(TMRESUME) && !LF_ISSET(TMJOIN)) + return (XAER_DUPID); + + if (!is_known && LF_ISSET(TMRESUME | TMJOIN)) + return (XAER_NOTA); + + /* + * This can't block, so we can ignore TMNOWAIT. + * + * Other error conditions: RMERR, RMFAIL, OUTSIDE, PROTO, RB* + */ + if (is_known) { + td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off); + if (td->xa_status == TXN_XA_SUSPENDED && !LF_SET(TMRESUME)) + return (XAER_PROTO); + if (td->xa_status == TXN_XA_DEADLOCKED) + return (XA_RBDEADLOCK); + if (td->xa_status == TXN_XA_ABORTED) + return (XA_RBOTHER); + + /* Now, fill in the global transaction structure. */ + __xa_txn_init(env, td, off); + td->xa_status = TXN_XA_STARTED; + } else { + if (__txn_xa_begin(env, env->xa_txn) != 0) + return (XAER_RMERR); + (void)__db_map_xid(env, xid, env->xa_txn->off); + td = (TXN_DETAIL *) + ((u_int8_t *)env->tx_info->region + env->xa_txn->off); + td->xa_status = TXN_XA_STARTED; + } + return (XA_OK); +} + +/* + * __db_xa_end -- + * Disassociate the current transaction from the current process. 
+ */ +static int +__db_xa_end(xid, rmid, flags) + XID *xid; + int rmid; + long flags; +{ + DB_ENV *env; + DB_TXN *txn; + TXN_DETAIL *td; + size_t off; + + if (flags != TMNOFLAGS && !LF_ISSET(TMSUSPEND | TMSUCCESS | TMFAIL)) + return (XAER_INVAL); + + if (__db_rmid_to_env(rmid, &env, 0) != 0) + return (XAER_PROTO); + + if (__db_xid_to_txn(env, xid, &off) != 0) + return (XAER_NOTA); + + txn = env->xa_txn; + if (off != txn->off) + return (XAER_PROTO); + + td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off); + if (td->xa_status == TXN_XA_DEADLOCKED) + return (XA_RBDEADLOCK); + + if (td->status == TXN_ABORTED) + return (XA_RBOTHER); + + if (td->xa_status != TXN_XA_STARTED) + return (XAER_PROTO); + + /* Update the shared memory last_lsn field */ + td->last_lsn = txn->last_lsn; + + /* + * If we ever support XA migration, we cannot keep SUSPEND/END + * status in the shared region; it would have to be process local. + */ + if (LF_ISSET(TMSUSPEND)) + td->xa_status = TXN_XA_SUSPENDED; + else + td->xa_status = TXN_XA_ENDED; + + txn->txnid = TXN_INVALID; + return (XA_OK); +} + +/* + * __db_xa_prepare -- + * Sync the log to disk so we can guarantee recoverability. + */ +static int +__db_xa_prepare(xid, rmid, flags) + XID *xid; + int rmid; + long flags; +{ + DB_ENV *env; + TXN_DETAIL *td; + size_t off; + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); + if (flags != TMNOFLAGS) + return (XAER_INVAL); + + /* + * We need to know if we've ever called prepare on this. + * As part of the prepare, we set the xa_status field to + * reflect that fact that prepare has been called, and if + * it's ever called again, it's an error. 
+ */ + if (__db_rmid_to_env(rmid, &env, 1) != 0) + return (XAER_PROTO); + + if (__db_xid_to_txn(env, xid, &off) != 0) + return (XAER_NOTA); + + td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off); + + if (td->xa_status == TXN_XA_DEADLOCKED) + return (XA_RBDEADLOCK); + + if (td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED) + return (XAER_PROTO); + + /* Now, fill in the global transaction structure. */ + __xa_txn_init(env, td, off); + + if (txn_prepare(env->xa_txn) != 0) + return (XAER_RMERR); + + td->xa_status = TXN_XA_PREPARED; + + /* No fatal value that would require an XAER_RMFAIL. */ + __xa_txn_end(env); + return (XA_OK); +} + +/* + * __db_xa_commit -- + * Commit the transaction + */ +static int +__db_xa_commit(xid, rmid, flags) + XID *xid; + int rmid; + long flags; +{ + DB_ENV *env; + TXN_DETAIL *td; + size_t off; + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); +#undef OK_FLAGS +#define OK_FLAGS (TMNOFLAGS | TMNOWAIT | TMONEPHASE) + if (LF_ISSET(~OK_FLAGS)) + return (XAER_INVAL); + + /* + * We need to know if we've ever called prepare on this. + * We can verify this by examining the xa_status field. + */ + if (__db_rmid_to_env(rmid, &env, 1) != 0) + return (XAER_PROTO); + + if (__db_xid_to_txn(env, xid, &off) != 0) + return (XAER_NOTA); + + td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off); + + if (td->xa_status == TXN_XA_DEADLOCKED) + return (XA_RBDEADLOCK); + + if (td->xa_status == TXN_XA_ABORTED) + return (XA_RBOTHER); + + if (LF_SET(TMONEPHASE) && + td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED) + return (XAER_PROTO); + + if (!LF_SET(TMONEPHASE) && td->xa_status != TXN_XA_PREPARED) + return (XAER_PROTO); + + /* Now, fill in the global transaction structure. */ + __xa_txn_init(env, td, off); + + if (txn_commit(env->xa_txn) != 0) + return (XAER_RMERR); + + /* No fatal value that would require an XAER_RMFAIL. 
*/ + __xa_txn_end(env); + return (XA_OK); +} + +/* + * __db_xa_recover -- + * Returns a list of prepared and heuristically completed transactions. + * + * The return value is the number of xids placed into the xid array (less + * than or equal to the count parameter). The flags are going to indicate + * whether we are starting a scan or continuing one. + */ +static int +__db_xa_recover(xids, count, rmid, flags) + XID *xids; + long count, flags; + int rmid; +{ + __txn_xa_regop_args *argp; + DBT data; + DB_ENV *env; + DB_LOG *log; + XID *xidp; + char *dbhome; + int err, ret; + u_int32_t rectype, txnid; + + ret = 0; + xidp = xids; + + + /* + * If we are starting a scan, then we need to open the environment + * and run recovery. This recovery puts us in a state where we can + * either commit or abort any transactions that were prepared but not + * yet committed. Once we've done that, we need to figure out where + * to begin checking for such transactions. If we are not starting + * a scan, then the environment had better have already been recovered + * and we'll start from * wherever the log cursor is. Since XA apps + * cannot be threaded, we don't have to worry about someone else + * having moved it. + */ + if (LF_ISSET(TMSTARTRSCAN)) { + /* If the environment is open, we have a problem. */ + if (__db_rmid_to_env(rmid, &env, 0) == XA_OK) + return (XAER_PROTO); + + if ((ret = __os_calloc(1, sizeof(DB_ENV), &env)) != 0) + return (XAER_RMERR); + + if (__db_rmid_to_name(rmid, &dbhome) != 0) + goto err1; + +#undef XA_FLAGS +#define XA_FLAGS DB_RECOVER | \ + DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN + if ((ret = db_appinit(dbhome, NULL, env, XA_FLAGS)) != 0) + goto err1; + + if (__db_map_rmid(rmid, env) != 0) + goto err2; + + /* Now figure out from where to begin scan. 
*/ + log = env->lg_info; + if ((err = __log_findckp(log, &log->xa_first)) == DB_NOTFOUND) { + /* + * If there were no log files, then we have no + * transactions to return, so we simply return 0. + */ + return (0); + } + if ((err = __db_txnlist_init(&log->xa_info)) != 0) + goto err3; + } else { + /* We had better already know about this rmid. */ + if (__db_rmid_to_env(rmid, &env, 0) != 0) + return (XAER_PROTO); + /* + * If we are not starting a scan, the log cursor had + * better be set. + */ + log = env->lg_info; + if (IS_ZERO_LSN(log->xa_lsn)) + return (XAER_PROTO); + } + + /* + * At this point log->xa_first contains the point in the log + * to which we need to roll back. If we are starting a scan, + * we'll start at the last record; if we're continuing a scan, + * we'll have to start at log->xa_lsn. + */ + + memset(&data, 0, sizeof(data)); + for (err = log_get(log, &log->xa_lsn, &data, + LF_ISSET(TMSTARTRSCAN) ? DB_LAST : DB_SET); + err == 0 && log_compare(&log->xa_lsn, &log->xa_first) > 0; + err = log_get(log, &log->xa_lsn, &data, DB_PREV)) { + memcpy(&rectype, data.data, sizeof(rectype)); + + /* + * The only record type we care about is an DB_txn_xa_regop. + * If it's a commit, we have to add it to a txnlist. If it's + * a prepare, and we don't have a commit, then we return it. + * We are redoing some of what's in the xa_regop_recovery + * code, but we have to do it here so we can get at the xid + * in the record. + */ + if (rectype != DB_txn_xa_regop && rectype != DB_txn_regop) + continue; + + memcpy(&txnid, (u_int8_t *)data.data + sizeof(rectype), + sizeof(txnid)); + err = __db_txnlist_find(log->xa_info, txnid); + switch (rectype) { + case DB_txn_regop: + if (err == DB_NOTFOUND) + __db_txnlist_add(log->xa_info, txnid); + err = 0; + break; + case DB_txn_xa_regop: + /* + * This transaction is commited, so we needn't read + * the record and do anything. 
+ */ + if (err == 0) + break; + if ((err = + __txn_xa_regop_read(data.data, &argp)) != 0) { + ret = XAER_RMERR; + goto out; + } + + xidp->formatID = argp->formatID; + xidp->gtrid_length = argp->gtrid; + xidp->bqual_length = argp->bqual; + memcpy(xidp->data, argp->xid.data, argp->xid.size); + ret++; + xidp++; + __os_free(argp, sizeof(*argp)); + if (ret == count) + goto done; + break; + } + } + + if (err != 0 && err != DB_NOTFOUND) + goto out; + +done: if (LF_ISSET(TMENDRSCAN)) { + ZERO_LSN(log->xa_lsn); + ZERO_LSN(log->xa_first); + +out: __db_txnlist_end(log->xa_info); + log->xa_info = NULL; + } + return (ret); + +err3: (void)__db_unmap_rmid(rmid); +err2: (void)db_appexit(env); +err1: __os_free(env, sizeof(DB_ENV)); + return (XAER_RMERR); +} + +/* + * __db_xa_rollback + * Abort an XA transaction. + */ +static int +__db_xa_rollback(xid, rmid, flags) + XID *xid; + int rmid; + long flags; +{ + DB_ENV *env; + TXN_DETAIL *td; + size_t off; + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); + if (flags != TMNOFLAGS) + return (XAER_INVAL); + + if (__db_rmid_to_env(rmid, &env, 1) != 0) + return (XAER_PROTO); + + if (__db_xid_to_txn(env, xid, &off) != 0) + return (XAER_NOTA); + + td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off); + + if (td->xa_status == TXN_XA_DEADLOCKED) + return (XA_RBDEADLOCK); + + if (td->xa_status == TXN_XA_ABORTED) + return (XA_RBOTHER); + + if (LF_SET(TMONEPHASE) && + td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED) + return (XAER_PROTO); + + if (!LF_SET(TMONEPHASE) && td->xa_status != TXN_XA_PREPARED) + return (XAER_PROTO); + + /* Now, fill in the global transaction structure. */ + __xa_txn_init(env, td, off); + if (txn_abort(env->xa_txn) != 0) + return (XAER_RMERR); + + /* No fatal value that would require an XAER_RMFAIL. */ + __xa_txn_end(env); + return (XA_OK); +} + +/* + * __db_xa_forget -- + * Forget about an XID for a transaction that was heuristically + * completed. 
Since we do not heuristically complete anything, I + * don't think we have to do anything here, but we should make sure + * that we reclaim the slots in the txnid table. + */ +static int +__db_xa_forget(xid, rmid, flags) + XID *xid; + int rmid; + long flags; +{ + DB_ENV *env; + size_t off; + + if (LF_ISSET(TMASYNC)) + return (XAER_ASYNC); + if (flags != TMNOFLAGS) + return (XAER_INVAL); + + if (__db_rmid_to_env(rmid, &env, 1) != 0) + return (XAER_PROTO); + + /* + * If mapping is gone, then we're done. + */ + if (__db_xid_to_txn(env, xid, &off) != 0) + return (XA_OK); + + __db_unmap_xid(env, xid, off); + + /* No fatal value that would require an XAER_RMFAIL. */ + return (XA_OK); +} + +/* + * __db_xa_complete -- + * Used to wait for asynchronous operations to complete. Since we're + * not doing asynch, this is an invalid operation. + */ +static int +__db_xa_complete(handle, retval, rmid, flags) + int *handle, *retval, rmid; + long flags; +{ + COMPQUIET(handle, NULL); + COMPQUIET(retval, NULL); + COMPQUIET(rmid, 0); + COMPQUIET(flags, 0); + + return (XAER_INVAL); +} + +/* + * __xa_txn_init -- + * Fill in the fields of the local transaction structure given + * the detail transaction structure. + */ +static void +__xa_txn_init(env, td, off) + DB_ENV *env; + TXN_DETAIL *td; + size_t off; +{ + DB_TXN *txn; + + txn = env->xa_txn; + txn->mgrp = env->tx_info; + txn->parent = NULL; + txn->last_lsn = td->last_lsn; + txn->txnid = td->txnid; + txn->off = off; + txn->flags = 0; +} + +/* + * __xa_txn_end -- + * Invalidate a transaction structure that was generated by xa_txn_init. + */ +static void +__xa_txn_end(env) + DB_ENV *env; +{ + DB_TXN *txn; + + txn = env->xa_txn; + if (txn != NULL) + txn->txnid = TXN_INVALID; +} + diff --git a/db2/xa/xa_db.c b/db2/xa/xa_db.c new file mode 100644 index 0000000000..4aaaeff108 --- /dev/null +++ b/db2/xa/xa_db.c @@ -0,0 +1,308 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998 + * Sleepycat Software. 
All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)xa_db.c 10.6 (Sleepycat) 12/19/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <string.h> +#endif + +#undef stat + +#include "db_int.h" +#include "db_page.h" +#include "xa.h" +#include "xa_ext.h" +#include "db_am.h" +#include "db_ext.h" +#include "common_ext.h" + +static int __xa_c_close __P((DBC *)); +static int __xa_c_del __P((DBC *, u_int32_t)); +static int __xa_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __xa_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __xa_close __P((DB *, u_int32_t)); +static int __xa_cursor __P((DB *, DB_TXN *, DBC **, u_int32_t)); +static int __xa_del __P((DB *, DB_TXN *, DBT *, u_int32_t)); +static int __xa_fd __P((DB *, int *)); +static int __xa_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); +static int __xa_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); +static int __xa_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); +static int __xa_sync __P((DB *, u_int32_t)); + +int +db_xa_open(fname, type, flags, mode, dbinfo, dbpp) + const char *fname; + DBTYPE type; + u_int32_t flags; + int mode; + DB_INFO *dbinfo; + DB **dbpp; +{ + DB *dbp, *real_dbp; + DB_ENV *dbenv; + struct __rmname *rp; + int ret; + + /* + * First try to open up the underlying DB. + * + * !!! + * The dbenv argument is taken from the global list of environments. + * When the transaction manager called xa_start() (__db_xa_start()), + * the "current" DB environment was moved to the start of the list. + * However, if we were called in a tpsvrinit function (which is + * entirely plausible), then it's possible that xa_open was called + * (which simply recorded the name of the environment to open) and + * this is the next call into DB. In that case, we still have to + * open the environment. 
+ * + * The way that we know that xa_open and nothing else was called + * is because the nameq is not NULL. + */ + if ((rp = TAILQ_FIRST(&DB_GLOBAL(db_nameq))) != NULL && + (ret = __db_rmid_to_env(rp->rmid, &dbenv, 1)) != 0) + return (ret); + + dbenv = TAILQ_FIRST(&DB_GLOBAL(db_envq)); + if ((ret = db_open(fname, + type, flags, mode, dbenv, dbinfo, &real_dbp)) != 0) + return (ret); + + /* + * Allocate our own DB handle, and copy the exported fields and + * function pointers into it. The internal pointer references + * the real underlying DB handle. + */ + if ((ret = __os_calloc(1, sizeof(DB), &dbp)) != 0) { + (void)real_dbp->close(real_dbp, 0); + return (ret); + } + dbp->type = real_dbp->type; + dbp->byteswapped = real_dbp->byteswapped; + dbp->dbenv = dbenv; + dbp->internal = real_dbp; + TAILQ_INIT(&dbp->active_queue); + TAILQ_INIT(&dbp->free_queue); + dbp->close = __xa_close; + dbp->cursor = __xa_cursor; + dbp->del = __xa_del; + dbp->fd = __xa_fd; + dbp->get = __xa_get; + dbp->join = real_dbp->join; + dbp->put = __xa_put; + dbp->stat = __xa_stat; + dbp->sync = __xa_sync; + + *dbpp = dbp; + return (0); +} + +static int +__xa_close(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB *real_dbp; + DBC *dbc; + int ret; + + /* Close any associated cursors. */ + while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) + (void)dbc->c_close(dbc); + + /* Close the DB handle. */ + real_dbp = (DB *)dbp->internal; + ret = real_dbp->close(real_dbp, flags); + + __os_free(dbp, sizeof(DB)); + return (ret); +} + +static int +__xa_cursor(dbp, txn, dbcp, flags) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; + u_int32_t flags; +{ + DB *real_dbp; + DBC *real_dbc, *dbc; + int ret; + + real_dbp = (DB *)dbp->internal; + txn = dbp->dbenv->xa_txn; + + if ((ret = real_dbp->cursor(real_dbp, txn, &real_dbc, flags)) != 0) + return (ret); + + /* + * Allocate our own DBC handle, and copy the exported fields and + * function pointers into it. 
The internal pointer references + * the real underlying DBC handle. + */ + if ((ret = __os_calloc(1, sizeof(DBC), &dbc)) != 0) { + (void)real_dbc->c_close(real_dbc); + return (ret); + } + dbc->dbp = dbp; + dbc->c_close = __xa_c_close; + dbc->c_del = __xa_c_del; + dbc->c_get = __xa_c_get; + dbc->c_put = __xa_c_put; + dbc->internal = real_dbc; + TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links); + + *dbcp = dbc; + return (0); +} + +static int +__xa_fd(dbp, fdp) + DB *dbp; + int *fdp; +{ + DB *real_dbp; + + COMPQUIET(fdp, NULL); + + real_dbp = (DB *)dbp->internal; + return (__db_eopnotsup(real_dbp->dbenv)); +} + +static int +__xa_del(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DB *real_dbp; + + real_dbp = (DB *)dbp->internal; + txn = dbp->dbenv->xa_txn; + + return (real_dbp->del(real_dbp, txn, key, flags)); +} + +static int +__xa_get(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + DBT *data; + u_int32_t flags; +{ + DB *real_dbp; + + real_dbp = (DB *)dbp->internal; + txn = dbp->dbenv->xa_txn; + + return (real_dbp->get(real_dbp, txn, key, data, flags)); +} + +static int +__xa_put(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + DBT *data; + u_int32_t flags; +{ + DB *real_dbp; + + real_dbp = (DB *)dbp->internal; + txn = dbp->dbenv->xa_txn; + + return (real_dbp->put(real_dbp, txn, key, data, flags)); +} + +static int +__xa_stat(dbp, spp, db_malloc, flags) + DB *dbp; + void *spp; + void *(*db_malloc) __P((size_t)); + u_int32_t flags; +{ + DB *real_dbp; + + real_dbp = (DB *)dbp->internal; + return (real_dbp->stat(real_dbp, spp, db_malloc, flags)); +} + +static int +__xa_sync(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB *real_dbp; + + real_dbp = (DB *)dbp->internal; + return (real_dbp->sync(real_dbp, flags)); +} + +static int +__xa_c_close(dbc) + DBC *dbc; +{ + DBC *real_dbc; + int ret; + + real_dbc = (DBC *)dbc->internal; + + ret = real_dbc->c_close(real_dbc); + + 
TAILQ_REMOVE(&dbc->dbp->active_queue, dbc, links); + __os_free(dbc, sizeof(DBC)); + + return (ret); +} + +static int +__xa_c_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + DBC *real_dbc; + + real_dbc = (DBC *)dbc->internal; + return (real_dbc->c_del(real_dbc, flags)); +} + +static int +__xa_c_get(dbc, key, data, flags) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t flags; +{ + DBC *real_dbc; + + real_dbc = (DBC *)dbc->internal; + return (real_dbc->c_get(real_dbc, key, data, flags)); +} + +static int +__xa_c_put(dbc, key, data, flags) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t flags; +{ + DBC *real_dbc; + + real_dbc = (DBC *)dbc->internal; + return (real_dbc->c_put(real_dbc, key, data, flags)); +} diff --git a/db2/xa/xa_map.c b/db2/xa/xa_map.c new file mode 100644 index 0000000000..d4ebbae22f --- /dev/null +++ b/db2/xa/xa_map.c @@ -0,0 +1,305 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998 + * Sleepycat Software. All rights reserved. + */ + +#include "config.h" + +#ifndef lint +static const char sccsid[] = "@(#)xa_map.c 10.4 (Sleepycat) 10/20/98"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <errno.h> +#include <stdio.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "shqueue.h" +#include "txn.h" + +/* + * This file contains all the mapping information that we need to support + * the DB/XA interface. + */ + +/* + * __db_rmid_to_env + * Return the environment associated with a given XA rmid. 
+ * + * PUBLIC: int __db_rmid_to_env __P((int rmid, DB_ENV **envp, int open_ok)); + */ +int +__db_rmid_to_env(rmid, envp, open_ok) + int rmid; + DB_ENV **envp; + int open_ok; +{ + DB_ENV *env; + char *dbhome; + + env = TAILQ_FIRST(&DB_GLOBAL(db_envq)); + if (env != NULL && env->xa_rmid == rmid) { + *envp = env; + return (0); + } + + /* + * When we map an rmid, move that environment to be the first one in + * the list of environments, so we pass the correct environment from + * the upcoming db_xa_open() call into db_open(). + */ + for (; env != NULL; env = TAILQ_NEXT(env, links)) + if (env->xa_rmid == rmid) { + TAILQ_REMOVE(&DB_GLOBAL(db_envq), env, links); + TAILQ_INSERT_HEAD(&DB_GLOBAL(db_envq), env, links); + *envp = env; + return (0); + } + + /* + * We have not found the rmid on the environment list. If we + * are allowed to do an open, search for the rmid on the name + * list and, if we find it, then open it. + */ + if (!open_ok) + return (1); + + if (__db_rmid_to_name(rmid, &dbhome) != 0) + return (1); +#undef XA_FLAGS +#define XA_FLAGS \ + DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN + + if (__os_calloc(1, sizeof(DB_ENV), &env) != 0) + return (1); + + if (db_appinit(dbhome, NULL, env, XA_FLAGS) != 0) + goto err; + + if (__db_map_rmid(rmid, env) != 0) + goto err1; + + __db_unmap_rmid_name(rmid); + + *envp = env; + return (0); + +err1: (void)db_appexit(env); +err: __os_free(env, sizeof(DB_ENV)); + return (1); +} + +/* + * __db_xid_to_txn + * Return the txn that corresponds to this XID. + * + * PUBLIC: int __db_xid_to_txn __P((DB_ENV *, XID *, size_t *)); + */ +int +__db_xid_to_txn(dbenv, xid, offp) + DB_ENV *dbenv; + XID *xid; + size_t *offp; +{ + DB_TXNREGION *tmr; + struct __txn_detail *td; + + /* + * Search the internal active transaction table to find the + * matching xid. If this is a performance hit, then we + * can create a hash table, but I doubt it's worth it. 
+ */ + tmr = dbenv->tx_info->region; + + LOCK_TXNREGION(dbenv->tx_info); + for (td = SH_TAILQ_FIRST(&tmr->active_txn, __txn_detail); + td != NULL; + td = SH_TAILQ_NEXT(td, links, __txn_detail)) + if (memcmp(xid->data, td->xid, XIDDATASIZE) == 0) + break; + UNLOCK_TXNREGION(dbenv->tx_info); + + if (td == NULL) + return (EINVAL); + + *offp = (u_int8_t *)td - (u_int8_t *)tmr; + return (0); +} + +/* + * __db_map_rmid + * Create a mapping between the specified rmid and environment. + * + * PUBLIC: int __db_map_rmid __P((int, DB_ENV *)); + */ +int +__db_map_rmid(rmid, env) + int rmid; + DB_ENV *env; +{ + if (__os_calloc(1, sizeof(DB_TXN), &env->xa_txn) != 0) + return (XAER_RMERR); + env->xa_txn->txnid = TXN_INVALID; + env->xa_rmid = rmid; + TAILQ_INSERT_HEAD(&DB_GLOBAL(db_envq), env, links); + return (XA_OK); +} + +/* + * __db_unmap_rmid + * Destroy the mapping for the given rmid. + * + * PUBLIC: int __db_unmap_rmid __P((int)); + */ +int +__db_unmap_rmid(rmid) + int rmid; +{ + DB_ENV *e; + + for (e = TAILQ_FIRST(&DB_GLOBAL(db_envq)); + e->xa_rmid != rmid; + e = TAILQ_NEXT(e, links)); + + if (e == NULL) + return (EINVAL); + + TAILQ_REMOVE(&DB_GLOBAL(db_envq), e, links); + if (e->xa_txn != NULL) + __os_free(e->xa_txn, sizeof(DB_TXN)); + return (0); +} + +/* + * __db_map_xid + * Create a mapping between this XID and the transaction at + * "off" in the shared region. + * + * PUBLIC: int __db_map_xid __P((DB_ENV *, XID *, size_t)); + */ +int +__db_map_xid(env, xid, off) + DB_ENV *env; + XID *xid; + size_t off; +{ + DB_TXNMGR *tm; + TXN_DETAIL *td; + + tm = env->tx_info; + td = (TXN_DETAIL *)((u_int8_t *)tm->region + off); + + LOCK_TXNREGION(tm); + memcpy(td->xid, xid->data, XIDDATASIZE); + UNLOCK_TXNREGION(tm); + + return (0); +} + +/* + * __db_unmap_xid + * Destroy the mapping for the specified XID. 
+ * + * PUBLIC: void __db_unmap_xid __P((DB_ENV *, XID *, size_t)); + */ + +void +__db_unmap_xid(env, xid, off) + DB_ENV *env; + XID *xid; + size_t off; +{ + TXN_DETAIL *td; + + COMPQUIET(xid, NULL); + + td = (TXN_DETAIL *)((u_int8_t *)env->tx_info->region + off); + memset(td->xid, 0, sizeof(td->xid)); +} + +/* + * __db_map_rmid_name -- + * Create a mapping from an rmid to a name (the xa_info argument). + * We use this during create and then at some later point when we are + * trying to map an rmid, we might indicate that it's OK to do an open + * in which case, we'll get the xa_info parameter from here and then + * free it up. + * + * PUBLIC: int __db_map_rmid_name __P((int, char *)); + */ + +int +__db_map_rmid_name(rmid, dbhome) + int rmid; + char *dbhome; +{ + struct __rmname *entry; + int ret; + + if ((ret = __os_malloc(sizeof(struct __rmname), NULL, &entry)) != 0) + return (ret); + + if ((ret = __os_strdup(dbhome, &entry->dbhome)) != 0) { + __os_free(entry, sizeof(struct __rmname)); + return (ret); + } + + entry->rmid = rmid; + + TAILQ_INSERT_HEAD(&DB_GLOBAL(db_nameq), entry, links); + return (0); +} + +/* + * __db_rmid_to_name -- + * Given an rmid, return the name of the home directory for that + * rmid. + * + * PUBLIC: int __db_rmid_to_name __P((int, char **)); + */ +int +__db_rmid_to_name(rmid, dbhomep) + int rmid; + char **dbhomep; +{ + struct __rmname *np; + + for (np = TAILQ_FIRST(&DB_GLOBAL(db_nameq)); np != NULL; + np = TAILQ_NEXT(np, links)) { + if (np->rmid == rmid) { + *dbhomep = np->dbhome; + return (0); + } + } + return (1); +} + +/* + * __db_unmap_rmid_name -- + * Given an rmid, remove its entry from the name list. 
+ * + * PUBLIC: void __db_unmap_rmid_name __P((int)); + */ +void +__db_unmap_rmid_name(rmid) + int rmid; +{ + struct __rmname *np, *next; + + for (np = TAILQ_FIRST(&DB_GLOBAL(db_nameq)); np != NULL; np = next) { + next = TAILQ_NEXT(np, links); + if (np->rmid == rmid) { + TAILQ_REMOVE(&DB_GLOBAL(db_nameq), np, links); + __os_freestr(np->dbhome); + __os_free(np, sizeof(struct __rmname)); + return; + } + } + return; +} |