aboutsummaryrefslogtreecommitdiff
path: root/db2/mp
diff options
context:
space:
mode:
Diffstat (limited to 'db2/mp')
-rw-r--r--db2/mp/mp_bh.c437
-rw-r--r--db2/mp/mp_fget.c359
-rw-r--r--db2/mp/mp_fopen.c437
-rw-r--r--db2/mp/mp_fput.c140
-rw-r--r--db2/mp/mp_fset.c72
-rw-r--r--db2/mp/mp_open.c176
-rw-r--r--db2/mp/mp_pr.c313
-rw-r--r--db2/mp/mp_region.c340
-rw-r--r--db2/mp/mp_sync.c205
9 files changed, 2479 insertions, 0 deletions
diff --git a/db2/mp/mp_bh.c b/db2/mp/mp_bh.c
new file mode 100644
index 0000000000..e1b68ce450
--- /dev/null
+++ b/db2/mp/mp_bh.c
@@ -0,0 +1,437 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_bh.c 10.12 (Sleepycat) 8/20/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * __memp_bhwrite --
+ *	Write the page associated with a given bucket header.
+ *
+ *	Locates (or opens) a per-process DB_MPOOLFILE handle for the shared
+ *	MPOOLFILE and hands the buffer to __memp_pgwrite.  If no handle can
+ *	be obtained the function returns 0 with *restartp and *wrotep left
+ *	at 0, i.e. nothing was written.
+ *
+ * PUBLIC: int __memp_bhwrite
+ * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+ */
+int
+__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	BH *bhp;
+	int *restartp, *wrotep;
+{
+	DBT cookie;
+	DB_MPOOLFILE *handle;
+	DB_MPREG *reg;
+
+	/* Both result flags are optional; default them to "no". */
+	if (wrotep != NULL)
+		*wrotep = 0;
+	if (restartp != NULL)
+		*restartp = 0;
+
+	/*
+	 * Look through this process' open handles for one that refers to
+	 * the same shared file.
+	 */
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	handle = TAILQ_FIRST(&dbmp->dbmfq);
+	while (handle != NULL && handle->mfp != mfp)
+		handle = TAILQ_NEXT(handle, q);
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	if (handle == NULL) {
+		/*
+		 * The file was never opened by this process.  A file that
+		 * needs input/output processing can only be written if this
+		 * process registered conversion routines for its type; if it
+		 * did not, give up quietly.
+		 */
+		if (mfp->ftype != 0) {
+			LOCKHANDLE(dbmp, &dbmp->mutex);
+			reg = LIST_FIRST(&dbmp->dbregq);
+			while (reg != NULL && reg->ftype != mfp->ftype)
+				reg = LIST_NEXT(reg, q);
+			UNLOCKHANDLE(dbmp, &dbmp->mutex);
+			if (reg == NULL)
+				return (0);
+		}
+
+		/*
+		 * Open the file ourselves.  Any failure is ignored on the
+		 * assumption that it is a permissions problem.
+		 */
+		cookie.data = ADDR(dbmp, mfp->pgcookie_off);
+		cookie.size = mfp->pgcookie_len;
+		if (__memp_fopen(dbmp, ADDR(dbmp, mfp->path_off),
+		    mfp->ftype, 0, 0, mfp->stat.st_pagesize,
+		    mfp->lsn_off, &cookie,
+		    ADDR(dbmp, mfp->fileid_off), 0, &handle) != 0)
+			return (0);
+	}
+
+	return (__memp_pgwrite(handle, bhp, restartp, wrotep));
+}
+
+/*
+ * __memp_pgread --
+ * Read a page from a file.
+ *
+ * Fills bhp->buf with page bhp->pgno of the file behind dbmfp. If the
+ * file has no descriptor yet, the seek fails, or the read is short, the
+ * missing bytes are zero-filled when can_create is non-zero; otherwise
+ * an error is returned. For files with a non-zero ftype the pgin
+ * conversion is run through __memp_pg before the buffer is marked valid.
+ *
+ * The region lock is released on entry and reacquired before returning
+ * on every path, so the caller presumably holds it when calling.
+ *
+ * Returns 0 on success, EINVAL when the page doesn't exist and can_create
+ * is zero, or the error returned by the seek, read or pgin call.
+ *
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pgread(dbmfp, bhp, can_create)
+ DB_MPOOLFILE *dbmfp;
+ BH *bhp;
+ int can_create;
+{
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ size_t pagesize;
+ ssize_t nr;
+ int ret;
+
+ dbmp = dbmfp->dbmp;
+ mfp = dbmfp->mfp;
+ pagesize = mfp->stat.st_pagesize;
+
+ /*
+ * Flag the buffer as locked with not-yet-valid contents and take the
+ * buffer lock before giving up the region lock.
+ */
+ F_SET(bhp, BH_LOCKED | BH_TRASH);
+ LOCKBUFFER(dbmp, bhp);
+ UNLOCKREGION(dbmp);
+
+ /*
+ * Temporary files may not yet have been created.
+ *
+ * Seek to the page location.
+ */
+ ret = 0;
+ LOCKHANDLE(dbmp, &dbmfp->mutex);
+ /*
+ * When fd == -1 the seek is skipped and ret is still 0 here; the
+ * !can_create branch below turns that case into EINVAL.
+ */
+ if (dbmfp->fd == -1 || (ret =
+ __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) {
+ if (!can_create) {
+ if (dbmfp->fd == -1)
+ ret = EINVAL;
+ UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+ __db_err(dbmp->dbenv,
+ "%s: page %lu doesn't exist, create flag not set",
+ dbmfp->path, (u_long)bhp->pgno);
+ goto err;
+ }
+ UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+
+ /* Clear any uninitialized data. */
+ memset(bhp->buf, 0, pagesize);
+ goto pgin;
+ }
+
+ /*
+ * Read the page; short reads are treated like creates, although
+ * any valid data is preserved.
+ */
+ ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr);
+ UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * A full page was read: clear can_create so the statistics below
+ * count this as a page-in rather than a page-create.
+ */
+ if (nr == (ssize_t)pagesize)
+ can_create = 0;
+ else {
+ if (!can_create) {
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Clear any uninitialized data. */
+ memset(bhp->buf + nr, 0, pagesize - nr);
+ }
+
+ /* Call any pgin function. */
+pgin: ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
+
+ /* Reacquire the region lock. */
+ LOCKREGION(dbmp);
+
+ /* If the pgin function succeeded, the data is now valid. */
+ if (ret == 0)
+ F_CLR(bhp, BH_TRASH);
+
+ /* Update the statistics. */
+ if (can_create) {
+ ++dbmp->mp->stat.st_page_create;
+ ++mfp->stat.st_page_create;
+ } else {
+ ++dbmp->mp->stat.st_page_in;
+ ++mfp->stat.st_page_in;
+ }
+
+ /*
+ * Error exits jump into this otherwise-dead block only to reacquire
+ * the region lock; the success path above already holds it. BH_TRASH
+ * stays set on the error exits.
+ */
+ if (0) {
+err: LOCKREGION(dbmp);
+ }
+
+ /* Release the buffer. */
+ F_CLR(bhp, BH_LOCKED);
+ UNLOCKBUFFER(dbmp, bhp);
+
+ return (ret);
+}
+
+/*
+ * __memp_pgwrite --
+ *	Write a page to a file.
+ *
+ *	Writes bhp->buf to page bhp->pgno of the file behind dbmfp, after
+ *	flushing the log up to the page's LSN (when logging is configured)
+ *	and running any pgout conversion.  The region lock is released on
+ *	entry and reacquired before returning, so the caller presumably
+ *	holds it when calling.
+ *
+ *	restartp, wrotep: optional (may be NULL).  *restartp is set to 1
+ *	once the region lock has been dropped; *wrotep is set to 1 only
+ *	after the whole page has been written.
+ *
+ *	Returns 0 on success, EIO for a short write, or the error from the
+ *	log flush, pgout, temporary-file creation, seek or write step.
+ *
+ * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));
+ */
+int
+__memp_pgwrite(dbmfp, bhp, restartp, wrotep)
+	DB_MPOOLFILE *dbmfp;
+	BH *bhp;
+	int *restartp, *wrotep;
+{
+	DB_ENV *dbenv;
+	DB_LOG *lg_info;
+	DB_LSN lsn;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t pagesize;
+	ssize_t nw;
+	int callpgin, ret;
+	const char *fail;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+	mfp = dbmfp->mfp;
+
+	if (restartp != NULL)
+		*restartp = 0;
+	if (wrotep != NULL)
+		*wrotep = 0;
+	callpgin = 0;
+	pagesize = mfp->stat.st_pagesize;
+
+	/*
+	 * Initialize the failing-operation name up front.  The compiler
+	 * can't see that it is always set before the syserr label is
+	 * reached, and assigning it on the error path itself would throw
+	 * away the "seek"/"write" value the message needs.
+	 */
+	fail = NULL;
+
+	F_SET(bhp, BH_LOCKED);
+	LOCKBUFFER(dbmp, bhp);
+	UNLOCKREGION(dbmp);
+
+	if (restartp != NULL)
+		*restartp = 1;
+
+	/* Copy the LSN off the page if we're going to need it. */
+	lg_info = dbenv->lg_info;
+	if (lg_info != NULL || F_ISSET(bhp, BH_WRITE))
+		memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
+
+	/* Ensure the appropriate log records are on disk. */
+	if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0)
+		goto err;
+
+	/*
+	 * Call any pgout function.  We set the callpgin flag so that on
+	 * error we flag that the contents of the buffer may be trash.
+	 */
+	if (mfp->ftype == 0)
+		ret = 0;
+	else {
+		callpgin = 1;
+		if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
+			goto err;
+	}
+
+	/*
+	 * Temporary files may not yet have been created.
+	 *
+	 * NOTE(review): if __db_appname returns 0 but leaves fd at -1, the
+	 * err exit below returns ret == 0 -- confirm whether that can happen.
+	 */
+	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, DB_APP_TMP,
+	    NULL, NULL, &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) {
+		UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+		__db_err(dbenv, "unable to create temporary backing file");
+		goto err;
+	}
+
+	/* Write the page out, remembering which step failed. */
+	if ((ret =
+	    __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0)
+		fail = "seek";
+	else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0)
+		fail = "write";
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (ret != 0)
+		goto syserr;
+
+	/* A short write is reported as an I/O error. */
+	if (nw != (ssize_t)pagesize) {
+		ret = EIO;
+		fail = "write";
+		goto syserr;
+	}
+
+	if (wrotep != NULL)
+		*wrotep = 1;
+
+	/* Reacquire the region lock. */
+	LOCKREGION(dbmp);
+
+	/* Clean up the flags based on a successful write. */
+	F_SET(bhp, BH_CALLPGIN);
+	F_CLR(bhp, BH_DIRTY | BH_LOCKED);
+	UNLOCKBUFFER(dbmp, bhp);
+
+	/*
+	 * If we wrote a buffer which a checkpoint is waiting for, update
+	 * the count of pending buffers (both in the mpool as a whole and
+	 * for this file).  If the count for this file goes to zero, flush
+	 * the writes.
+	 *
+	 * XXX:
+	 * We ignore errors from the sync -- it makes no sense to return an
+	 * error to the calling process, so set a flag causing the sync to
+	 * be retried later.
+	 *
+	 * If the buffer we wrote has a LSN larger than the current largest
+	 * we've written for this checkpoint, update the saved value.
+	 */
+	mp = dbmp->mp;
+	if (F_ISSET(bhp, BH_WRITE)) {
+		if (log_compare(&lsn, &mp->lsn) > 0)
+			mp->lsn = lsn;
+		F_CLR(bhp, BH_WRITE);
+
+		--mp->lsn_cnt;
+		if (--mfp->lsn_cnt == 0) {
+			/*
+			 * Don't lock -- there are no atomicity issues for
+			 * fsync(2).
+			 */
+			if (__db_fsync(dbmfp->fd) != 0)
+				F_SET(mp, MP_LSN_RETRY);
+		}
+	}
+
+	/* Update I/O statistics. */
+	++mp->stat.st_page_out;
+	++mfp->stat.st_page_out;
+
+	return (0);
+
+syserr:	__db_err(dbenv,
+	    "%s: %s failed for page %lu", dbmfp->path, fail, (u_long)bhp->pgno);
+
+	/*
+	 * Failure exit: drop the buffer lock, retake the region lock, and
+	 * if pgout ran, flag the buffer so pgin is applied before reuse.
+	 */
+err:	UNLOCKBUFFER(dbmp, bhp);
+	LOCKREGION(dbmp);
+	if (callpgin)
+		F_SET(bhp, BH_CALLPGIN);
+	F_CLR(bhp, BH_LOCKED);
+	return (ret);
+}
+
+/*
+ * __memp_pg --
+ *	Call the pgin/pgout routine.
+ *
+ *	Finds the conversion routines registered for the file's ftype and
+ *	calls pgin (is_pgin non-zero) or pgout (is_pgin zero) on the
+ *	buffer, passing the file's page cookie if it has one.  A missing
+ *	registration or a NULL routine is not an error.
+ *
+ *	Returns 0 on success, or the non-zero value returned by the
+ *	conversion routine.
+ *
+ * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pg(dbmfp, bhp, is_pgin)
+	DB_MPOOLFILE *dbmfp;
+	BH *bhp;
+	int is_pgin;
+{
+	DBT dbt, *dbtp;
+	DB_MPOOL *dbmp;
+	DB_MPREG *mpreg;
+	MPOOLFILE *mfp;
+	int ftype, ret;
+
+	dbmp = dbmfp->dbmp;
+	mfp = dbmfp->mfp;
+
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+
+	ftype = mfp->ftype;
+	for (mpreg = LIST_FIRST(&dbmp->dbregq);
+	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) {
+		if (ftype != mpreg->ftype)
+			continue;
+		if (mfp->pgcookie_len == 0)
+			dbtp = NULL;
+		else {
+			dbt.size = mfp->pgcookie_len;
+			dbt.data = ADDR(dbmp, mfp->pgcookie_off);
+			dbtp = &dbt;
+		}
+
+		/*
+		 * The handle mutex is dropped before the callback runs.
+		 * From here on no exit path may unlock it again.
+		 */
+		UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+		if (is_pgin) {
+			if (mpreg->pgin != NULL && (ret =
+			    mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0)
+				goto err;
+		} else
+			if (mpreg->pgout != NULL && (ret =
+			    mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0)
+				goto err;
+		break;
+	}
+
+	/* The loop ran off the end without a match: still locked. */
+	if (mpreg == NULL)
+		UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	return (0);
+
+	/*
+	 * Only reached after the mutex was released above, so just report
+	 * the failure; unlocking here would release the mutex twice.
+	 */
+err:	__db_err(dbmp->dbenv, "%s: %s failed for page %lu",
+	    dbmfp->path, is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
+	return (ret);
+}
+
+/*
+ * __memp_bhfree --
+ *	Free a bucket header and its referenced data.
+ *
+ *	Unlinks bhp from its hash bucket and from the LRU chain.  The
+ *	memory is handed back to the shared allocator only when free_mem
+ *	is non-zero; otherwise the caller keeps ownership of the header.
+ *
+ * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
+ */
+void
+__memp_bhfree(dbmp, mfp, bhp, free_mem)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	BH *bhp;
+	int free_mem;
+{
+	size_t bucket;
+
+	/* Unhook the header from the hash chain it lives on. */
+	bucket = BUCKET(dbmp->mp, OFFSET(dbmp, mfp), bhp->pgno);
+	SH_TAILQ_REMOVE(&dbmp->htab[bucket], bhp, mq, __bh);
+
+	/* Unhook the header from the LRU chain. */
+	SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
+
+	/* The caller is about to reuse the header: leave the memory alone. */
+	if (!free_mem)
+		return;
+
+	/* Otherwise release the header and its data for real. */
+	__db_shalloc_free(dbmp->addr, bhp);
+}
diff --git a/db2/mp/mp_fget.c b/db2/mp/mp_fget.c
new file mode 100644
index 0000000000..418802a3b9
--- /dev/null
+++ b/db2/mp/mp_fget.c
@@ -0,0 +1,359 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fget.c 10.22 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+int __sleep_on_every_page_get; /* XXX: thread debugging option. */
+
+/*
+ * memp_fget --
+ *	Get a page from the file.
+ *
+ *	dbmfp:    open memory pool file handle.
+ *	pgnoaddr: page number wanted; overwritten with the page number
+ *		  actually returned for DB_MPOOL_LAST and DB_MPOOL_NEW.
+ *	flags:    0, or exactly one of DB_MPOOL_CREATE, DB_MPOOL_LAST,
+ *		  DB_MPOOL_NEW.
+ *	addrp:    address of a pointer that receives the page address;
+ *		  set to NULL when an error is returned after the region
+ *		  lock was taken.
+ *
+ *	Returns 0 on success, EINVAL for a non-existent page, EAGAIN when
+ *	the buffer's reference count is saturated, or an error from flag
+ *	checking, allocation, stat, read or pgin processing.
+ */
+int
+memp_fget(dbmfp, pgnoaddr, flags, addrp)
+	DB_MPOOLFILE *dbmfp;
+	db_pgno_t *pgnoaddr;
+	u_long flags;
+	void *addrp;
+{
+	BH *bhp, *tbhp;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	db_pgno_t lastpgno;
+	size_t bucket, mf_offset;
+	off_t size;
+	u_long cnt;
+	int b_incr, b_inserted, readonly_alloc, ret;
+	void *addr;
+
+	dbmp = dbmfp->dbmp;
+
+	/*
+	 * Validate arguments.
+	 *
+	 * !!!
+	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
+	 * files here, and create non-existent pages in readonly files if the
+	 * flags are set, later.  The reason is that the hash access method
+	 * wants to get empty pages that don't really exist in readonly files.
+	 * The only alternative is for hash to write the last "bucket" all the
+	 * time, which we don't want to do because one of our big goals in life
+	 * is to keep database files small.  It's sleazy as hell, but we catch
+	 * any attempt to actually write the file in memp_fput().
+	 */
+#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
+	if (flags != 0) {
+		if ((ret =
+		    __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0)
+			return (ret);
+
+		/* The flags are mutually exclusive. */
+		switch (flags) {
+		case DB_MPOOL_CREATE:
+		case DB_MPOOL_LAST:
+		case DB_MPOOL_NEW:
+		case 0:
+			break;
+		default:
+			return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
+		}
+	}
+
+#ifdef DEBUG
+	/*
+	 * XXX
+	 * We want to switch threads as often as possible.  Sleep every time
+	 * we get a new page to make it more likely.
+	 */
+	if (__sleep_on_every_page_get && (dbmp->dbenv == NULL ||
+	    dbmp->dbenv->db_yield == NULL || dbmp->dbenv->db_yield() != 0))
+		__db_sleep(0, 1);
+#endif
+
+	mp = dbmp->mp;
+	mfp = dbmfp->mfp;
+	mf_offset = OFFSET(dbmp, mfp);
+	addr = NULL;
+	bhp = NULL;
+	b_incr = b_inserted = readonly_alloc = ret = 0;
+
+	LOCKREGION(dbmp);
+
+	/*
+	 * If mmap'ing the file, just return a pointer.  However, if another
+	 * process has opened the file for writing since we mmap'd it, start
+	 * playing the game by their rules, i.e. everything goes through the
+	 * cache.  All pages previously returned should be safe, as long as
+	 * a locking protocol was observed.
+	 *
+	 * XXX
+	 * We don't discard the map because we don't know when all of the
+	 * pages will have been discarded from the process' address space.
+	 * It would be possible to do so by reference counting the open
+	 * pages from the mmap, but it's unclear to me that it's worth it.
+	 */
+	if (dbmfp->addr != NULL && dbmfp->mfp->can_mmap) {
+		lastpgno = dbmfp->len == 0 ?
+		    0 : (dbmfp->len - 1) / mfp->stat.st_pagesize;
+		if (LF_ISSET(DB_MPOOL_LAST))
+			*pgnoaddr = lastpgno;
+		else {
+			/*
+			 * !!!
+			 * Allocate a page that can never really exist.  See
+			 * the comment above about non-existent pages and the
+			 * hash access method.
+			 */
+			if (LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))
+				readonly_alloc = 1;
+			else if (*pgnoaddr > lastpgno) {
+				__db_err(dbmp->dbenv,
+				    "%s: page %lu doesn't exist",
+				    dbmfp->path, (u_long)*pgnoaddr);
+				ret = EINVAL;
+				goto err;
+			}
+		}
+		if (!readonly_alloc) {
+			addr = ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+
+			++mp->stat.st_map;
+			++mfp->stat.st_map;
+
+			goto mapret;
+		}
+	}
+
+	/*
+	 * If requesting the last page or a new page, find the last page.  The
+	 * tricky thing is that the user may have created a page already that's
+	 * after any page that exists in the file.
+	 */
+	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
+		/*
+		 * Temporary files may not yet have been created.
+		 *
+		 * Don't lock -- there are no atomicity issues for stat(2).
+		 */
+		if (dbmfp->fd == -1)
+			size = 0;
+		else if ((ret = __db_stat(dbmp->dbenv,
+		    dbmfp->path, dbmfp->fd, &size, NULL)) != 0)
+			goto err;
+
+		*pgnoaddr = size == 0 ? 0 : (size - 1) / mfp->stat.st_pagesize;
+
+		/*
+		 * Walk the list of BH's, looking for later pages.  Save the
+		 * pointer if a later page is found so that we don't have to
+		 * search the list twice.
+		 *
+		 * If requesting a new page, return the page one after the last
+		 * page -- which we'll have to create.
+		 */
+		for (tbhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+		    tbhp != NULL; tbhp = SH_TAILQ_NEXT(tbhp, q, __bh))
+			if (tbhp->pgno >= *pgnoaddr &&
+			    tbhp->mf_offset == mf_offset) {
+				bhp = tbhp;
+				*pgnoaddr = bhp->pgno;
+			}
+		if (LF_ISSET(DB_MPOOL_NEW))
+			++*pgnoaddr;
+	}
+
+	/* If we already found the right buffer, return it. */
+	if (LF_ISSET(DB_MPOOL_LAST) && bhp != NULL) {
+		addr = bhp->buf;
+		goto found;
+	}
+
+	/* If we haven't checked the BH list yet, do the search. */
+	if (!LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
+		++mp->stat.st_hash_searches;
+		bucket = BUCKET(mp, mf_offset, *pgnoaddr);
+		for (cnt = 0,
+		    bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh)) {
+			++cnt;
+			if (bhp->pgno == *pgnoaddr &&
+			    bhp->mf_offset == mf_offset) {
+				addr = bhp->buf;
+				if (cnt > mp->stat.st_hash_longest)
+					mp->stat.st_hash_longest = cnt;
+				mp->stat.st_hash_examined += cnt;
+				goto found;
+			}
+		}
+		if (cnt > mp->stat.st_hash_longest)
+			mp->stat.st_hash_longest = cnt;
+		mp->stat.st_hash_examined += cnt;
+	}
+
+	/*
+	 * Allocate a new buffer header and data space, and mark the contents
+	 * as useless.
+	 */
+	if ((ret = __memp_ralloc(dbmp, sizeof(BH) -
+	    sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0)
+		goto err;
+	addr = bhp->buf;
+#ifdef DEBUG
+	if ((ALIGNTYPE)addr & (sizeof(size_t) - 1)) {
+		__db_err(dbmp->dbenv,
+		    "Internal error: BH data NOT size_t aligned.");
+		abort();
+	}
+#endif
+	memset(bhp, 0, sizeof(BH));
+	LOCKINIT(dbmp, &bhp->mutex);
+
+	/*
+	 * Prepend the bucket header to the head of the appropriate MPOOL
+	 * bucket hash list.  Append the bucket header to the tail of the
+	 * MPOOL LRU chain.
+	 *
+	 * We have to do this before we read in the page so we can discard
+	 * our region lock without screwing up the world.
+	 */
+	bucket = BUCKET(mp, mf_offset, *pgnoaddr);
+	SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, mq, __bh);
+	SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
+	b_inserted = 1;
+
+	/* Set the page number, and associated MPOOLFILE. */
+	bhp->mf_offset = mf_offset;
+	bhp->pgno = *pgnoaddr;
+
+	/*
+	 * If we know we created the page, zero it out and continue.
+	 *
+	 * !!!
+	 * Note: DB_MPOOL_NEW deliberately doesn't call the pgin function.
+	 * If DB_MPOOL_CREATE is used, then the application's pgin function
+	 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
+	 * it can detect all of its page creates, and not bother.
+	 *
+	 * Otherwise, read the page into memory, optionally creating it if
+	 * DB_MPOOL_CREATE is set.
+	 *
+	 * Increment the reference count for created buffers, but importantly,
+	 * increment the reference count for buffers we're about to read so
+	 * that the buffer can't move.
+	 */
+	++bhp->ref;
+	b_incr = 1;
+
+	if (LF_ISSET(DB_MPOOL_NEW))
+		memset(addr, 0, mfp->stat.st_pagesize);
+	else {
+		/*
+		 * It's possible for the read function to fail, which means
+		 * that we fail as well.
+		 */
+reread:		if ((ret = __memp_pgread(dbmfp,
+		    bhp, LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))) != 0)
+			goto err;
+
+		/*
+		 * !!!
+		 * The __memp_pgread call discarded and reacquired the region
+		 * lock.  Because the buffer reference count was incremented
+		 * before the region lock was discarded the buffer didn't move.
+		 */
+		++mp->stat.st_cache_miss;
+		++mfp->stat.st_cache_miss;
+	}
+
+	if (0) {
+found:		/* Increment the reference count. */
+		if (bhp->ref == UINT16_T_MAX) {
+			/*
+			 * The page number is cast to match the %lu conversion,
+			 * as every other message in this file does.
+			 */
+			__db_err(dbmp->dbenv,
+			    "%s: too many references to page %lu",
+			    dbmfp->path, (u_long)bhp->pgno);
+			ret = EAGAIN;
+			goto err;
+		}
+		++bhp->ref;
+		b_incr = 1;
+
+		/*
+		 * Any found buffer might be trouble.
+		 *
+		 * BH_LOCKED --
+		 * I/O in progress, wait for it to finish.  Because the buffer
+		 * reference count was incremented before the region lock was
+		 * discarded we know the buffer didn't move.
+		 */
+		if (F_ISSET(bhp, BH_LOCKED)) {
+			UNLOCKREGION(dbmp);
+			LOCKBUFFER(dbmp, bhp);
+			/* Waiting for I/O to finish... */
+			UNLOCKBUFFER(dbmp, bhp);
+			LOCKREGION(dbmp);
+		}
+
+		/*
+		 * BH_TRASH --
+		 * The buffer is garbage.
+		 */
+		if (F_ISSET(bhp, BH_TRASH))
+			goto reread;
+
+		/*
+		 * BH_CALLPGIN --
+		 * The buffer was written, and the contents need to be
+		 * converted again.
+		 */
+		if (F_ISSET(bhp, BH_CALLPGIN)) {
+			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+				goto err;
+			F_CLR(bhp, BH_CALLPGIN);
+		}
+
+		++mp->stat.st_cache_hit;
+		++mfp->stat.st_cache_hit;
+	}
+
+	/* Every successful return pins one more page on this handle. */
+mapret:	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	++dbmfp->pinref;
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+
+	if (0) {
+err:		/*
+		 * If no other process is already waiting on a created buffer,
+		 * go ahead and discard it, it's not useful.
+		 */
+		if (b_incr)
+			--bhp->ref;
+		if (b_inserted && bhp->ref == 0)
+			__memp_bhfree(dbmp, mfp, bhp, 1);
+
+		/*
+		 * Never hand back a page address together with an error;
+		 * the buffer it pointed at may just have been freed.
+		 */
+		addr = NULL;
+	}
+
+	UNLOCKREGION(dbmp);
+
+	*(void **)addrp = addr;
+	return (ret);
+}
diff --git a/db2/mp/mp_fopen.c b/db2/mp/mp_fopen.c
new file mode 100644
index 0000000000..7703847b73
--- /dev/null
+++ b/db2/mp/mp_fopen.c
@@ -0,0 +1,437 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fopen.c 10.24 (Sleepycat) 8/20/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *));
+static int __memp_mf_open __P((DB_MPOOL *, DB_MPOOLFILE *,
+ int, int, size_t, int, DBT *, u_int8_t *, int, MPOOLFILE **));
+
+/*
+ * memp_fopen --
+ *	Open a backing file for the memory pool.
+ *
+ *	Public entry point: checks the flags argument and then defers to
+ *	__memp_fopen, asking it to take the region lock itself.
+ */
+int
+memp_fopen(dbmp, path, ftype,
+    flags, mode, pagesize, lsn_offset, pgcookie, fileid, retp)
+	DB_MPOOL *dbmp;
+	const char *path;
+	int ftype, flags, mode, lsn_offset;
+	size_t pagesize;
+	DBT *pgcookie;
+	u_int8_t *fileid;
+	DB_MPOOLFILE **retp;
+{
+	int ret;
+
+	/* Only DB_CREATE, DB_NOMMAP and DB_RDONLY are accepted here. */
+	ret = __db_fchk(dbmp->dbenv,
+	    "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY);
+
+	/* Flags are fine: do the real open, with locking enabled. */
+	if (ret == 0)
+		ret = __memp_fopen(dbmp, path, ftype, flags,
+		    mode, pagesize, lsn_offset, pgcookie, fileid, 1, retp);
+
+	return (ret);
+}
+
+/*
+ * __memp_fopen --
+ * Open a backing file for the memory pool; internal version.
+ *
+ * Allocates the per-process DB_MPOOLFILE, opens the file (unless path is
+ * NULL, which denotes a temporary file whose descriptor stays -1 here),
+ * attaches it to the shared MPOOLFILE through __memp_mf_open, optionally
+ * mmap's it, and links the handle onto dbmp->dbmfq.
+ *
+ * needlock: non-zero to acquire the region lock around __memp_mf_open;
+ * memp_fopen passes 1, __memp_bhwrite passes 0.
+ * retp: receives the new handle on success; not written on failure.
+ *
+ * Returns 0 on success, EINVAL for a zero pagesize, a read-only temporary
+ * file or a file whose size is not a multiple of pagesize, ENOMEM if the
+ * handle can't be allocated, or the error from a failing helper.
+ *
+ * PUBLIC: int __memp_fopen __P((DB_MPOOL *, const char *, int, int,
+ * PUBLIC: int, size_t, int, DBT *, u_int8_t *, int, DB_MPOOLFILE **));
+ */
+int
+__memp_fopen(dbmp, path,
+ ftype, flags, mode, pagesize, lsn_offset, pgcookie, fileid, needlock, retp)
+ DB_MPOOL *dbmp;
+ const char *path;
+ int ftype, flags, mode, lsn_offset, needlock;
+ size_t pagesize;
+ DBT *pgcookie;
+ u_int8_t *fileid;
+ DB_MPOOLFILE **retp;
+{
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ off_t size;
+ int ret;
+
+ dbenv = dbmp->dbenv;
+ ret = 0;
+
+ /* Require a non-zero pagesize. */
+ if (pagesize == 0) {
+ __db_err(dbenv, "memp_fopen: pagesize not specified");
+ return (EINVAL);
+ }
+
+ /* Allocate and initialize the per-process structure. */
+ if ((dbmfp =
+ (DB_MPOOLFILE *)calloc(1, sizeof(DB_MPOOLFILE))) == NULL) {
+ __db_err(dbenv, "%s: %s",
+ path == NULL ? TEMPORARY : path, strerror(ENOMEM));
+ return (ENOMEM);
+ }
+ LOCKINIT(dbmp, &dbmfp->mutex);
+ dbmfp->dbmp = dbmp;
+ dbmfp->fd = -1;
+ if (LF_ISSET(DB_RDONLY))
+ F_SET(dbmfp, MP_READONLY);
+
+ if (path == NULL) {
+ if (LF_ISSET(DB_RDONLY)) {
+ __db_err(dbenv,
+ "memp_fopen: temporary files can't be readonly");
+ ret = EINVAL;
+ goto err;
+ }
+ /* Shared constant name, not allocated: MP_PATH_ALLOC stays clear. */
+ dbmfp->path = (char *) TEMPORARY;
+ F_SET(dbmfp, MP_PATH_TEMP);
+ } else {
+ /* Calculate the real name for this file. */
+ if ((ret = __db_appname(dbenv,
+ DB_APP_DATA, NULL, path, NULL, &dbmfp->path)) != 0)
+ goto err;
+ /* The err exit frees dbmfp->path only when this flag is set. */
+ F_SET(dbmfp, MP_PATH_ALLOC);
+
+
+ /* Open the file. */
+ if ((ret = __db_fdopen(dbmfp->path,
+ LF_ISSET(DB_CREATE | DB_RDONLY), DB_CREATE | DB_RDONLY,
+ mode, &dbmfp->fd)) != 0) {
+ __db_err(dbenv, "%s: %s", dbmfp->path, strerror(ret));
+ goto err;
+ }
+
+ /* Don't permit files that aren't a multiple of the pagesize. */
+ if ((ret = __db_stat(dbenv,
+ dbmfp->path, dbmfp->fd, &size, NULL)) != 0)
+ goto err;
+ if (size % pagesize) {
+ __db_err(dbenv,
+ "%s: file size not a multiple of the pagesize",
+ dbmfp->path);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+
+ /* Find/allocate the shared file object. */
+ if (needlock)
+ LOCKREGION(dbmp);
+ ret = __memp_mf_open(dbmp, dbmfp, ftype,
+ F_ISSET(dbmfp, MP_READONLY), pagesize,
+ lsn_offset, pgcookie, fileid, F_ISSET(dbmfp, MP_PATH_TEMP), &mfp);
+ if (needlock)
+ UNLOCKREGION(dbmp);
+ if (ret != 0)
+ goto err;
+
+ dbmfp->mfp = mfp;
+
+ /*
+ * If a file:
+ *
+ * + is read-only
+ * + doesn't require any pgin/pgout support
+ * + is less than mp_mmapsize bytes in size.
+ * + and the DB_NOMMAP flag wasn't set
+ *
+ * we can mmap it instead of reading/writing buffers. Don't do error
+ * checking based on the mmap call failure. We want to do normal I/O
+ * on the file if the reason we failed was because the file was on an
+ * NFS mounted partition, and we can fail in buffer I/O just as easily
+ * as here.
+ *
+ * XXX
+ * We'd like to test to see if the file is too big to mmap. Since we
+ * don't know what size or type off_t's or size_t's are, or the largest
+ * unsigned integral type is, or what random insanity the local C
+ * compiler will perpetrate, doing the comparison in a portable way is
+ * flatly impossible. Hope that mmap fails if the file is too large.
+ */
+#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */
+ dbmfp->addr = NULL;
+ /*
+ * size is assigned only on the path != NULL branch above; the
+ * path != NULL test short-circuits before size is read here.
+ *
+ * NOTE(review): mfp is the shared object, and __memp_mf_open clears
+ * can_mmap when an existing file is reopened read-write. This
+ * assignment overwrites that value unconditionally, and it runs after
+ * the region lock was released above -- confirm that is intended.
+ */
+ mfp->can_mmap = F_ISSET(dbmfp, MP_READONLY) &&
+ ftype == 0 && !LF_ISSET(DB_NOMMAP) && path != NULL &&
+ size <= (dbenv == NULL || dbenv->mp_mmapsize == 0 ?
+ DB_MAXMMAPSIZE : (off_t)dbenv->mp_mmapsize);
+ if (mfp->can_mmap) {
+ dbmfp->len = size;
+ if (__db_mmap(dbmfp->fd, dbmfp->len, 1, 1, &dbmfp->addr) != 0) {
+ mfp->can_mmap = 0;
+ dbmfp->addr = NULL;
+ }
+ }
+
+ /* Make the handle visible on the per-process list of open files. */
+ LOCKHANDLE(dbmp, &dbmp->mutex);
+ TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
+ UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+ *retp = dbmfp;
+ return (0);
+
+ /*
+ * Failure exit: undo whatever was set up. dbmfp is never NULL here
+ * (allocation failure returns earlier), so the last test is redundant.
+ */
+err: if (F_ISSET(dbmfp, MP_PATH_ALLOC))
+ FREES(dbmfp->path);
+ if (dbmfp->fd != -1)
+ (void)__db_close(dbmfp->fd);
+ if (dbmfp != NULL)
+ FREE(dbmfp, sizeof(DB_MPOOLFILE));
+ return (ret);
+}
+
+/*
+ * __memp_mf_open --
+ *	Open an MPOOLFILE.
+ *
+ *	Finds the shared MPOOLFILE whose file id matches, or allocates and
+ *	initializes a new one in the region.  Temporary files (istemp
+ *	non-zero) never match an existing entry.
+ *
+ *	On success returns 0 and stores the MPOOLFILE in *retp.  On failure
+ *	returns the error (EINVAL if an existing entry has a different
+ *	ftype or pagesize, otherwise the allocator's or __db_fileid's
+ *	error); *retp is set to NULL except when __db_fileid fails, in
+ *	which case it is left untouched.
+ */
+static int
+__memp_mf_open(dbmp, dbmfp,
+    ftype, readonly, pagesize, lsn_offset, pgcookie, fileid, istemp, retp)
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	int ftype, readonly, lsn_offset, istemp;
+	size_t pagesize;
+	DBT *pgcookie;
+	u_int8_t *fileid;
+	MPOOLFILE **retp;
+{
+	MPOOLFILE *mfp;
+	int ret;
+	u_int8_t idbuf[DB_FILE_ID_LEN];
+	void *p;
+
+	/* ret is returned from every exit, so it must start out defined. */
+	mfp = NULL;
+	ret = 0;
+
+	/* Temporary files can't match previous files. */
+	if (istemp)
+		goto alloc;
+
+	/*
+	 * Get the file id if we weren't given one.  Generated file id's don't
+	 * use timestamps, otherwise there'd be no chance of anyone joining
+	 * the party.
+	 */
+	if (fileid == NULL) {
+		if ((ret =
+		    __db_fileid(dbmp->dbenv, dbmfp->path, 0, idbuf)) != 0)
+			return (ret);
+		fileid = idbuf;
+	}
+
+	/* Walk the list of MPOOLFILE's, looking for a matching file. */
+	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		if (!memcmp(fileid,
+		    ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) {
+			if (ftype != mfp->ftype ||
+			    pagesize != mfp->stat.st_pagesize) {
+				__db_err(dbmp->dbenv,
+				    "%s: ftype or pagesize changed",
+				    dbmfp->path);
+				ret = EINVAL;
+				mfp = NULL;
+				goto ret1;
+			}
+			/*
+			 * Found it: increment the reference count and update
+			 * the mmap-able status.
+			 */
+			++mfp->ref;
+			if (!readonly)
+				mfp->can_mmap = 0;
+			goto ret1;
+		}
+
+	/* Allocate a new MPOOLFILE. */
+alloc:	if ((ret = __memp_ralloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0) {
+		mfp = NULL;
+		goto ret1;
+	}
+
+	/* Initialize the structure; all offsets start out as 0. */
+	memset(mfp, 0, sizeof(MPOOLFILE));
+	mfp->ref = 1;
+	mfp->ftype = ftype;
+	mfp->lsn_off = lsn_offset;
+	mfp->stat.st_pagesize = pagesize;
+
+	/* Copy the file path into shared memory. */
+	if ((ret = __memp_ralloc(dbmp,
+	    strlen(dbmfp->path) + 1, &mfp->path_off, &p)) != 0)
+		goto err;
+	memcpy(p, dbmfp->path, strlen(dbmfp->path) + 1);
+
+	/* Copy the file identification string into shared memory. */
+	if (istemp)
+		mfp->fileid_off = 0;
+	else {
+		if ((ret = __memp_ralloc(dbmp,
+		    DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+			goto err;
+		memcpy(p, fileid, DB_FILE_ID_LEN);
+	}
+
+	/* Copy the page cookie into shared memory. */
+	if (pgcookie == NULL || pgcookie->size == 0) {
+		mfp->pgcookie_len = 0;
+		mfp->pgcookie_off = 0;
+	} else {
+		if ((ret = __memp_ralloc(dbmp,
+		    pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
+			goto err;
+		memcpy(p, pgcookie->data, pgcookie->size);
+		mfp->pgcookie_len = pgcookie->size;
+	}
+
+	/* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
+	SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile);
+
+	if (0) {
+		/*
+		 * Release only the pieces that were actually allocated: an
+		 * offset of 0 means the allocation never happened (this
+		 * presumes __memp_ralloc leaves the offset untouched when
+		 * it fails).  The page cookie is the last allocation, so
+		 * there is never one to release here.
+		 */
+err:		if (mfp->path_off != 0)
+			__db_shalloc_free(dbmp->addr,
+			    ADDR(dbmp, mfp->path_off));
+		if (mfp->fileid_off != 0)
+			__db_shalloc_free(dbmp->addr,
+			    ADDR(dbmp, mfp->fileid_off));
+		__db_shalloc_free(dbmp->addr, mfp);
+		mfp = NULL;
+	}
+
+	/* Report failures to the caller instead of always returning 0. */
+ret1:	*retp = mfp;
+	return (ret);
+}
+
+/*
+ * memp_fclose --
+ *	Close a backing file for the memory pool.
+ *
+ *	Unlinks the handle from the per-process list, drops its reference
+ *	on the shared MPOOLFILE, unmaps and closes the file, and frees the
+ *	handle.  Cleanup always runs to completion; the return value is 0,
+ *	or the first error seen from the munmap or close step.
+ */
+int
+memp_fclose(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	DB_MPOOL *dbmp;
+	int ret, t_ret;
+
+	dbmp = dbmfp->dbmp;
+	ret = 0;
+
+	/* Complain if pinned blocks never returned. */
+	if (dbmfp->pinref != 0)
+		__db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned",
+		    dbmfp->path, (u_long)dbmfp->pinref);
+
+	/* Remove the DB_MPOOLFILE structure from the list. */
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	/* Close the underlying MPOOLFILE. */
+	(void)__memp_mf_close(dbmp, dbmfp);
+
+	/* Discard any mmap information. */
+	if (dbmfp->addr != NULL &&
+	    (ret = __db_munmap(dbmfp->addr, dbmfp->len)) != 0)
+		__db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(ret));
+
+	/* Close the file; temporary files may not yet have been created. */
+	if (dbmfp->fd != -1 && (t_ret = __db_close(dbmfp->fd)) != 0) {
+		__db_err(dbmp->dbenv, "%s: %s", dbmfp->path, strerror(t_ret));
+
+		/* Keep an earlier munmap error; otherwise report this one. */
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	/* Potentially allocated path. */
+	if (F_ISSET(dbmfp, MP_PATH_ALLOC))
+		FREES(dbmfp->path);
+
+	/* Free the DB_MPOOLFILE structure. */
+	FREE(dbmfp, sizeof(DB_MPOOLFILE));
+
+	return (ret);
+}
+
+/*
+ * __memp_mf_close --
+ *	Close down an MPOOLFILE.
+ *
+ *	Drops one reference on the shared MPOOLFILE behind dbmfp.  When the
+ *	last reference goes away, the file's buffers are moved to the free
+ *	list and the MPOOLFILE and its path, file id and page cookie are
+ *	released.  Takes and releases the region lock itself; always
+ *	returns 0.
+ */
+static int
+__memp_mf_close(dbmp, dbmfp)
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+{
+	BH *bhp, *nbhp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t mf_offset;
+
+	mp = dbmp->mp;
+	mfp = dbmfp->mfp;
+
+	LOCKREGION(dbmp);
+
+	/* If more than a single reference, simply decrement. */
+	if (mfp->ref > 1) {
+		--mfp->ref;
+		goto ret1;
+	}
+
+	/*
+	 * Move any BH's held by the file to the free list.  We don't free the
+	 * memory itself because we may be discarding the memory pool, and it's
+	 * fairly expensive to reintegrate the buffers back into the region for
+	 * no purpose.
+	 */
+	mf_offset = OFFSET(dbmp, mfp);
+	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
+		/* Fetch the successor first: bhp may leave this list below. */
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+#ifdef DEBUG_NO_DIRTY
+		/* Complain if we find any blocks that were left dirty. */
+		if (F_ISSET(bhp, BH_DIRTY))
+			__db_err(dbmp->dbenv,
+			    "%s: close: pgno %lu left dirty; ref %lu",
+			    dbmfp->path, (u_long)bhp->pgno, (u_long)bhp->ref);
+#endif
+
+		if (bhp->mf_offset == mf_offset) {
+			__memp_bhfree(dbmp, mfp, bhp, 0);
+			SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh);
+		}
+	}
+
+	/* Delete from the list of MPOOLFILEs. */
+	SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile);
+
+	/*
+	 * Free the space.  The pieces hanging off the MPOOLFILE go first,
+	 * because their offsets are stored in the structure itself; the
+	 * structure must not be read once it has been freed.
+	 */
+	__db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->path_off));
+	if (mfp->fileid_off != 0)
+		__db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->fileid_off));
+	if (mfp->pgcookie_off != 0)
+		__db_shalloc_free(dbmp->addr, ADDR(dbmp, mfp->pgcookie_off));
+	__db_shalloc_free(dbmp->addr, mfp);
+
+ret1:	UNLOCKREGION(dbmp);
+	return (0);
+}
diff --git a/db2/mp/mp_fput.c b/db2/mp/mp_fput.c
new file mode 100644
index 0000000000..5fac8ae76b
--- /dev/null
+++ b/db2/mp/mp_fput.c
@@ -0,0 +1,140 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fput.c 10.10 (Sleepycat) 7/20/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdlib.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_fput --
+ *	Mpool file put function.
+ *
+ *	Releases the caller's pin on the page at pgaddr.  flags is either 0
+ *	or a combination of DB_MPOOL_CLEAN, DB_MPOOL_DIRTY and
+ *	DB_MPOOL_DISCARD; CLEAN and DIRTY are checked against each other.
+ *	Returns 0 on success, the flag-validation error, or EACCES when
+ *	DB_MPOOL_DIRTY is requested on a handle marked MP_READONLY.
+ */
+int
+memp_fput(dbmfp, pgaddr, flags)
+	DB_MPOOLFILE *dbmfp;
+	void *pgaddr;
+	u_long flags;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	int ret, wrote;
+
+	dbmp = dbmfp->dbmp;
+
+	/* Validate arguments. */
+	if (flags != 0) {
+		ret = __db_fchk(dbmp->dbenv, "memp_fput", flags,
+		    DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD);
+		if (ret != 0)
+			return (ret);
+
+		ret = __db_fcchk(dbmp->dbenv, "memp_fput",
+		    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY);
+		if (ret != 0)
+			return (ret);
+
+		if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) {
+			__db_err(dbmp->dbenv,
+			    "%s: dirty flag set for readonly file page",
+			    dbmfp->path);
+			return (EACCES);
+		}
+	}
+
+	/*
+	 * Drop the per-handle pin count.  A put without a matching get is
+	 * reported but does not stop the call.
+	 */
+	LOCKHANDLE(dbmp, &dbmfp->mutex);
+	if (dbmfp->pinref != 0) {
+		--dbmfp->pinref;
+	} else {
+		__db_err(dbmp->dbenv,
+		    "%s: put: more blocks returned than retrieved",
+		    dbmfp->path);
+	}
+	UNLOCKHANDLE(dbmp, &dbmfp->mutex);
+
+	/*
+	 * Pages that live inside the file's mapped range have no buffer
+	 * header, so there is nothing more to do for them.  Mapping can stop
+	 * at any time, which is why the address is tested on every call.
+	 */
+	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
+	    (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
+		return (0);
+
+	/* Step back from the page data to the header that precedes it. */
+	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+	LOCKREGION(dbmp);
+	mp = dbmp->mp;
+
+	/* Apply the requested state changes to the buffer. */
+	if (LF_ISSET(DB_MPOOL_CLEAN)) {
+		F_CLR(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DIRTY)) {
+		F_SET(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DISCARD)) {
+		F_SET(bhp, BH_DISCARD);
+	}
+
+#ifdef DEBUG
+	if (bhp->ref == 0) {
+		__db_err(dbmp->dbenv,
+		    "Internal error: bhp->ref on page %lu went negative.",
+		    (u_long)bhp->pgno);
+		abort();
+	}
+#endif
+	/*
+	 * While other references remain, the buffer keeps its LRU position
+	 * and any discard request waits for the last reference to go away.
+	 */
+	if (--bhp->ref > 0) {
+		UNLOCKREGION(dbmp);
+		return (0);
+	}
+
+	/* Last reference: discards go to the head of the LRU, others to the tail. */
+	SH_TAILQ_REMOVE(&mp->bhq, bhp, q, __bh);
+	if (F_ISSET(bhp, BH_DISCARD)) {
+		SH_TAILQ_INSERT_HEAD(&mp->bhq, bhp, q, __bh);
+	} else {
+		SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
+	}
+
+	/*
+	 * A buffer marked BH_WRITE was scheduled by a checkpoint.  If it is
+	 * still dirty, write it now; when that fails, or nothing was written,
+	 * set MP_LSN_RETRY so the next memp_sync call tries again.  If it is
+	 * no longer dirty, clear the mark and lower the pending-write counts.
+	 */
+	if (F_ISSET(bhp, BH_WRITE)) {
+		if (!F_ISSET(bhp, BH_DIRTY)) {
+			F_CLR(bhp, BH_WRITE);
+
+			mfp = ADDR(dbmp, bhp->mf_offset);
+			--mfp->lsn_cnt;
+
+			--mp->lsn_cnt;
+		} else if (__memp_bhwrite(dbmp,
+		    dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) {
+			F_SET(mp, MP_LSN_RETRY);
+		}
+	}
+
+	UNLOCKREGION(dbmp);
+	return (0);
+}
diff --git a/db2/mp/mp_fset.c b/db2/mp/mp_fset.c
new file mode 100644
index 0000000000..588085a358
--- /dev/null
+++ b/db2/mp/mp_fset.c
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_fset.c 10.8 (Sleepycat) 8/19/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_fset --
+ *	Mpool page set-flag routine.
+ *
+ *	Updates the BH_DIRTY / BH_DISCARD bits of the buffer header in front
+ *	of pgaddr according to flags (DB_MPOOL_DIRTY, DB_MPOOL_CLEAN,
+ *	DB_MPOOL_DISCARD).  Returns 0 on success, the flag-validation error,
+ *	or EACCES when DB_MPOOL_DIRTY is requested on a handle marked
+ *	MP_READONLY.
+ */
+int
+memp_fset(dbmfp, pgaddr, flags)
+	DB_MPOOLFILE *dbmfp;
+	void *pgaddr;
+	u_long flags;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	int ret;
+
+	dbmp = dbmfp->dbmp;
+
+	/* Validate arguments. */
+	if (flags != 0) {
+		ret = __db_fchk(dbmp->dbenv, "memp_fset", flags,
+		    DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD);
+		if (ret != 0)
+			return (ret);
+
+		ret = __db_fcchk(dbmp->dbenv, "memp_fset",
+		    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY);
+		if (ret != 0)
+			return (ret);
+
+		if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) {
+			__db_err(dbmp->dbenv,
+			    "%s: dirty flag set for readonly file page",
+			    dbmfp->path);
+			return (EACCES);
+		}
+	}
+
+	/* Step back from the page data to the header that precedes it. */
+	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+	/* The header's flags are changed only with the region locked. */
+	LOCKREGION(dbmp);
+
+	if (LF_ISSET(DB_MPOOL_DIRTY)) {
+		F_SET(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_CLEAN)) {
+		F_CLR(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DISCARD)) {
+		F_SET(bhp, BH_DISCARD);
+	}
+
+	UNLOCKREGION(dbmp);
+	return (0);
+}
diff --git a/db2/mp/mp_open.c b/db2/mp/mp_open.c
new file mode 100644
index 0000000000..257ce1b9e9
--- /dev/null
+++ b/db2/mp/mp_open.c
@@ -0,0 +1,176 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_open.c 10.12 (Sleepycat) 7/6/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_open --
+ *	Initialize and/or join a memory pool.
+ *
+ *	Allocates a DB_MPOOL handle, attaches it to (or creates) the pool
+ *	region through __memp_ropen and stores the handle in *retp.
+ *	Returns 0 on success, the flag-validation error, ENOMEM when the
+ *	handle cannot be allocated, or the error from __memp_ropen.
+ */
+int
+memp_open(path, flags, mode, dbenv, retp)
+	const char *path;
+	int flags, mode;
+	DB_ENV *dbenv;
+	DB_MPOOL **retp;
+{
+	DB_MPOOL *dbmp;
+	size_t cachesize;
+	int ret;
+
+	/* Validate arguments. */
+#ifdef HAVE_SPINLOCKS
+#define	OKFLAGS	(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP | DB_THREAD)
+#else
+#define	OKFLAGS	(DB_CREATE | DB_MPOOL_PRIVATE | DB_NOMMAP)
+#endif
+	ret = __db_fchk(dbenv, "memp_open", flags, OKFLAGS);
+	if (ret != 0)
+		return (ret);
+
+	/* The cache size comes from the environment; 0 when there is none. */
+	if (dbenv == NULL)
+		cachesize = 0;
+	else
+		cachesize = dbenv->mp_size;
+
+	/* Create and initialize the DB_MPOOL structure. */
+	dbmp = (DB_MPOOL *)calloc(1, sizeof(DB_MPOOL));
+	if (dbmp == NULL)
+		return (ENOMEM);
+	LOCKINIT(dbmp, &dbmp->mutex);
+	LIST_INIT(&dbmp->dbregq);
+	TAILQ_INIT(&dbmp->dbmfq);
+
+	dbmp->dbenv = dbenv;
+
+	/*
+	 * Decide if it's possible for anyone else to access the pool: it is
+	 * private when there is neither an environment nor a path, or when
+	 * the environment carries DB_MPOOL_PRIVATE.
+	 */
+	if (dbenv == NULL) {
+		if (path == NULL)
+			F_SET(dbmp, MP_ISPRIVATE);
+	} else if (F_ISSET(dbenv, DB_MPOOL_PRIVATE)) {
+		F_SET(dbmp, MP_ISPRIVATE);
+	}
+
+	/*
+	 * Map in the region.  We do locking regardless, as portions of it are
+	 * implemented in common code (if we put the region in a file, that is).
+	 */
+	F_SET(dbmp, MP_LOCKREGION);
+	ret = __memp_ropen(dbmp, path, cachesize, mode, flags);
+	if (ret != 0) {
+		/* Nothing else owns the handle yet; discard it. */
+		FREE(dbmp, sizeof(DB_MPOOL));
+		return (ret);
+	}
+	F_CLR(dbmp, MP_LOCKREGION);
+
+	/*
+	 * If there's concurrent access, then we have to lock the region.
+	 * If it's threaded, then we have to lock both the handles and the
+	 * region.
+	 */
+	if (!F_ISSET(dbmp, MP_ISPRIVATE)) {
+		F_SET(dbmp, MP_LOCKREGION);
+	}
+	if (LF_ISSET(DB_THREAD)) {
+		F_SET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION);
+	}
+
+	*retp = dbmp;
+	return (0);
+}
+
+/*
+ * memp_close --
+ *	Close a memory pool.
+ *
+ *	Frees the handle's DB_MPREG entries, closes every DB_MPOOLFILE still
+ *	on the handle, closes the region and frees the handle.  Returns the
+ *	first non-zero error seen along the way, or 0.
+ */
+int
+memp_close(dbmp)
+	DB_MPOOL *dbmp;
+{
+	DB_MPOOLFILE *dbmfp;
+	DB_MPREG *mpreg;
+	int ret, t_ret;
+
+	ret = 0;
+
+	/* Discard DB_MPREGs. */
+	for (;;) {
+		mpreg = LIST_FIRST(&dbmp->dbregq);
+		if (mpreg == NULL)
+			break;
+		LIST_REMOVE(mpreg, q);
+		FREE(mpreg, sizeof(DB_MPREG));
+	}
+
+	/*
+	 * Discard DB_MPOOLFILEs.  The loop keeps taking the queue head, so
+	 * it relies on memp_fclose unlinking the handle it is given.
+	 */
+	for (;;) {
+		dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+		if (dbmfp == NULL)
+			break;
+		t_ret = memp_fclose(dbmfp);
+		if (t_ret != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	/* Close the region. */
+	t_ret = __memp_rclose(dbmp);
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Free the structure. */
+	FREE(dbmp, sizeof(DB_MPOOL));
+
+	return (ret);
+}
+
+/*
+ * memp_unlink --
+ *	Exit a memory pool.
+ *
+ *	Thin wrapper: hands path, force and dbenv to __db_runlink together
+ *	with the default mpool file name and returns its result.
+ */
+int
+memp_unlink(path, force, dbenv)
+	const char *path;
+	int force;
+	DB_ENV *dbenv;
+{
+	int ret;
+
+	ret = __db_runlink(dbenv,
+	    DB_APP_NONE, path, DB_DEFAULT_MPOOL_FILE, force);
+	return (ret);
+}
+
+/*
+ * memp_register --
+ *	Register a file type's pgin, pgout routines.
+ *
+ *	Records (ftype, pgin, pgout) in a new DB_MPREG entry on the handle.
+ *	Returns 0, or ENOMEM if the entry cannot be allocated.
+ */
+int
+memp_register(dbmp, ftype, pgin, pgout)
+	DB_MPOOL *dbmp;
+	int ftype;
+	int (*pgin) __P((db_pgno_t, void *, DBT *));
+	int (*pgout) __P((db_pgno_t, void *, DBT *));
+{
+	DB_MPREG *entry;
+
+	entry = (DB_MPREG *)malloc(sizeof(DB_MPREG));
+	if (entry == NULL)
+		return (ENOMEM);
+
+	entry->ftype = ftype;
+	entry->pgin = pgin;
+	entry->pgout = pgout;
+
+	/*
+	 * New entries go on the front of the list.  Lookups walk the list
+	 * from the head, so the newest registration for a type is the one
+	 * found and duplicates need no special handling here.
+	 */
+	LOCKHANDLE(dbmp, &dbmp->mutex);
+	LIST_INSERT_HEAD(&dbmp->dbregq, entry, q);
+	UNLOCKHANDLE(dbmp, &dbmp->mutex);
+
+	return (0);
+}
diff --git a/db2/mp/mp_pr.c b/db2/mp/mp_pr.c
new file mode 100644
index 0000000000..94eabf5947
--- /dev/null
+++ b/db2/mp/mp_pr.c
@@ -0,0 +1,313 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_pr.c 10.12 (Sleepycat) 7/29/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+
+void __memp_debug __P((DB_MPOOL *, FILE *, int));
+
+static void __memp_pbh __P((FILE *, DB_MPOOL *, BH *, int));
+static void __memp_pdbmf __P((FILE *, DB_MPOOLFILE *, int));
+static void __memp_pmf __P((FILE *, MPOOLFILE *, int));
+static void __memp_pmp __P((FILE *, DB_MPOOL *, MPOOL *, int));
+
+/*
+ * memp_stat --
+ *	Display MPOOL statistics.
+ *
+ *	gspp, when non-NULL, receives a newly allocated copy of the pool-wide
+ *	statistics.  fspp, when non-NULL, receives a newly allocated,
+ *	NULL-terminated array of per-file statistics (or NULL when the pool
+ *	has no files); each entry carries its file name in the same
+ *	allocation.  db_malloc, when non-NULL, is used in place of malloc.
+ *
+ *	Returns 0 on success or ENOMEM.  On ENOMEM the region lock has been
+ *	released and whatever was already stored through gspp/fspp is left
+ *	for the caller; a partially built *fspp array is still
+ *	NULL-terminated.
+ */
+int
+memp_stat(dbmp, gspp, fspp, db_malloc)
+	DB_MPOOL *dbmp;
+	DB_MPOOL_STAT **gspp;
+	DB_MPOOL_FSTAT ***fspp;
+	void *(*db_malloc) __P((size_t));
+{
+	DB_MPOOL_FSTAT **tfsp;
+	MPOOLFILE *mfp;
+	size_t cnt, len, nlen;
+	char *name;
+	int ret;
+
+	ret = 0;
+
+	/* Allocate space for the global statistics. */
+	if (gspp != NULL) {
+		*gspp = NULL;
+
+		if ((*gspp = db_malloc == NULL ?
+		    (DB_MPOOL_STAT *)malloc(sizeof(**gspp)) :
+		    (DB_MPOOL_STAT *)db_malloc(sizeof(**gspp))) == NULL)
+			return (ENOMEM);
+
+		LOCKREGION(dbmp);
+
+		/* Copy out the global statistics. */
+		**gspp = dbmp->mp->stat;
+		(*gspp)->st_hash_buckets = dbmp->mp->htab_buckets;
+
+		UNLOCKREGION(dbmp);
+	}
+
+	if (fspp != NULL) {
+		*fspp = NULL;
+
+		LOCKREGION(dbmp);
+
+		/* Count the MPOOLFILE structures. */
+		cnt = 0;
+		for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+			++cnt;
+
+		UNLOCKREGION(dbmp);
+
+		if (cnt == 0)
+			return (0);
+
+		/* Allocate space for the pointers, plus the terminator. */
+		len = (cnt + 1) * sizeof(DB_MPOOL_FSTAT *);
+		if ((*fspp = db_malloc == NULL ?
+		    (DB_MPOOL_FSTAT **)malloc(len) :
+		    (DB_MPOOL_FSTAT **)db_malloc(len)) == NULL)
+			return (ENOMEM);
+
+		LOCKREGION(dbmp);
+
+		/*
+		 * Build each individual entry.  The lock was dropped while
+		 * the array was allocated, so the list may have grown since
+		 * it was counted; never fill more than cnt slots.
+		 */
+		for (tfsp = *fspp,
+		    mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+		    cnt > 0 && mfp != NULL;
+		    --cnt, ++tfsp, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			name = ADDR(dbmp, mfp->path_off);
+			nlen = strlen(name);
+
+			/* One allocation holds the structure and its name. */
+			len = sizeof(DB_MPOOL_FSTAT) + nlen + 1;
+			if ((*tfsp = db_malloc == NULL ?
+			    (DB_MPOOL_FSTAT *)malloc(len) :
+			    (DB_MPOOL_FSTAT *)db_malloc(len)) == NULL) {
+				/* Leave the loop so the lock is released. */
+				ret = ENOMEM;
+				break;
+			}
+			**tfsp = mfp->stat;
+			(*tfsp)->file_name =
+			    (char *)*tfsp + sizeof(DB_MPOOL_FSTAT);
+			memcpy((*tfsp)->file_name, name, nlen + 1);
+		}
+		*tfsp = NULL;
+
+		UNLOCKREGION(dbmp);
+	}
+	return (ret);
+}
+
+/*
+ * __memp_debug --
+ *	Display MPOOL structures.
+ *
+ *	Writes the per-process handle information followed by the shared
+ *	pool information to fp (stderr when fp is NULL).  A non-zero data
+ *	argument selects the more detailed output.
+ *
+ * PUBLIC: void __memp_debug __P((DB_MPOOL *, FILE *, int));
+ */
+void
+__memp_debug(dbmp, fp, data)
+	DB_MPOOL *dbmp;
+	FILE *fp;
+	int data;
+{
+	DB_MPOOLFILE *dbmfp;
+	u_long nfiles;
+
+	/* Make it easy to call from the debugger. */
+	if (fp == NULL)
+		fp = stderr;
+
+	/* Welcome message. */
+	(void)fprintf(fp, "%s\nMpool per-process (%lu) statistics\n",
+	    DB_LINE, (u_long)getpid());
+
+	if (data) {
+		(void)fprintf(fp, " fd: %d; addr %lx; maddr %lx\n",
+		    dbmp->fd, (u_long)dbmp->addr, (u_long)dbmp->maddr);
+	}
+
+	/* Count, then display, the DB_MPOOLFILE structures. */
+	nfiles = 0;
+	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+		++nfiles;
+	(void)fprintf(fp, "%lu process-local files\n", nfiles);
+
+	dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	while (dbmfp != NULL) {
+		(void)fprintf(fp, "%s\n", dbmfp->path);
+		__memp_pdbmf(fp, dbmfp, data);
+		dbmfp = TAILQ_NEXT(dbmfp, q);
+	}
+
+	/* Switch to global statistics. */
+	(void)fprintf(fp, "\n%s\nMpool statistics\n", DB_LINE);
+
+	/* Display the MPOOL structure. */
+	__memp_pmp(fp, dbmp, dbmp->mp, data);
+
+	/* Flush in case we're debugging. */
+	(void)fflush(fp);
+}
+
+/*
+ * __memp_pdbmf --
+ *	Display a DB_MPOOLFILE structure.
+ *
+ *	Prints the handle's descriptor and access mode to fp; prints nothing
+ *	when data is zero.
+ */
+static void
+__memp_pdbmf(fp, dbmfp, data)
+	FILE *fp;
+	DB_MPOOLFILE *dbmfp;
+	int data;
+{
+	const char *access;
+
+	if (data) {
+		access = F_ISSET(dbmfp, MP_READONLY) ?
+		    "readonly" : "read/write";
+		(void)fprintf(fp, " fd: %d; %s\n", dbmfp->fd, access);
+	}
+}
+
+/*
+ * __memp_pmp --
+ *	Display the MPOOL structure.
+ *
+ *	Prints the pool-wide counters and one section per MPOOLFILE.  When
+ *	data is non-zero it also dumps every hash bucket's chain of buffer
+ *	headers and the LRU list.
+ */
+static void
+__memp_pmp(fp, dbmp, mp, data)
+	FILE *fp;
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	int data;
+{
+	BH *bhp;
+	MPOOLFILE *mfp;
+	size_t bucket;
+	int cnt;
+	const char *sep;
+
+	(void)fprintf(fp, "references: %lu; cachesize: %lu\n",
+	    (u_long)mp->rlayout.refcnt, (u_long)mp->stat.st_cachesize);
+	(void)fprintf(fp,
+	    " %lu pages created\n", mp->stat.st_page_create);
+	(void)fprintf(fp,
+	    " %lu mmap pages returned\n", mp->stat.st_map);
+	(void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n",
+	    mp->stat.st_page_in + mp->stat.st_page_out,
+	    mp->stat.st_page_in, mp->stat.st_page_out);
+
+	/* The hit rate is only meaningful once there has been a lookup. */
+	if (mp->stat.st_cache_hit + mp->stat.st_cache_miss != 0)
+		(void)fprintf(fp,
+		    " %.0f%% cache hit rate (%lu hit, %lu miss)\n",
+		    ((double)mp->stat.st_cache_hit /
+		    (mp->stat.st_cache_hit + mp->stat.st_cache_miss)) * 100,
+		    mp->stat.st_cache_hit, mp->stat.st_cache_miss);
+
+	/* Count, then display, the MPOOLFILE structures. */
+	cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		++cnt;
+	(void)fprintf(fp, "%d total files\n", cnt);
+
+	cnt = 1;
+	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+		(void)fprintf(fp, "file %d\n", cnt);
+		__memp_pmf(fp, mfp, data);
+		++cnt;
+	}
+
+	if (!data)
+		return;
+
+	/* Display the hash table list of BH's. */
+	(void)fprintf(fp, "%s\nHASH table of BH's (%lu buckets):\n",
+	    DB_LINE, (u_long)mp->htab_buckets);
+	(void)fprintf(fp,
+	    "longest chain searched %lu\n", mp->stat.st_hash_longest);
+
+	/* Divide by 1 instead of 0 when no searches have been recorded. */
+	(void)fprintf(fp, "average chain searched %lu (total/calls: %lu/%lu)\n",
+	    mp->stat.st_hash_examined /
+	    (mp->stat.st_hash_searches ? mp->stat.st_hash_searches : 1),
+	    mp->stat.st_hash_examined, mp->stat.st_hash_searches);
+
+	for (bucket = 0; bucket < mp->htab_buckets; ++bucket) {
+		/* Only non-empty buckets get a heading. */
+		bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
+		if (bhp != NULL)
+			(void)fprintf(fp, "%lu:\n", (u_long)bucket);
+		for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, mq, __bh))
+			__memp_pbh(fp, dbmp, bhp, data);
+	}
+
+	/* Display the LRU list of BH's. */
+	(void)fprintf(fp, "LRU list of BH's (pgno/offset):");
+	sep = "\n ";
+	for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+		(void)fprintf(fp, "%s%lu/%lu", sep,
+		    (u_long)bhp->pgno, (u_long)OFFSET(dbmp, bhp));
+		sep = ", ";
+	}
+	(void)fprintf(fp, "\n");
+}
+
+/*
+ * __memp_pmf --
+ *	Display an MPOOLFILE structure.
+ *
+ *	Prints the file's page and cache counters; with a non-zero data
+ *	argument it also prints the reference count, access method and
+ *	page size.
+ */
+static void
+__memp_pmf(fp, mfp, data)
+	FILE *fp;
+	MPOOLFILE *mfp;
+	int data;
+{
+	const char *how;
+
+	(void)fprintf(fp, " %lu pages created\n", mfp->stat.st_page_create);
+	(void)fprintf(fp, " %lu I/O's (%lu read, %lu written)\n",
+	    mfp->stat.st_page_in + mfp->stat.st_page_out,
+	    mfp->stat.st_page_in, mfp->stat.st_page_out);
+
+	/* Skip the hit rate until at least one lookup has been counted. */
+	if (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss != 0) {
+		(void)fprintf(fp,
+		    " %.0f%% cache hit rate (%lu hit, %lu miss)\n",
+		    ((double)mfp->stat.st_cache_hit /
+		    (mfp->stat.st_cache_hit + mfp->stat.st_cache_miss)) * 100,
+		    mfp->stat.st_cache_hit, mfp->stat.st_cache_miss);
+	}
+
+	if (data) {
+		how = mfp->can_mmap ? "mmap" : "read/write";
+		(void)fprintf(fp, " %d references; %s; pagesize: %lu\n",
+		    mfp->ref, how, (u_long)mfp->stat.st_pagesize);
+	}
+}
+
+/*
+ * __memp_pbh --
+ *	Display a BH structure.
+ *
+ *	Prints one line for the buffer header: its region offset, owning
+ *	file offset, page number, reference count and any of the
+ *	dirty / chk_write markers.  Prints nothing when data is zero.
+ */
+static void
+__memp_pbh(fp, dbmp, bhp, data)
+	FILE *fp;
+	DB_MPOOL *dbmp;
+	BH *bhp;
+	int data;
+{
+	const char *lead;
+
+	if (data == 0)
+		return;
+
+	(void)fprintf(fp, " BH @ %lu (mf: %lu): page %lu; ref %lu",
+	    (u_long)OFFSET(dbmp, bhp),
+	    (u_long)bhp->mf_offset, (u_long)bhp->pgno, (u_long)bhp->ref);
+
+	/* The first marker is introduced by "; ", later ones by ", ". */
+	lead = "; ";
+	if (F_ISSET(bhp, BH_DIRTY)) {
+		(void)fprintf(fp, "%sdirty", lead);
+		lead = ", ";
+	}
+	if (F_ISSET(bhp, BH_WRITE))
+		(void)fprintf(fp, "%schk_write", lead);
+
+	(void)fprintf(fp, "\n");
+}
diff --git a/db2/mp/mp_region.c b/db2/mp/mp_region.c
new file mode 100644
index 0000000000..a5c52123b9
--- /dev/null
+++ b/db2/mp/mp_region.c
@@ -0,0 +1,340 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_region.c 10.11 (Sleepycat) 8/2/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * __memp_ralloc --
+ *	Allocate some space in the mpool region.
+ *
+ *	Tries the region allocator first.  When that fails it, in order,
+ *	reuses an exact-size buffer from the free-buffer list, returns
+ *	free-list buffers to the allocator, and finally evicts unpinned
+ *	buffers from the LRU list (writing dirty ones), retrying the
+ *	allocation as space becomes available.
+ *
+ *	On success the address is stored through retp (a pointer to a
+ *	pointer), its region offset through offsetp when that is non-NULL,
+ *	and 0 is returned.  Otherwise the allocator's or the write's error
+ *	is returned.
+ *
+ * PUBLIC: int __memp_ralloc __P((DB_MPOOL *, size_t, size_t *, void *));
+ */
+int
+__memp_ralloc(dbmp, len, offsetp, retp)
+	DB_MPOOL *dbmp;
+	size_t len, *offsetp;
+	void *retp;
+{
+	BH *bhp, *nbhp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t fsize, total;
+	int nomore, restart, ret, wrote;
+	void *p;
+
+	mp = dbmp->mp;
+
+	nomore = 0;
+alloc:	if ((ret = __db_shalloc(dbmp->addr, len, MUTEX_ALIGNMENT, &p)) == 0) {
+		if (offsetp != NULL)
+			*offsetp = OFFSET(dbmp, p);
+		*(void **)retp = p;
+		return (0);
+	}
+	/* nomore is set once nothing is left to evict; report and give up. */
+	if (nomore) {
+		__db_err(dbmp->dbenv, "%s", strerror(ret));
+		return (ret);
+	}
+
+	/* Look for a buffer on the free list that's the right size. */
+	for (bhp =
+	    SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		if (__db_shsizeof(bhp) == len) {
+			SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh);
+			if (offsetp != NULL)
+				*offsetp = OFFSET(dbmp, bhp);
+			*(void **)retp = bhp;
+			return (0);
+		}
+	}
+
+	/* Discard from the free list until we've freed enough memory. */
+	total = 0;
+	for (bhp =
+	    SH_TAILQ_FIRST(&mp->bhfq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		/*
+		 * Record the chunk's size while it is still allocated; the
+		 * chunk must not be inspected after it has been freed.
+		 */
+		fsize = __db_shsizeof(bhp);
+
+		SH_TAILQ_REMOVE(&mp->bhfq, bhp, q, __bh);
+		__db_shalloc_free(dbmp->addr, bhp);
+
+		/*
+		 * Retry as soon as we've freed up sufficient space.  If we
+		 * have to coalesce memory to satisfy the request, don't
+		 * try until it's likely (possible?) that we'll succeed.
+		 */
+		total += fsize;
+		if (fsize >= len || total >= 3 * len)
+			goto alloc;
+	}
+
+retry:	/* Find a buffer we can flush; pure LRU. */
+	total = 0;
+	for (bhp =
+	    SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		/*
+		 * Only __memp_bhwrite sets restart, and it is only called
+		 * for dirty buffers; start each buffer from a known value
+		 * so the test after the free is defined for clean ones.
+		 */
+		restart = 0;
+
+		/* Ignore pinned or locked (I/O in progress) buffers. */
+		if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+			continue;
+
+		/* Find the associated MPOOLFILE. */
+		mfp = ADDR(dbmp, bhp->mf_offset);
+
+		/*
+		 * Write the page if it's dirty.
+		 *
+		 * If we wrote the page, fall through and free the buffer.  We
+		 * don't have to rewalk the list to acquire the buffer because
+		 * it was never available for any other process to modify it.
+		 * If we didn't write the page, but we discarded and reacquired
+		 * the region lock, restart the buffer list walk.  If we neither
+		 * wrote the buffer nor discarded the region lock, continue down
+		 * the buffer list.
+		 */
+		if (F_ISSET(bhp, BH_DIRTY)) {
+			if ((ret = __memp_bhwrite(dbmp,
+			    mfp, bhp, &restart, &wrote)) != 0)
+				return (ret);
+
+			/*
+			 * It's possible that another process wants this buffer
+			 * and incremented the ref count while we were writing
+			 * it.
+			 */
+			if (bhp->ref != 0)
+				goto retry;
+
+			if (wrote)
+				++mp->stat.st_rw_evict;
+			else {
+				if (restart)
+					goto retry;
+				else
+					continue;
+			}
+		} else
+			++mp->stat.st_ro_evict;
+
+		/*
+		 * Check to see if the buffer is the size we're looking for.
+		 * If it is, simply reuse it.
+		 */
+		total += fsize = __db_shsizeof(bhp);
+		if (fsize == len) {
+			__memp_bhfree(dbmp, mfp, bhp, 0);
+
+			if (offsetp != NULL)
+				*offsetp = OFFSET(dbmp, bhp);
+			*(void **)retp = bhp;
+			return (0);
+		}
+
+		/* Free the buffer. */
+		__memp_bhfree(dbmp, mfp, bhp, 1);
+
+		/*
+		 * Retry as soon as we've freed up sufficient space.  If we
+		 * have to coalesce memory to satisfy the request, don't
+		 * try until it's likely (possible?) that we'll succeed.
+		 */
+		if (fsize >= len || total >= 3 * len)
+			goto alloc;
+
+		/* Restart the walk if we discarded the region lock. */
+		if (restart)
+			goto retry;
+	}
+
+	/* Nothing left to evict: one last attempt, then fail. */
+	nomore = 1;
+	goto alloc;
+}
+
+/*
+ * __memp_ropen --
+ *	Attach to, and optionally create, the mpool region.
+ *
+ *	With DB_CREATE set, a new region is built: a private pool
+ *	(MP_ISPRIVATE) is malloc'd, any other pool goes through
+ *	__db_rcreate.  If creation reports EEXIST, or DB_CREATE is not set,
+ *	the existing region is joined with __db_ropen, retrying a couple of
+ *	times on EAGAIN.  On success dbmp->maddr, dbmp->mp, dbmp->addr,
+ *	dbmp->htab and dbmp->fd are filled in and 0 is returned; dbmp->fd
+ *	is -1 when no region file was opened.
+ *
+ * PUBLIC: int __memp_ropen
+ * PUBLIC:    __P((DB_MPOOL *, const char *, size_t, int, int));
+ */
+int
+__memp_ropen(dbmp, path, cachesize, mode, flags)
+	DB_MPOOL *dbmp;
+	const char *path;
+	size_t cachesize;
+	int mode, flags;
+{
+	MPOOL *mp;
+	size_t rlen;
+	int fd, newregion, ret, retry_cnt;
+
+	/*
+	 * Unlike other DB subsystems, mpool can't simply grow the region
+	 * because it returns pointers into the region to its clients.  To
+	 * "grow" the region, we'd have to allocate a new region and then
+	 * store a region number in the structures that reference regional
+	 * objects.  It's reasonable that we fail regardless, as clients
+	 * shouldn't have every page in the region pinned, so the only
+	 * "failure" mode should be a performance penalty because we don't
+	 * find a page in the cache that we'd like to have found.
+	 *
+	 * Up the user's cachesize by 25% to account for our overhead.
+	 */
+	if (cachesize < DB_CACHESIZE_MIN) {
+		if (cachesize == 0)
+			cachesize = DB_CACHESIZE_DEF;
+		else
+			cachesize = DB_CACHESIZE_MIN;
+	}
+	rlen = cachesize + cachesize / 4;
+
+	/*
+	 * The malloc path below never produces a descriptor; start from -1
+	 * so dbmp->fd and the error path see a defined value.
+	 */
+	fd = -1;
+
+	/* Map in the region. */
+	retry_cnt = newregion = 0;
+retry:	if (LF_ISSET(DB_CREATE)) {
+		/*
+		 * If it's a private mpool, use malloc, it's a lot faster than
+		 * instantiating a region.
+		 *
+		 * XXX
+		 * If we're doing locking and don't have spinlocks for this
+		 * architecture, we'd have to instantiate the file, we need
+		 * the file descriptor for locking.  However, it should not
+		 * be possible for DB_THREAD to be set if HAVE_SPINLOCKS aren't
+		 * defined.
+		 */
+		if (F_ISSET(dbmp, MP_ISPRIVATE))
+			ret = (dbmp->maddr = malloc(rlen)) == NULL ? ENOMEM : 0;
+		else
+			ret = __db_rcreate(dbmp->dbenv, DB_APP_NONE, path,
+			    DB_DEFAULT_MPOOL_FILE, mode, rlen, &fd,
+			    &dbmp->maddr);
+		if (ret == 0) {
+			/* Put the MPOOL structure first in the region. */
+			mp = dbmp->maddr;
+
+			SH_TAILQ_INIT(&mp->bhq);
+			SH_TAILQ_INIT(&mp->bhfq);
+			SH_TAILQ_INIT(&mp->mpfq);
+
+			/* Initialize the rest of the region as free space. */
+			dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL);
+			__db_shalloc_init(dbmp->addr, rlen - sizeof(MPOOL));
+
+			/*
+			 * Pretend that the cache will be broken up into 4K
+			 * pages, and that we want to keep it under, say, 10
+			 * pages on each chain.  This means a 256MB cache will
+			 * allocate ~6500 offset pairs.
+			 */
+			mp->htab_buckets =
+			    __db_tablesize((cachesize / (4 * 1024)) / 10);
+
+			/* Allocate hash table space and initialize it. */
+			if ((ret = __db_shalloc(dbmp->addr,
+			    mp->htab_buckets * sizeof(DB_HASHTAB),
+			    0, &dbmp->htab)) != 0)
+				goto err;
+			__db_hashinit(dbmp->htab, mp->htab_buckets);
+			mp->htab = OFFSET(dbmp, dbmp->htab);
+
+			memset(&mp->stat, 0, sizeof(mp->stat));
+			mp->stat.st_cachesize = cachesize;
+
+			mp->flags = 0;
+
+			newregion = 1;
+		} else if (ret != EEXIST)
+			return (ret);
+	}
+
+	/* If we didn't or couldn't create the region, try and join it. */
+	if (!newregion &&
+	    (ret = __db_ropen(dbmp->dbenv, DB_APP_NONE,
+	    path, DB_DEFAULT_MPOOL_FILE, 0, &fd, &dbmp->maddr)) != 0) {
+		/*
+		 * If we failed because the file wasn't available, wait a
+		 * second and try again.
+		 */
+		if (ret == EAGAIN && ++retry_cnt < 3) {
+			(void)__db_sleep(1, 0);
+			goto retry;
+		}
+		return (ret);
+	}
+
+	/* Set up the common pointers. */
+	dbmp->mp = dbmp->maddr;
+	dbmp->addr = (u_int8_t *)dbmp->maddr + sizeof(MPOOL);
+
+	/*
+	 * If not already locked, lock the region -- if it's a new region,
+	 * then either __db_rcreate() locked it for us or we malloc'd it
+	 * instead of creating a region, neither of which requires locking
+	 * here.
+	 */
+	if (!newregion)
+		LOCKREGION(dbmp);
+
+	/*
+	 * Get the hash table address; it's on the shared page, so we have
+	 * to lock first.
+	 */
+	dbmp->htab = ADDR(dbmp, dbmp->mp->htab);
+
+	dbmp->fd = fd;
+
+	/* If we locked the region, release it now. */
+	if (!F_ISSET(dbmp, MP_ISPRIVATE))
+		UNLOCKREGION(dbmp);
+	return (0);
+
+	/*
+	 * Error after the region was obtained.  A private region has no
+	 * descriptor, so it is released through __memp_rclose explicitly;
+	 * a shared one is closed only if a descriptor was handed back.
+	 */
+err:	if (F_ISSET(dbmp, MP_ISPRIVATE) || fd != -1) {
+		dbmp->fd = fd;
+		(void)__memp_rclose(dbmp);
+	}
+
+	/*
+	 * NOTE(review): newregion is still 0 on the only path that jumps to
+	 * err, so this unlink never runs as written -- confirm whether a
+	 * freshly created region file should be removed here.
+	 */
+	if (newregion)
+		(void)memp_unlink(path, 1, dbmp->dbenv);
+	return (ret);
+}
+
+/*
+ * __memp_rclose --
+ *	Close the mpool region.
+ *
+ *	A private region (MP_ISPRIVATE) is released with free() and 0 is
+ *	returned; any other region is handed to __db_rclose, whose result
+ *	is returned.
+ *
+ * PUBLIC: int __memp_rclose __P((DB_MPOOL *));
+ */
+int
+__memp_rclose(dbmp)
+	DB_MPOOL *dbmp;
+{
+	if (!F_ISSET(dbmp, MP_ISPRIVATE))
+		return (__db_rclose(dbmp->dbenv, dbmp->fd, dbmp->maddr));
+
+	free(dbmp->maddr);
+	return (0);
+}
diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c
new file mode 100644
index 0000000000..4f1205661a
--- /dev/null
+++ b/db2/mp/mp_sync.c
@@ -0,0 +1,205 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997
+ * Sleepycat Software. All rights reserved.
+ */
+#include "config.h"
+
+#ifndef lint
+static const char sccsid[] = "@(#)mp_sync.c 10.8 (Sleepycat) 7/2/97";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <errno.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "shqueue.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "common_ext.h"
+
+/*
+ * memp_sync --
+ *	Mpool sync function.
+ *
+ *	Marks every dirty or pinned buffer as needing a write for the
+ *	checkpoint at *lsnp and writes those that are neither pinned nor
+ *	locked.
+ *
+ *	Returns 0 when nothing remains to be written, DB_INCOMPLETE while
+ *	marked buffers are still outstanding, EINVAL when the pool has no
+ *	environment or the environment has no logging information, EPERM
+ *	when a buffer could not be flushed, or the error from the write.
+ */
+int
+memp_sync(dbmp, lsnp)
+	DB_MPOOL *dbmp;
+	DB_LSN *lsnp;
+{
+	BH *bhp;
+	DB_ENV *dbenv;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	int can_write, wrote, lsn_cnt, restart, ret;
+
+	dbenv = dbmp->dbenv;
+
+	/* A pool may be opened without an environment; that is an error here. */
+	if (dbenv == NULL || dbenv->lg_info == NULL) {
+		__db_err(dbenv, "memp_sync requires logging");
+		return (EINVAL);
+	}
+
+	LOCKREGION(dbmp);
+
+	/*
+	 * If the application is asking about a previous call, and we haven't
+	 * found any buffers that the application holding the pin couldn't
+	 * write, return yes or no based on the current count.  Note, if the
+	 * application is asking about a LSN *smaller* than one we've already
+	 * handled, then we return based on the count for that LSN.
+	 */
+	mp = dbmp->mp;
+	if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
+		if (mp->lsn_cnt == 0) {
+			*lsnp = mp->lsn;
+			ret = 0;
+		} else
+			ret = DB_INCOMPLETE;
+
+		UNLOCKREGION(dbmp);
+		return (ret);
+	}
+
+	/* Else, it's a new checkpoint. */
+	F_CLR(mp, MP_LSN_RETRY);
+
+	/*
+	 * Save the LSN.  We know that it's a new LSN or larger than the one
+	 * for which we were already doing a checkpoint.  (BTW, I don't expect
+	 * to see multiple LSN's from the same or multiple processes, but You
+	 * Just Never Know.  Responding as if they all called with the largest
+	 * of the LSNs specified makes everything work.)
+	 *
+	 * We don't currently use the LSN we save.  We could potentially save
+	 * the last-written LSN in each buffer header and use it to determine
+	 * what buffers need to be written.  The problem with this is that it's
+	 * sizeof(LSN) more bytes of buffer header.  We currently write all the
+	 * dirty buffers instead.
+	 *
+	 * Walk the list of shared memory segments clearing the count of
+	 * buffers waiting to be written.
+	 */
+	mp->lsn = *lsnp;
+	mp->lsn_cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		mfp->lsn_cnt = 0;
+
+	/*
+	 * Walk the list of buffers and mark all dirty buffers to be written
+	 * and all pinned buffers to be potentially written.  We do this in a
+	 * single fell swoop while holding the region locked so that processes
+	 * can't make new buffers dirty, causing us to never finish.  Since
+	 * the application may have restarted the sync, clear any BH_WRITE
+	 * flags that appear to be left over.
+	 */
+	can_write = lsn_cnt = 0;
+	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+		if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
+			F_SET(bhp, BH_WRITE);
+
+			/* An unpinned marked buffer can be written below. */
+			if (bhp->ref == 0)
+				can_write = 1;
+
+			mfp = ADDR(dbmp, bhp->mf_offset);
+			++mfp->lsn_cnt;
+
+			++lsn_cnt;
+		} else
+			F_CLR(bhp, BH_WRITE);
+
+	mp->lsn_cnt = lsn_cnt;
+
+	/*
+	 * If there are no buffers we can write, we're done.  The result is
+	 * taken from the shared count before the region lock is released.
+	 */
+	if (!can_write) {
+		ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+		goto err;
+	}
+
+	/*
+	 * Write any buffers that we can.  Restart the walk after each write,
+	 * __memp_pgwrite() discards and reacquires the region lock during I/O.
+	 */
+retry:	for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+		/* Ignore unmarked, pinned or locked buffers. */
+		if (!F_ISSET(bhp, BH_WRITE) ||
+		    bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+			continue;
+
+		mfp = ADDR(dbmp, bhp->mf_offset);
+		if ((ret =
+		    __memp_bhwrite(dbmp, mfp, bhp, &restart, &wrote)) != 0)
+			goto err;
+		if (wrote) {
+			if (restart)
+				goto retry;
+			continue;
+		}
+		__db_err(dbenv, "%s: unable to flush page: %lu",
+		    ADDR(dbmp, mfp->path_off), (u_long)bhp->pgno);
+		ret = EPERM;
+		goto err;
+	}
+	ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+
+err:	UNLOCKREGION(dbmp);
+	return (ret);
+}
+
+/*
+ * memp_fsync --
+ *	Mpool file sync function.
+ *
+ *	Writes the dirty buffers of the file behind dbmfp that are neither
+ *	pinned nor locked.  Handles flagged MP_PATH_TEMP are skipped.
+ *
+ *	Returns 0 when every dirty buffer of the file was written,
+ *	DB_INCOMPLETE when some had to be left behind, or the error from
+ *	__memp_pgwrite.  The region lock is released on every return path.
+ */
+int
+memp_fsync(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	size_t mf_offset;
+	int pincnt, restart, ret, wrote;
+
+	/* We don't sync temporary files -- what's the use? */
+	if (F_ISSET(dbmfp, MP_PATH_TEMP))
+		return (0);
+
+	dbmp = dbmfp->dbmp;
+	ret = 0;
+
+	mf_offset = OFFSET(dbmp, dbmfp->mfp);
+
+	LOCKREGION(dbmp);
+
+	/*
+	 * Walk the list of buffer headers for the MPOOLFILE, and write out any
+	 * dirty buffers that we can.  pincnt counts the dirty buffers that
+	 * could not be written on this pass; it restarts with the walk.
+	 */
+retry:	pincnt = 0;
+	for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+		if (F_ISSET(bhp, BH_DIRTY) && bhp->mf_offset == mf_offset) {
+			if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
+				++pincnt;
+				continue;
+			}
+			if ((ret =
+			    __memp_pgwrite(dbmfp, bhp, &restart, &wrote)) != 0)
+				goto err;
+			if (!wrote)
+				++pincnt;
+			if (restart)
+				goto retry;
+		}
+
+	/* Success and failure both leave through here to drop the lock. */
+err:	UNLOCKREGION(dbmp);
+
+	return (ret == 0 ? (pincnt ? DB_INCOMPLETE : 0) : ret);
+}