aboutsummaryrefslogtreecommitdiff
path: root/db2/mp/mp_sync.c
diff options
context:
space:
mode:
Diffstat (limited to 'db2/mp/mp_sync.c')
-rw-r--r--db2/mp/mp_sync.c166
1 files changed, 101 insertions, 65 deletions
diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c
index 33218eef1a..535348517c 100644
--- a/db2/mp/mp_sync.c
+++ b/db2/mp/mp_sync.c
@@ -7,7 +7,7 @@
#include "config.h"
#ifndef lint
-static const char sccsid[] = "@(#)mp_sync.c 10.25 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)mp_sync.c 10.31 (Sleepycat) 12/11/98";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -39,9 +39,12 @@ memp_sync(dbmp, lsnp)
DB_ENV *dbenv;
MPOOL *mp;
MPOOLFILE *mfp;
- int ar_cnt, cnt, nalloc, next, ret, wrote;
+ int ar_cnt, nalloc, next, maxpin, ret, wrote;
+
+ MP_PANIC_CHECK(dbmp);
dbenv = dbmp->dbenv;
+ mp = dbmp->mp;
if (dbenv->lg_info == NULL) {
__db_err(dbenv, "memp_sync: requires logging");
@@ -49,16 +52,19 @@ memp_sync(dbmp, lsnp)
}
/*
- * We try and write the buffers in page order so that the underlying
- * filesystem doesn't have to seek and can write contiguous blocks,
- * plus, we don't want to hold the region lock while we write the
- * buffers. Get memory to hold the buffer pointers. Get a good-size
- * block, too, because we realloc while holding the region lock if we
- * run out.
+ * We try and write the buffers in page order: it should reduce seeks
+ * by the underlying filesystem and possibly reduce the actual number
+ * of writes. We don't want to hold the region lock while we write
+ * the buffers, so only hold the lock while we create a list. Get a
+ * good-size block of memory to hold buffer pointers, we don't want
+ * to run out.
*/
- if ((bharray =
- (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
- return (ENOMEM);
+ LOCKREGION(dbmp);
+ nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10;
+ UNLOCKREGION(dbmp);
+
+ if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0)
+ return (ret);
LOCKREGION(dbmp);
@@ -70,7 +76,6 @@ memp_sync(dbmp, lsnp)
* we've already handled or are currently handling, then we return a
* result based on the count for the larger LSN.
*/
- mp = dbmp->mp;
if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
if (mp->lsn_cnt == 0) {
*lsnp = mp->lsn;
@@ -114,10 +119,15 @@ memp_sync(dbmp, lsnp)
* finish. Since the application may have restarted the sync, clear
* any BH_WRITE flags that appear to be left over from previous calls.
*
+ * We don't want to pin down the entire buffer cache, otherwise we'll
+ * starve threads needing new pages. Don't pin down more than 80% of
+ * the cache.
+ *
* Keep a count of the total number of buffers we need to write in
* MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
*/
ar_cnt = 0;
+ maxpin = ((mp->stat.st_page_dirty + mp->stat.st_page_clean) * 8) / 10;
for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
@@ -130,19 +140,27 @@ memp_sync(dbmp, lsnp)
/*
* If the buffer isn't in use, we should be able to
- * write it immediately, so save a reference to it.
+ * write it immediately, so increment the reference
+ * count to lock it and its contents down, and then
+ * save a reference to it.
+ *
+ * If we've run out of space to store buffer references,
+ * we're screwed. We don't want to realloc the array
+ * while holding a region lock, so we set the flag to
+ * force the checkpoint to be done again, from scratch,
+ * later.
+ *
+ * If we've pinned down too much of the cache, stop, and
+ * set a flag to force the checkpoint to be tried again
+ * later.
*/
if (bhp->ref == 0) {
- if (ar_cnt == nalloc) {
- nalloc *= 2;
- if ((bharray =
- (BH **)__db_realloc(bharray,
- nalloc * sizeof(BH *))) == NULL) {
- ret = ENOMEM;
- goto err;
- }
+ ++bhp->ref;
+ bharray[ar_cnt] = bhp;
+ if (++ar_cnt >= nalloc || ar_cnt >= maxpin) {
+ F_SET(mp, MP_LSN_RETRY);
+ break;
}
- bharray[ar_cnt++] = bhp;
}
} else
if (F_ISSET(bhp, BH_WRITE))
@@ -154,10 +172,6 @@ memp_sync(dbmp, lsnp)
goto done;
}
- /* Lock down the buffers and their contents. */
- for (cnt = 0; cnt < ar_cnt; ++cnt)
- ++bharray[cnt]->ref;
-
UNLOCKREGION(dbmp);
/* Sort the buffers we're going to write. */
@@ -205,7 +219,8 @@ memp_sync(dbmp, lsnp)
goto err;
}
}
- ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+ ret = mp->lsn_cnt != 0 ||
+ F_ISSET(mp, MP_LSN_RETRY) ? DB_INCOMPLETE : 0;
done:
if (0) {
@@ -224,7 +239,7 @@ err: /*
F_CLR(bhp, BH_WRITE);
}
UNLOCKREGION(dbmp);
- __db_free(bharray);
+ __os_free(bharray, nalloc * sizeof(BH *));
return (ret);
}
@@ -241,6 +256,8 @@ memp_fsync(dbmfp)
dbmp = dbmfp->dbmp;
+ MP_PANIC_CHECK(dbmp);
+
/*
* If this handle doesn't have a file descriptor that's open for
* writing, or if the file is a temporary, there's no reason to
@@ -300,25 +317,29 @@ __memp_fsync(dbmfp)
{
BH *bhp, **bharray;
DB_MPOOL *dbmp;
+ MPOOL *mp;
size_t mf_offset;
- int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote;
+ int ar_cnt, incomplete, nalloc, next, ret, wrote;
ret = 0;
dbmp = dbmfp->dbmp;
+ mp = dbmp->mp;
mf_offset = R_OFFSET(dbmp, dbmfp->mfp);
/*
- * We try and write the buffers in page order so that the underlying
- * filesystem doesn't have to seek and can write contiguous blocks,
- * plus, we don't want to hold the region lock while we write the
- * buffers. Get memory to hold the buffer pointers. Get a good-size
- * block, too, because we realloc while holding the region lock if we
- * run out.
+ * We try and write the buffers in page order: it should reduce seeks
+ * by the underlying filesystem and possibly reduce the actual number
+ * of writes. We don't want to hold the region lock while we write
+ * the buffers, so only hold the lock while we create a list. Get a
+ * good-size block of memory to hold buffer pointers, we don't want
+ * to run out.
*/
- nalloc = 1024;
- if ((bharray =
- (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
- return (ENOMEM);
+ LOCKREGION(dbmp);
+ nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10;
+ UNLOCKREGION(dbmp);
+
+ if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0)
+ return (ret);
LOCKREGION(dbmp);
@@ -326,36 +347,37 @@ __memp_fsync(dbmfp)
* Walk the LRU list of buffer headers, and get a list of buffers to
* write for this MPOOLFILE.
*/
- ar_cnt = pincnt = 0;
- for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+ ar_cnt = incomplete = 0;
+ for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
continue;
if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
- ++pincnt;
+ incomplete = 1;
continue;
}
- if (ar_cnt == nalloc) {
- nalloc *= 2;
- if ((bharray = (BH **)__db_realloc(bharray,
- nalloc * sizeof(BH *))) == NULL) {
- ret = ENOMEM;
- goto err;
- }
- }
+ ++bhp->ref;
+ bharray[ar_cnt] = bhp;
- bharray[ar_cnt++] = bhp;
+ /*
+ * If we've run out of space to store buffer references, we're
+ * screwed, as we don't want to realloc the array while holding a
+ * region lock. Set the incomplete flag -- the only way we
+ * can get here is if the file is active in the buffer cache,
+ * which is the same thing as finding pinned buffers.
+ */
+ if (++ar_cnt >= nalloc) {
+ incomplete = 1;
+ break;
+ }
}
- /* Lock down the buffers and their contents. */
- for (cnt = 0; cnt < ar_cnt; ++cnt)
- ++bharray[cnt]->ref;
-
UNLOCKREGION(dbmp);
/* Sort the buffers we're going to write. */
- qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+ if (ar_cnt != 0)
+ qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
LOCKREGION(dbmp);
@@ -365,11 +387,10 @@ __memp_fsync(dbmfp)
* It's possible for a thread to have gotten the buffer since
* we listed it for writing. If the reference count is still
* 1, we're the only ones using the buffer, go ahead and write.
- * If it's >1, then skip the buffer and assume that it will be
- * written when it's returned to the cache.
+ * If it's >1, then skip the buffer.
*/
if (bharray[next]->ref > 1) {
- ++pincnt;
+ incomplete = 1;
--bharray[next]->ref;
continue;
@@ -387,13 +408,18 @@ __memp_fsync(dbmfp)
--bharray[next]->ref;
goto err;
}
+
+ /*
+ * If we didn't write the buffer for some reason, don't return
+ * success.
+ */
if (!wrote)
- ++pincnt;
+ incomplete = 1;
}
err: UNLOCKREGION(dbmp);
- __db_free(bharray);
+ __os_free(bharray, nalloc * sizeof(BH *));
/*
* Sync the underlying file as the last thing we do, so that the OS
@@ -404,7 +430,7 @@ err: UNLOCKREGION(dbmp);
* issues.
*/
if (ret == 0)
- return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
+ return (incomplete ? DB_INCOMPLETE : __os_fsync(dbmfp->fd));
return (ret);
}
@@ -423,6 +449,8 @@ memp_trickle(dbmp, pct, nwrotep)
u_long total;
int ret, wrote;
+ MP_PANIC_CHECK(dbmp);
+
mp = dbmp->mp;
if (nwrotep != NULL)
*nwrotep = 0;
@@ -487,7 +515,7 @@ loop: total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
}
/* No more buffers to write. */
- return (0);
+ ret = 0;
err: UNLOCKREGION(dbmp);
return (ret);
@@ -508,6 +536,14 @@ __bhcmp(p1, p2)
if (bhp1->mf_offset > bhp2->mf_offset)
return (1);
- /* Sort by page in file. */
- return (bhp1->pgno < bhp2->pgno ? -1 : 1);
+ /*
+ * !!!
+ * Defend against badly written quicksort code calling the comparison
+ * function with two identical pointers (e.g., WATCOM C++ (Power++)).
+ */
+ if (bhp1->pgno < bhp2->pgno)
+ return (-1);
+ if (bhp1->pgno > bhp2->pgno)
+ return (1);
+ return (0);
}