Diffstat (limited to 'db2/mp/mp_sync.c')
-rw-r--r--  db2/mp/mp_sync.c | 166
1 file changed, 101 insertions, 65 deletions
diff --git a/db2/mp/mp_sync.c b/db2/mp/mp_sync.c
index 33218eef1a..535348517c 100644
--- a/db2/mp/mp_sync.c
+++ b/db2/mp/mp_sync.c
@@ -7,7 +7,7 @@
 #include "config.h"
 
 #ifndef lint
-static const char sccsid[] = "@(#)mp_sync.c 10.25 (Sleepycat) 4/26/98";
+static const char sccsid[] = "@(#)mp_sync.c 10.31 (Sleepycat) 12/11/98";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -39,9 +39,12 @@ memp_sync(dbmp, lsnp)
     DB_ENV *dbenv;
     MPOOL *mp;
     MPOOLFILE *mfp;
-    int ar_cnt, cnt, nalloc, next, ret, wrote;
+    int ar_cnt, nalloc, next, maxpin, ret, wrote;
+
+    MP_PANIC_CHECK(dbmp);
 
     dbenv = dbmp->dbenv;
+    mp = dbmp->mp;
 
     if (dbenv->lg_info == NULL) {
         __db_err(dbenv, "memp_sync: requires logging");
@@ -49,16 +52,19 @@ memp_sync(dbmp, lsnp)
     }
 
     /*
-     * We try and write the buffers in page order so that the underlying
-     * filesystem doesn't have to seek and can write contiguous blocks,
-     * plus, we don't want to hold the region lock while we write the
-     * buffers. Get memory to hold the buffer pointers. Get a good-size
-     * block, too, because we realloc while holding the region lock if we
-     * run out.
+     * We try and write the buffers in page order: it should reduce seeks
+     * by the underlying filesystem and possibly reduce the actual number
+     * of writes. We don't want to hold the region lock while we write
+     * the buffers, so only hold it lock while we create a list. Get a
+     * good-size block of memory to hold buffer pointers, we don't want
+     * to run out.
      */
-    if ((bharray =
-        (BH **)__db_malloc((nalloc = 1024) * sizeof(BH *))) == NULL)
-        return (ENOMEM);
+    LOCKREGION(dbmp);
+    nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10;
+    UNLOCKREGION(dbmp);
+
+    if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0)
+        return (ret);
 
     LOCKREGION(dbmp);
 
@@ -70,7 +76,6 @@ memp_sync(dbmp, lsnp)
      * we've already handled or are currently handling, then we return a
      * result based on the count for the larger LSN.
      */
-    mp = dbmp->mp;
     if (!F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
         if (mp->lsn_cnt == 0) {
             *lsnp = mp->lsn;
@@ -114,10 +119,15 @@ memp_sync(dbmp, lsnp)
      * finish. Since the application may have restarted the sync, clear
      * any BH_WRITE flags that appear to be left over from previous calls.
      *
+     * We don't want to pin down the entire buffer cache, otherwise we'll
+     * starve threads needing new pages. Don't pin down more than 80% of
+     * the cache.
+     *
      * Keep a count of the total number of buffers we need to write in
      * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
      */
     ar_cnt = 0;
+    maxpin = ((mp->stat.st_page_dirty + mp->stat.st_page_clean) * 8) / 10;
     for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
         bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
         if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
@@ -130,19 +140,27 @@ memp_sync(dbmp, lsnp)
 
             /*
              * If the buffer isn't in use, we should be able to
-             * write it immediately, so save a reference to it.
+             * write it immediately, so increment the reference
+             * count to lock it and its contents down, and then
+             * save a reference to it.
+             *
+             * If we've run out space to store buffer references,
+             * we're screwed. We don't want to realloc the array
+             * while holding a region lock, so we set the flag to
+             * force the checkpoint to be done again, from scratch,
+             * later.
+             *
+             * If we've pinned down too much of the cache stop, and
+             * set a flag to force the checkpoint to be tried again
+             * later.
              */
             if (bhp->ref == 0) {
-                if (ar_cnt == nalloc) {
-                    nalloc *= 2;
-                    if ((bharray =
-                        (BH **)__db_realloc(bharray,
-                        nalloc * sizeof(BH *))) == NULL) {
-                        ret = ENOMEM;
-                        goto err;
-                    }
+                ++bhp->ref;
+                bharray[ar_cnt] = bhp;
+                if (++ar_cnt >= nalloc || ar_cnt >= maxpin) {
+                    F_SET(mp, MP_LSN_RETRY);
+                    break;
                 }
-                bharray[ar_cnt++] = bhp;
             }
         } else
             if (F_ISSET(bhp, BH_WRITE))
@@ -154,10 +172,6 @@ memp_sync(dbmp, lsnp)
         goto done;
     }
 
-    /* Lock down the buffers and their contents. */
-    for (cnt = 0; cnt < ar_cnt; ++cnt)
-        ++bharray[cnt]->ref;
-
     UNLOCKREGION(dbmp);
 
     /* Sort the buffers we're going to write. */
@@ -205,7 +219,8 @@ memp_sync(dbmp, lsnp)
             goto err;
         }
     }
-    ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+    ret = mp->lsn_cnt != 0 ||
+        F_ISSET(mp, MP_LSN_RETRY) ? DB_INCOMPLETE : 0;
 
 done:
     if (0) {
@@ -224,7 +239,7 @@ err: /*
         F_CLR(bhp, BH_WRITE);
     }
 
     UNLOCKREGION(dbmp);
-    __db_free(bharray);
+    __os_free(bharray, nalloc * sizeof(BH *));
     return (ret);
 }
@@ -241,6 +256,8 @@ memp_fsync(dbmfp)
 
     dbmp = dbmfp->dbmp;
 
+    MP_PANIC_CHECK(dbmp);
+
     /*
      * If this handle doesn't have a file descriptor that's open for
      * writing, or if the file is a temporary, there's no reason to
@@ -300,25 +317,29 @@ __memp_fsync(dbmfp)
 {
     BH *bhp, **bharray;
     DB_MPOOL *dbmp;
+    MPOOL *mp;
     size_t mf_offset;
-    int ar_cnt, cnt, nalloc, next, pincnt, ret, wrote;
+    int ar_cnt, incomplete, nalloc, next, ret, wrote;
 
     ret = 0;
     dbmp = dbmfp->dbmp;
+    mp = dbmp->mp;
     mf_offset = R_OFFSET(dbmp, dbmfp->mfp);
 
     /*
-     * We try and write the buffers in page order so that the underlying
-     * filesystem doesn't have to seek and can write contiguous blocks,
-     * plus, we don't want to hold the region lock while we write the
-     * buffers. Get memory to hold the buffer pointers. Get a good-size
-     * block, too, because we realloc while holding the region lock if we
-     * run out.
+     * We try and write the buffers in page order: it should reduce seeks
+     * by the underlying filesystem and possibly reduce the actual number
+     * of writes. We don't want to hold the region lock while we write
+     * the buffers, so only hold it lock while we create a list. Get a
+     * good-size block of memory to hold buffer pointers, we don't want
+     * to run out.
      */
-    nalloc = 1024;
-    if ((bharray =
-        (BH **)__db_malloc((size_t)nalloc * sizeof(BH *))) == NULL)
-        return (ENOMEM);
+    LOCKREGION(dbmp);
+    nalloc = mp->stat.st_page_dirty + mp->stat.st_page_dirty / 2 + 10;
+    UNLOCKREGION(dbmp);
+
+    if ((ret = __os_malloc(nalloc * sizeof(BH *), NULL, &bharray)) != 0)
+        return (ret);
 
     LOCKREGION(dbmp);
 
@@ -326,36 +347,37 @@ __memp_fsync(dbmfp)
      * Walk the LRU list of buffer headers, and get a list of buffers to
      * write for this MPOOLFILE.
      */
-    ar_cnt = pincnt = 0;
-    for (bhp = SH_TAILQ_FIRST(&dbmp->mp->bhq, __bh);
+    ar_cnt = incomplete = 0;
+    for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
         bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
         if (!F_ISSET(bhp, BH_DIRTY) || bhp->mf_offset != mf_offset)
             continue;
         if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
-            ++pincnt;
+            incomplete = 1;
             continue;
         }
 
-        if (ar_cnt == nalloc) {
-            nalloc *= 2;
-            if ((bharray = (BH **)__db_realloc(bharray,
-                nalloc * sizeof(BH *))) == NULL) {
-                ret = ENOMEM;
-                goto err;
-            }
-        }
+        ++bhp->ref;
+        bharray[ar_cnt] = bhp;
 
-        bharray[ar_cnt++] = bhp;
+        /*
+         * If we've run out space to store buffer references, we're
+         * screwed, as we don't want to realloc the array holding a
+         * region lock. Set the incomplete flag -- the only way we
+         * can get here is if the file is active in the buffer cache,
+         * which is the same thing as finding pinned buffers.
+         */
+        if (++ar_cnt >= nalloc) {
+            incomplete = 1;
+            break;
+        }
     }
 
-    /* Lock down the buffers and their contents. */
-    for (cnt = 0; cnt < ar_cnt; ++cnt)
-        ++bharray[cnt]->ref;
-
     UNLOCKREGION(dbmp);
 
     /* Sort the buffers we're going to write. */
-    qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+    if (ar_cnt != 0)
+        qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
 
     LOCKREGION(dbmp);
 
@@ -365,11 +387,10 @@ __memp_fsync(dbmfp)
          * It's possible for a thread to have gotten the buffer since
          * we listed it for writing. If the reference count is still
         * 1, we're the only ones using the buffer, go ahead and write.
-         * If it's >1, then skip the buffer and assume that it will be
-         * written when it's returned to the cache.
+         * If it's >1, then skip the buffer.
         */
         if (bharray[next]->ref > 1) {
-            ++pincnt;
+            incomplete = 1;
 
             --bharray[next]->ref;
             continue;
@@ -387,13 +408,18 @@ __memp_fsync(dbmfp)
             --bharray[next]->ref;
             goto err;
         }
+
+        /*
+         * If we didn't write the buffer for some reason, don't return
+         * success.
+         */
         if (!wrote)
-            ++pincnt;
+            incomplete = 1;
     }
 
 err: UNLOCKREGION(dbmp);
-    __db_free(bharray);
+    __os_free(bharray, nalloc * sizeof(BH *));
 
     /*
     * Sync the underlying file as the last thing we do, so that the OS
@@ -404,7 +430,7 @@ err: UNLOCKREGION(dbmp);
     * issues.
     */
     if (ret == 0)
-        return (pincnt == 0 ? __db_fsync(dbmfp->fd) : DB_INCOMPLETE);
+        return (incomplete ? DB_INCOMPLETE : __os_fsync(dbmfp->fd));
 
     return (ret);
 }
@@ -423,6 +449,8 @@ memp_trickle(dbmp, pct, nwrotep)
     u_long total;
     int ret, wrote;
 
+    MP_PANIC_CHECK(dbmp);
+
     mp = dbmp->mp;
     if (nwrotep != NULL)
         *nwrotep = 0;
@@ -487,7 +515,7 @@ loop: total = mp->stat.st_page_clean + mp->stat.st_page_dirty;
     }
 
     /* No more buffers to write. */
-    return (0);
+    ret = 0;
 
 err: UNLOCKREGION(dbmp);
     return (ret);
@@ -508,6 +536,14 @@ __bhcmp(p1, p2)
     if (bhp1->mf_offset > bhp2->mf_offset)
         return (1);
 
-    /* Sort by page in file. */
-    return (bhp1->pgno < bhp2->pgno ? -1 : 1);
+    /*
+     * !!!
+     * Defend against badly written quicksort code calling the comparison
+     * function with two identical pointers (e.g., WATCOM C++ (Power++)).
+     */
+    if (bhp1->pgno < bhp2->pgno)
+        return (-1);
+    if (bhp1->pgno > bhp2->pgno)
+        return (1);
+    return (0);
 }
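The memp_sync() hunks above replace the realloc-under-the-region-lock scheme with a pre-sized array (1.5 times the dirty-page count plus slop) and a cap that never pins more than 80% of the cache, setting MP_LSN_RETRY so the checkpoint reports DB_INCOMPLETE and is retried later. The following is a minimal, self-contained sketch of that pinning logic only, not code from the DB tree: the struct buf and struct cache_stat types, collect_dirty(), array_size(), and the retryp flag are hypothetical stand-ins for BH, the MPOOL statistics, and MP_LSN_RETRY, and the handling of buffers that are dirty but currently referenced is simplified away.

#include <stddef.h>

/* Hypothetical, simplified stand-ins for the mpool statistics and a buffer header. */
struct cache_stat {
    int st_page_clean;          /* clean pages currently in the cache */
    int st_page_dirty;          /* dirty pages currently in the cache */
};

struct buf {
    int dirty;                  /* page needs to be written */
    int ref;                    /* pin (reference) count */
    struct buf *next;           /* LRU chain */
};

/*
 * Size the pointer array the way the patch does: 1.5 times the current
 * dirty-page count plus slop, so we never have to realloc while holding
 * the region lock.
 */
static int
array_size(const struct cache_stat *sp)
{
    return (sp->st_page_dirty + sp->st_page_dirty / 2 + 10);
}

/*
 * Collect dirty, unreferenced buffers from the LRU list into "array",
 * pinning each one, but never pin more than 80% of the cache.  If either
 * limit is hit, set *retryp so the caller can report an incomplete
 * checkpoint and try again later.
 */
static int
collect_dirty(struct buf *lru, const struct cache_stat *sp,
    struct buf **array, int nalloc, int *retryp)
{
    struct buf *bp;
    int ar_cnt, maxpin;

    *retryp = 0;
    maxpin = ((sp->st_page_dirty + sp->st_page_clean) * 8) / 10;

    ar_cnt = 0;
    for (bp = lru; bp != NULL; bp = bp->next) {
        if (!bp->dirty || bp->ref != 0)
            continue;
        ++bp->ref;                      /* pin the buffer */
        array[ar_cnt] = bp;
        if (++ar_cnt >= nalloc || ar_cnt >= maxpin) {
            *retryp = 1;                /* stop early; caller retries later */
            break;
        }
    }
    return (ar_cnt);
}

The caller would size the array with array_size() under the region lock, release the lock before writing the pinned buffers, and return an "incomplete" status whenever *retryp was set, mirroring how the patched memp_sync() maps MP_LSN_RETRY to DB_INCOMPLETE.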
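The final hunk rewrites __bhcmp() from a two-way ternary into a full three-way comparison so that a qsort() implementation which compares an element against itself (or two equal keys) gets 0 back instead of an inconsistent -1/1. A standalone illustration of that comparator pattern, under assumed names, is sketched below: struct page_ref, page_cmp(), and the fields are hypothetical stand-ins for BH, __bhcmp(), mf_offset, and pgno; only qsort() and printf() are the standard C library calls.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for a buffer header: owning file plus page number. */
struct page_ref {
    unsigned long mf_offset;    /* which file the page belongs to */
    unsigned long pgno;         /* page number within that file */
};

/*
 * Total-order comparison for qsort(): group pages by file, then sort by
 * page number, and return 0 for equal keys so a quicksort that compares
 * an element with itself gets a consistent answer.
 */
static int
page_cmp(const void *p1, const void *p2)
{
    const struct page_ref *a = *(const struct page_ref * const *)p1;
    const struct page_ref *b = *(const struct page_ref * const *)p2;

    if (a->mf_offset < b->mf_offset)
        return (-1);
    if (a->mf_offset > b->mf_offset)
        return (1);
    if (a->pgno < b->pgno)
        return (-1);
    if (a->pgno > b->pgno)
        return (1);
    return (0);
}

int
main(void)
{
    struct page_ref pages[] = {
        { 1, 7 }, { 0, 3 }, { 1, 2 }, { 0, 3 }
    };
    struct page_ref *array[4];
    int i;

    for (i = 0; i < 4; ++i)
        array[i] = &pages[i];

    /* Sort an array of pointers, as mp_sync.c does with its BH array. */
    qsort(array, 4, sizeof(array[0]), page_cmp);

    for (i = 0; i < 4; ++i)
        printf("file %lu page %lu\n",
            array[i]->mf_offset, array[i]->pgno);
    return (0);
}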