mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2024-12-27 03:19:35 +03:00
Merge branch 'arc-changes'
This stack of patches has been empirically shown to drastically improve the hit rate of the ARC for certain workloads. As a result, fewer reads to disk are required, which is generally a good thing and can drastically improve performance if the workload is disk limited. For the impatient, I'll summarize the results of the tests performed: * Test 1 - Creating many empty directories. This test saw 99.9% fewer reads and 12.8% more inodes created when running *with* these changes. * Test 2 - Creating many empty files. This test saw 4% fewer reads and 0% more inodes created when running *with* these changes. * Test 3 - Creating many 4 KiB files. This test saw 96.7% fewer reads and 4.9% more inodes created when running *with* these changes. * Test 4 - Creating many 4096 KiB files. This test saw 99.4% fewer reads and 0% more inodes created (but took 6.9% fewer seconds to complete) when running *with* these changes. * Test 5 - Rsync'ing a dataset with many empty directories. This test saw 36.2% fewer reads and 66.2% more inodes created when running *with* these changes. * Test 6 - Rsync'ing a dataset with many empty files. This test saw 30.9% fewer reads and 0% more inodes created (but took 24.3% fewer seconds to complete) when running *with* these changes. * Test 7 - Rsync'ing a dataset with many 4 KiB files. This test saw 30.8% fewer reads and 173.3% more inodes created when running *with* these changes. For the patient, the following consists of a more detailed description of the tests performed and the results gathered. All the tests were run using identical machines, each with a pool consisting of 5 mirror pairs with 2TB 7200 RPM disks. Each test was run twice, once *without* this set of patches and again *with* this set of patches to highlight the performance changes introduced. The first four workloads tested were: ** NOTE: None of these tests were run to completion. They ran for a set amount of time and then were terminated or hit ENOSPC. 1. 
Creating many empty directories: * fdtree -d 10 -l 8 -s 0 -f 0 -C -> 111,111,111 Directories -> 0 Files -> 0 KiB File Data 2. Creating many empty files: * fdtree -d 10 -l 5 -s 0 -f 10000 -C -> 111,111 Directories -> 1,111,110,000 Files -> 0 KiB File Data 3. Creating many 4 KiB files: * fdtree -d 10 -l 5 -s 1 -f 10000 -C -> 111,111 Directories -> 1,111,110,000 Files -> 4,444,440,000 KiB File Data 4. Creating many 4096 KiB files: * fdtree -d 10 -l 5 -s 1024 -f 10000 -C -> 111,111 Directories -> 1,111,110,000 Files -> 4,551,106,560,000 KiB File Data Results for these first four tests are below: | Time (s) | inodes | reads | writes | --+----------+----------+--------+-----------+ Test 1 Before | 65069 | 37845363 | 831975 | 3214646 | Test 1 After | 65069 | 42703608 | 778 | 3327674 | --+----------+----------+--------+-----------+ Test 2 Before | 65073 | 54257583 | 208647 | 2413056 | Test 2 After | 65069 | 54255782 | 200038 | 2533759 | --+----------+----------+--------+-----------+ Test 3 Before | 65068 | 49857744 | 487130 | 5533348 | Test 3 After | 65071 | 52294311 | 16078 | 5648354 | --+----------+----------+--------+-----------+ Test 4 Before | 34854 | 2448329 | 385870 | 162116572 | Test 4 After | 32419 | 2448329 | 2339 | 162175706 | --+----------+----------+--------+-----------+ * "Time (s)" - The run time of the test in seconds * "inodes" - The number of inodes created by the test * "reads" - The number of reads performed by the test * "writes" - The number of writes performed by the test As you can see from the table above, running with this patch stack *significantly* reduced the number of reads performed in 3 out of the 4 tests (due to an improved ARC hit rate). In addition to the tests described above, which specifically targeted creates only, three other workloads were tested. These additional tests were targeting rsync performance against the datasets created in the previous tests. 
A brief description of the workloads and the results of these tests is below: ** NOTE: Aside from (6), these tests didn't run to completion. They ran for a set amount of time and then were terminated. 5. Rsync the dataset created in Test 1 to a new dataset: * rsync -a /tank/test-1 /tank/test-5 6. Rsync the dataset created in Test 2 to a new dataset: * rsync -a /tank/test-2 /tank/test-6 7. Rsync the dataset created in Test 3 to a new dataset: * rsync -a /tank/test-3 /tank/test-7 Results for Tests 5, 6, and 7 are below: | Time (s) | inodes | reads | writes | --+----------+----------+----------+---------+ Test 5 Before | 93041 | 17921014 | 47632823 | 4094848 | Test 5 After | 93029 | 29785847 | 30376206 | 4484459 | --+----------+----------+----------+---------+ Test 6 Before | 15290 | 54264474 | 6018331 | 733087 | Test 6 After | 11573 | 54260826 | 4155661 | 617285 | --+----------+----------+----------+---------+ Test 7 Before | 93057 | 10093749 | 41561635 | 3659098 | Test 7 After | 93045 | 27587043 | 28773151 | 5612234 | --+----------+----------+----------+---------+ * "Time (s)" - The run time of the test in seconds * "inodes" - The number of inodes created by the test * "reads" - The number of reads performed by the test * "writes" - The number of writes performed by the test Signed-off-by: Prakash Surya <surya1@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #2110
This commit is contained in:
commit
0ad85ed91e
@ -86,6 +86,7 @@ typedef enum arc_buf_contents {
|
|||||||
*/
|
*/
|
||||||
typedef enum arc_space_type {
|
typedef enum arc_space_type {
|
||||||
ARC_SPACE_DATA,
|
ARC_SPACE_DATA,
|
||||||
|
ARC_SPACE_META,
|
||||||
ARC_SPACE_HDRS,
|
ARC_SPACE_HDRS,
|
||||||
ARC_SPACE_L2HDRS,
|
ARC_SPACE_L2HDRS,
|
||||||
ARC_SPACE_OTHER,
|
ARC_SPACE_OTHER,
|
||||||
@ -160,7 +161,6 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
|
|||||||
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
|
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
|
||||||
int arc_buf_evict(arc_buf_t *buf);
|
int arc_buf_evict(arc_buf_t *buf);
|
||||||
|
|
||||||
void arc_adjust_meta(int64_t adjustment, boolean_t may_prune);
|
|
||||||
void arc_flush(spa_t *spa);
|
void arc_flush(spa_t *spa);
|
||||||
void arc_tempreserve_clear(uint64_t reserve);
|
void arc_tempreserve_clear(uint64_t reserve);
|
||||||
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
|
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
|
||||||
|
@ -296,12 +296,23 @@ Default value: \fB100\fR.
|
|||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
\fBzfs_arc_p_min_shift\fR (int)
|
\fBzfs_arc_p_aggressive_disable\fR (int)
|
||||||
.ad
|
.ad
|
||||||
.RS 12n
|
.RS 12n
|
||||||
arc_c shift to calc min/max arc_p
|
Disable aggressive arc_p growth
|
||||||
.sp
|
.sp
|
||||||
Default value: \fB4\fR.
|
Use \fB1\fR to disable aggressive arc_p growth (default) and \fB0\fR to enable it.
|
||||||
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzfs_arc_p_dampener_disable\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
Disable arc_p adapt dampener
|
||||||
|
.sp
|
||||||
|
Use \fB1\fR to disable the arc_p adapt dampener (default) and \fB0\fR to enable it.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
|
227
module/zfs/arc.c
227
module/zfs/arc.c
@ -172,8 +172,11 @@ int arc_evict_iterations = 100;
|
|||||||
/* number of seconds before growing cache again */
|
/* number of seconds before growing cache again */
|
||||||
int zfs_arc_grow_retry = 5;
|
int zfs_arc_grow_retry = 5;
|
||||||
|
|
||||||
/* shift of arc_c for calculating both min and max arc_p */
|
/* disable anon data aggressively growing arc_p */
|
||||||
int zfs_arc_p_min_shift = 4;
|
int zfs_arc_p_aggressive_disable = 1;
|
||||||
|
|
||||||
|
/* disable arc_p adapt dampener in arc_adapt */
|
||||||
|
int zfs_arc_p_dampener_disable = 1;
|
||||||
|
|
||||||
/* log2(fraction of arc to reclaim) */
|
/* log2(fraction of arc to reclaim) */
|
||||||
int zfs_arc_shrink_shift = 5;
|
int zfs_arc_shrink_shift = 5;
|
||||||
@ -305,6 +308,7 @@ typedef struct arc_stats {
|
|||||||
kstat_named_t arcstat_size;
|
kstat_named_t arcstat_size;
|
||||||
kstat_named_t arcstat_hdr_size;
|
kstat_named_t arcstat_hdr_size;
|
||||||
kstat_named_t arcstat_data_size;
|
kstat_named_t arcstat_data_size;
|
||||||
|
kstat_named_t arcstat_meta_size;
|
||||||
kstat_named_t arcstat_other_size;
|
kstat_named_t arcstat_other_size;
|
||||||
kstat_named_t arcstat_anon_size;
|
kstat_named_t arcstat_anon_size;
|
||||||
kstat_named_t arcstat_anon_evict_data;
|
kstat_named_t arcstat_anon_evict_data;
|
||||||
@ -392,6 +396,7 @@ static arc_stats_t arc_stats = {
|
|||||||
{ "size", KSTAT_DATA_UINT64 },
|
{ "size", KSTAT_DATA_UINT64 },
|
||||||
{ "hdr_size", KSTAT_DATA_UINT64 },
|
{ "hdr_size", KSTAT_DATA_UINT64 },
|
||||||
{ "data_size", KSTAT_DATA_UINT64 },
|
{ "data_size", KSTAT_DATA_UINT64 },
|
||||||
|
{ "meta_size", KSTAT_DATA_UINT64 },
|
||||||
{ "other_size", KSTAT_DATA_UINT64 },
|
{ "other_size", KSTAT_DATA_UINT64 },
|
||||||
{ "anon_size", KSTAT_DATA_UINT64 },
|
{ "anon_size", KSTAT_DATA_UINT64 },
|
||||||
{ "anon_evict_data", KSTAT_DATA_UINT64 },
|
{ "anon_evict_data", KSTAT_DATA_UINT64 },
|
||||||
@ -1364,6 +1369,9 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
|
|||||||
case ARC_SPACE_DATA:
|
case ARC_SPACE_DATA:
|
||||||
ARCSTAT_INCR(arcstat_data_size, space);
|
ARCSTAT_INCR(arcstat_data_size, space);
|
||||||
break;
|
break;
|
||||||
|
case ARC_SPACE_META:
|
||||||
|
ARCSTAT_INCR(arcstat_meta_size, space);
|
||||||
|
break;
|
||||||
case ARC_SPACE_OTHER:
|
case ARC_SPACE_OTHER:
|
||||||
ARCSTAT_INCR(arcstat_other_size, space);
|
ARCSTAT_INCR(arcstat_other_size, space);
|
||||||
break;
|
break;
|
||||||
@ -1375,7 +1383,9 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
ARCSTAT_INCR(arcstat_meta_used, space);
|
if (type != ARC_SPACE_DATA)
|
||||||
|
ARCSTAT_INCR(arcstat_meta_used, space);
|
||||||
|
|
||||||
atomic_add_64(&arc_size, space);
|
atomic_add_64(&arc_size, space);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1390,6 +1400,9 @@ arc_space_return(uint64_t space, arc_space_type_t type)
|
|||||||
case ARC_SPACE_DATA:
|
case ARC_SPACE_DATA:
|
||||||
ARCSTAT_INCR(arcstat_data_size, -space);
|
ARCSTAT_INCR(arcstat_data_size, -space);
|
||||||
break;
|
break;
|
||||||
|
case ARC_SPACE_META:
|
||||||
|
ARCSTAT_INCR(arcstat_meta_size, -space);
|
||||||
|
break;
|
||||||
case ARC_SPACE_OTHER:
|
case ARC_SPACE_OTHER:
|
||||||
ARCSTAT_INCR(arcstat_other_size, -space);
|
ARCSTAT_INCR(arcstat_other_size, -space);
|
||||||
break;
|
break;
|
||||||
@ -1401,10 +1414,13 @@ arc_space_return(uint64_t space, arc_space_type_t type)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
ASSERT(arc_meta_used >= space);
|
if (type != ARC_SPACE_DATA) {
|
||||||
if (arc_meta_max < arc_meta_used)
|
ASSERT(arc_meta_used >= space);
|
||||||
arc_meta_max = arc_meta_used;
|
if (arc_meta_max < arc_meta_used)
|
||||||
ARCSTAT_INCR(arcstat_meta_used, -space);
|
arc_meta_max = arc_meta_used;
|
||||||
|
ARCSTAT_INCR(arcstat_meta_used, -space);
|
||||||
|
}
|
||||||
|
|
||||||
ASSERT(arc_size >= space);
|
ASSERT(arc_size >= space);
|
||||||
atomic_add_64(&arc_size, -space);
|
atomic_add_64(&arc_size, -space);
|
||||||
}
|
}
|
||||||
@ -1601,12 +1617,11 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
|
|||||||
if (!recycle) {
|
if (!recycle) {
|
||||||
if (type == ARC_BUFC_METADATA) {
|
if (type == ARC_BUFC_METADATA) {
|
||||||
arc_buf_data_free(buf, zio_buf_free);
|
arc_buf_data_free(buf, zio_buf_free);
|
||||||
arc_space_return(size, ARC_SPACE_DATA);
|
arc_space_return(size, ARC_SPACE_META);
|
||||||
} else {
|
} else {
|
||||||
ASSERT(type == ARC_BUFC_DATA);
|
ASSERT(type == ARC_BUFC_DATA);
|
||||||
arc_buf_data_free(buf, zio_data_buf_free);
|
arc_buf_data_free(buf, zio_data_buf_free);
|
||||||
ARCSTAT_INCR(arcstat_data_size, -size);
|
arc_space_return(size, ARC_SPACE_DATA);
|
||||||
atomic_add_64(&arc_size, -size);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (list_link_active(&buf->b_hdr->b_arc_node)) {
|
if (list_link_active(&buf->b_hdr->b_arc_node)) {
|
||||||
@ -1887,6 +1902,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
|
|||||||
|
|
||||||
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
|
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
|
||||||
|
|
||||||
|
top:
|
||||||
mutex_enter(&state->arcs_mtx);
|
mutex_enter(&state->arcs_mtx);
|
||||||
mutex_enter(&evicted_state->arcs_mtx);
|
mutex_enter(&evicted_state->arcs_mtx);
|
||||||
|
|
||||||
@ -2002,6 +2018,15 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
|
|||||||
mutex_exit(&evicted_state->arcs_mtx);
|
mutex_exit(&evicted_state->arcs_mtx);
|
||||||
mutex_exit(&state->arcs_mtx);
|
mutex_exit(&state->arcs_mtx);
|
||||||
|
|
||||||
|
if (list == &state->arcs_list[ARC_BUFC_DATA] &&
|
||||||
|
(bytes < 0 || bytes_evicted < bytes)) {
|
||||||
|
/* Prevent second pass from recycling metadata into data */
|
||||||
|
recycle = FALSE;
|
||||||
|
type = ARC_BUFC_METADATA;
|
||||||
|
list = &state->arcs_list[type];
|
||||||
|
goto top;
|
||||||
|
}
|
||||||
|
|
||||||
if (bytes_evicted < bytes)
|
if (bytes_evicted < bytes)
|
||||||
dprintf("only evicted %lld bytes from %x\n",
|
dprintf("only evicted %lld bytes from %x\n",
|
||||||
(longlong_t)bytes_evicted, state);
|
(longlong_t)bytes_evicted, state);
|
||||||
@ -2141,19 +2166,11 @@ arc_adjust(void)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
adjustment = MIN((int64_t)(arc_size - arc_c),
|
adjustment = MIN((int64_t)(arc_size - arc_c),
|
||||||
(int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
|
(int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
|
||||||
arc_p));
|
|
||||||
|
|
||||||
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
|
if (adjustment > 0 && arc_mru->arcs_size > 0) {
|
||||||
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
|
delta = MIN(arc_mru->arcs_size, adjustment);
|
||||||
(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
|
(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
|
||||||
adjustment -= delta;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
|
||||||
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
|
|
||||||
(void) arc_evict(arc_mru, 0, delta, FALSE,
|
|
||||||
ARC_BUFC_METADATA);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2162,17 +2179,9 @@ arc_adjust(void)
|
|||||||
|
|
||||||
adjustment = arc_size - arc_c;
|
adjustment = arc_size - arc_c;
|
||||||
|
|
||||||
if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
|
if (adjustment > 0 && arc_mfu->arcs_size > 0) {
|
||||||
delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
|
delta = MIN(arc_mfu->arcs_size, adjustment);
|
||||||
(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
|
(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
|
||||||
adjustment -= delta;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
|
||||||
int64_t delta = MIN(adjustment,
|
|
||||||
arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
|
|
||||||
(void) arc_evict(arc_mfu, 0, delta, FALSE,
|
|
||||||
ARC_BUFC_METADATA);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2265,24 +2274,61 @@ arc_do_user_evicts(void)
|
|||||||
* This is only used to enforce the tunable arc_meta_limit, if we are
|
* This is only used to enforce the tunable arc_meta_limit, if we are
|
||||||
* unable to evict enough buffers notify the user via the prune callback.
|
* unable to evict enough buffers notify the user via the prune callback.
|
||||||
*/
|
*/
|
||||||
void
|
static void
|
||||||
arc_adjust_meta(int64_t adjustment, boolean_t may_prune)
|
arc_adjust_meta(void)
|
||||||
{
|
{
|
||||||
int64_t delta;
|
int64_t adjustmnt, delta;
|
||||||
|
|
||||||
if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
/*
|
||||||
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
|
* This differs slightly from the way we evict from the mru in
|
||||||
|
* arc_adjust because we don't have a "target" value (i.e. no
|
||||||
|
* "meta" arc_p). As a result, I think we can completely
|
||||||
|
* cannibalize the metadata in the MRU before we evict the
|
||||||
|
* metadata from the MFU. I think we probably need to implement a
|
||||||
|
* "metadata arc_p" value to do this properly.
|
||||||
|
*/
|
||||||
|
adjustmnt = arc_meta_used - arc_meta_limit;
|
||||||
|
|
||||||
|
if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
||||||
|
delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
|
||||||
arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
|
arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA);
|
||||||
adjustment -= delta;
|
adjustmnt -= delta;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
/*
|
||||||
delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustment);
|
* We can't afford to recalculate adjustmnt here. If we do,
|
||||||
|
* new metadata buffers can sneak into the MRU or ANON lists,
|
||||||
|
* thus penalizing the MFU metadata. Although the fudge factor is
|
||||||
|
* small, it has been empirically shown to be significant for
|
||||||
|
* certain workloads (e.g. creating many empty directories). As
|
||||||
|
* such, we use the original calculation for adjustmnt, and
|
||||||
|
* simply decrement the amount of data evicted from the MRU.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
||||||
|
delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt);
|
||||||
arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
|
arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA);
|
||||||
adjustment -= delta;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (may_prune && (adjustment > 0) && (arc_meta_used > arc_meta_limit))
|
adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
|
||||||
|
arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
|
||||||
|
|
||||||
|
if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
||||||
|
delta = MIN(adjustmnt,
|
||||||
|
arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
|
||||||
|
arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
|
||||||
|
}
|
||||||
|
|
||||||
|
adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
|
||||||
|
arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
|
||||||
|
|
||||||
|
if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
|
||||||
|
delta = MIN(adjustmnt,
|
||||||
|
arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]);
|
||||||
|
arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (arc_meta_used > arc_meta_limit)
|
||||||
arc_do_user_prune(zfs_arc_meta_prune);
|
arc_do_user_prune(zfs_arc_meta_prune);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2341,7 +2387,13 @@ arc_shrink(uint64_t bytes)
|
|||||||
else
|
else
|
||||||
arc_c = arc_c_min;
|
arc_c = arc_c_min;
|
||||||
|
|
||||||
atomic_add_64(&arc_p, -(arc_p >> zfs_arc_shrink_shift));
|
to_free = bytes ? bytes : arc_p >> zfs_arc_shrink_shift;
|
||||||
|
|
||||||
|
if (arc_p > to_free)
|
||||||
|
atomic_add_64(&arc_p, -to_free);
|
||||||
|
else
|
||||||
|
arc_p = 0;
|
||||||
|
|
||||||
if (arc_c > arc_size)
|
if (arc_c > arc_size)
|
||||||
arc_c = MAX(arc_size, arc_c_min);
|
arc_c = MAX(arc_size, arc_c_min);
|
||||||
if (arc_p > arc_c)
|
if (arc_p > arc_c)
|
||||||
@ -2396,7 +2448,6 @@ static void
|
|||||||
arc_adapt_thread(void)
|
arc_adapt_thread(void)
|
||||||
{
|
{
|
||||||
callb_cpr_t cpr;
|
callb_cpr_t cpr;
|
||||||
int64_t prune;
|
|
||||||
|
|
||||||
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
|
CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
|
||||||
|
|
||||||
@ -2432,14 +2483,7 @@ arc_adapt_thread(void)
|
|||||||
if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
|
if (arc_no_grow && ddi_get_lbolt() >= arc_grow_time)
|
||||||
arc_no_grow = FALSE;
|
arc_no_grow = FALSE;
|
||||||
|
|
||||||
/*
|
arc_adjust_meta();
|
||||||
* Keep meta data usage within limits, arc_shrink() is not
|
|
||||||
* used to avoid collapsing the arc_c value when only the
|
|
||||||
* arc_meta_limit is being exceeded.
|
|
||||||
*/
|
|
||||||
prune = (int64_t)arc_meta_used - (int64_t)arc_meta_limit;
|
|
||||||
if (prune > 0)
|
|
||||||
arc_adjust_meta(prune, B_TRUE);
|
|
||||||
|
|
||||||
arc_adjust();
|
arc_adjust();
|
||||||
|
|
||||||
@ -2574,8 +2618,10 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
|
|||||||
*/
|
*/
|
||||||
if (pages > 0) {
|
if (pages > 0) {
|
||||||
arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
|
arc_kmem_reap_now(ARC_RECLAIM_AGGR, ptob(sc->nr_to_scan));
|
||||||
|
pages = btop(arc_evictable_memory());
|
||||||
} else {
|
} else {
|
||||||
arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
|
arc_kmem_reap_now(ARC_RECLAIM_CONS, ptob(sc->nr_to_scan));
|
||||||
|
pages = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -2595,7 +2641,7 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc)
|
|||||||
|
|
||||||
mutex_exit(&arc_reclaim_thr_lock);
|
mutex_exit(&arc_reclaim_thr_lock);
|
||||||
|
|
||||||
return (-1);
|
return (pages);
|
||||||
}
|
}
|
||||||
SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
|
SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func);
|
||||||
|
|
||||||
@ -2611,7 +2657,6 @@ static void
|
|||||||
arc_adapt(int bytes, arc_state_t *state)
|
arc_adapt(int bytes, arc_state_t *state)
|
||||||
{
|
{
|
||||||
int mult;
|
int mult;
|
||||||
uint64_t arc_p_min = (arc_c >> zfs_arc_p_min_shift);
|
|
||||||
|
|
||||||
if (state == arc_l2c_only)
|
if (state == arc_l2c_only)
|
||||||
return;
|
return;
|
||||||
@ -2628,18 +2673,22 @@ arc_adapt(int bytes, arc_state_t *state)
|
|||||||
if (state == arc_mru_ghost) {
|
if (state == arc_mru_ghost) {
|
||||||
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
|
mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
|
||||||
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
|
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
|
||||||
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
|
|
||||||
|
|
||||||
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
|
if (!zfs_arc_p_dampener_disable)
|
||||||
|
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
|
||||||
|
|
||||||
|
arc_p = MIN(arc_c, arc_p + bytes * mult);
|
||||||
} else if (state == arc_mfu_ghost) {
|
} else if (state == arc_mfu_ghost) {
|
||||||
uint64_t delta;
|
uint64_t delta;
|
||||||
|
|
||||||
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
|
mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
|
||||||
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
|
1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
|
||||||
mult = MIN(mult, 10);
|
|
||||||
|
if (!zfs_arc_p_dampener_disable)
|
||||||
|
mult = MIN(mult, 10);
|
||||||
|
|
||||||
delta = MIN(bytes * mult, arc_p);
|
delta = MIN(bytes * mult, arc_p);
|
||||||
arc_p = MAX(arc_p_min, arc_p - delta);
|
arc_p = MAX(0, arc_p - delta);
|
||||||
}
|
}
|
||||||
ASSERT((int64_t)arc_p >= 0);
|
ASSERT((int64_t)arc_p >= 0);
|
||||||
|
|
||||||
@ -2710,6 +2759,8 @@ arc_get_data_buf(arc_buf_t *buf)
|
|||||||
arc_state_t *state = buf->b_hdr->b_state;
|
arc_state_t *state = buf->b_hdr->b_state;
|
||||||
uint64_t size = buf->b_hdr->b_size;
|
uint64_t size = buf->b_hdr->b_size;
|
||||||
arc_buf_contents_t type = buf->b_hdr->b_type;
|
arc_buf_contents_t type = buf->b_hdr->b_type;
|
||||||
|
arc_buf_contents_t evict = ARC_BUFC_DATA;
|
||||||
|
boolean_t recycle = TRUE;
|
||||||
|
|
||||||
arc_adapt(size, state);
|
arc_adapt(size, state);
|
||||||
|
|
||||||
@ -2720,12 +2771,11 @@ arc_get_data_buf(arc_buf_t *buf)
|
|||||||
if (!arc_evict_needed(type)) {
|
if (!arc_evict_needed(type)) {
|
||||||
if (type == ARC_BUFC_METADATA) {
|
if (type == ARC_BUFC_METADATA) {
|
||||||
buf->b_data = zio_buf_alloc(size);
|
buf->b_data = zio_buf_alloc(size);
|
||||||
arc_space_consume(size, ARC_SPACE_DATA);
|
arc_space_consume(size, ARC_SPACE_META);
|
||||||
} else {
|
} else {
|
||||||
ASSERT(type == ARC_BUFC_DATA);
|
ASSERT(type == ARC_BUFC_DATA);
|
||||||
buf->b_data = zio_data_buf_alloc(size);
|
buf->b_data = zio_data_buf_alloc(size);
|
||||||
ARCSTAT_INCR(arcstat_data_size, size);
|
arc_space_consume(size, ARC_SPACE_DATA);
|
||||||
atomic_add_64(&arc_size, size);
|
|
||||||
}
|
}
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
@ -2750,10 +2800,27 @@ arc_get_data_buf(arc_buf_t *buf)
|
|||||||
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
|
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
|
/*
|
||||||
|
* Evict data buffers prior to metadata buffers, unless we're
|
||||||
|
* over the metadata limit and adding a metadata buffer.
|
||||||
|
*/
|
||||||
|
if (type == ARC_BUFC_METADATA) {
|
||||||
|
if (arc_meta_used >= arc_meta_limit)
|
||||||
|
evict = ARC_BUFC_METADATA;
|
||||||
|
else
|
||||||
|
/*
|
||||||
|
* In this case, we're evicting data while
|
||||||
|
* adding metadata. Thus, to prevent recycling a
|
||||||
|
* data buffer into a metadata buffer, recycling
|
||||||
|
* is disabled in the following arc_evict call.
|
||||||
|
*/
|
||||||
|
recycle = FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) {
|
||||||
if (type == ARC_BUFC_METADATA) {
|
if (type == ARC_BUFC_METADATA) {
|
||||||
buf->b_data = zio_buf_alloc(size);
|
buf->b_data = zio_buf_alloc(size);
|
||||||
arc_space_consume(size, ARC_SPACE_DATA);
|
arc_space_consume(size, ARC_SPACE_META);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we are unable to recycle an existing meta buffer
|
* If we are unable to recycle an existing meta buffer
|
||||||
@ -2761,16 +2828,19 @@ arc_get_data_buf(arc_buf_t *buf)
|
|||||||
* via the prune callback to drop references. The
|
* via the prune callback to drop references. The
|
||||||
* prune callback is run in the context of the reclaim
|
* prune callback is run in the context of the reclaim
|
||||||
* thread to avoid deadlocking on the hash_lock.
|
* thread to avoid deadlocking on the hash_lock.
|
||||||
|
* Of course, only do this when recycle is true.
|
||||||
*/
|
*/
|
||||||
cv_signal(&arc_reclaim_thr_cv);
|
if (recycle)
|
||||||
|
cv_signal(&arc_reclaim_thr_cv);
|
||||||
} else {
|
} else {
|
||||||
ASSERT(type == ARC_BUFC_DATA);
|
ASSERT(type == ARC_BUFC_DATA);
|
||||||
buf->b_data = zio_data_buf_alloc(size);
|
buf->b_data = zio_data_buf_alloc(size);
|
||||||
ARCSTAT_INCR(arcstat_data_size, size);
|
arc_space_consume(size, ARC_SPACE_DATA);
|
||||||
atomic_add_64(&arc_size, size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ARCSTAT_BUMP(arcstat_recycle_miss);
|
/* Only bump this if we tried to recycle and failed */
|
||||||
|
if (recycle)
|
||||||
|
ARCSTAT_BUMP(arcstat_recycle_miss);
|
||||||
}
|
}
|
||||||
ASSERT(buf->b_data != NULL);
|
ASSERT(buf->b_data != NULL);
|
||||||
out:
|
out:
|
||||||
@ -2790,7 +2860,8 @@ out:
|
|||||||
* If we are growing the cache, and we are adding anonymous
|
* If we are growing the cache, and we are adding anonymous
|
||||||
* data, and we have outgrown arc_p, update arc_p
|
* data, and we have outgrown arc_p, update arc_p
|
||||||
*/
|
*/
|
||||||
if (arc_size < arc_c && hdr->b_state == arc_anon &&
|
if (!zfs_arc_p_aggressive_disable &&
|
||||||
|
arc_size < arc_c && hdr->b_state == arc_anon &&
|
||||||
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
|
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
|
||||||
arc_p = MIN(arc_c, arc_p + size);
|
arc_p = MIN(arc_c, arc_p + size);
|
||||||
}
|
}
|
||||||
@ -4025,8 +4096,8 @@ arc_init(void)
|
|||||||
spl_register_shrinker(&arc_shrinker);
|
spl_register_shrinker(&arc_shrinker);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
|
/* set min cache to 4MB */
|
||||||
arc_c_min = MAX(arc_c / 4, 64<<20);
|
arc_c_min = 4<<20;
|
||||||
/* set max to 1/2 of all memory */
|
/* set max to 1/2 of all memory */
|
||||||
arc_c_max = arc_c * 4;
|
arc_c_max = arc_c * 4;
|
||||||
|
|
||||||
@ -4036,23 +4107,20 @@ arc_init(void)
|
|||||||
*/
|
*/
|
||||||
if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
|
if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
|
||||||
arc_c_max = zfs_arc_max;
|
arc_c_max = zfs_arc_max;
|
||||||
if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
|
if (zfs_arc_min > 0 && zfs_arc_min <= arc_c_max)
|
||||||
arc_c_min = zfs_arc_min;
|
arc_c_min = zfs_arc_min;
|
||||||
|
|
||||||
arc_c = arc_c_max;
|
arc_c = arc_c_max;
|
||||||
arc_p = (arc_c >> 1);
|
arc_p = (arc_c >> 1);
|
||||||
|
|
||||||
/* limit meta-data to 1/4 of the arc capacity */
|
/* limit meta-data to 3/4 of the arc capacity */
|
||||||
arc_meta_limit = arc_c_max / 4;
|
arc_meta_limit = (3 * arc_c_max) / 4;
|
||||||
arc_meta_max = 0;
|
arc_meta_max = 0;
|
||||||
|
|
||||||
/* Allow the tunable to override if it is reasonable */
|
/* Allow the tunable to override if it is reasonable */
|
||||||
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
|
if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
|
||||||
arc_meta_limit = zfs_arc_meta_limit;
|
arc_meta_limit = zfs_arc_meta_limit;
|
||||||
|
|
||||||
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
|
|
||||||
arc_c_min = arc_meta_limit / 2;
|
|
||||||
|
|
||||||
/* if kmem_flags are set, lets try to use less memory */
|
/* if kmem_flags are set, lets try to use less memory */
|
||||||
if (kmem_debugging())
|
if (kmem_debugging())
|
||||||
arc_c = arc_c / 2;
|
arc_c = arc_c / 2;
|
||||||
@ -5548,12 +5616,15 @@ MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune");
|
|||||||
module_param(zfs_arc_grow_retry, int, 0644);
|
module_param(zfs_arc_grow_retry, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
|
MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size");
|
||||||
|
|
||||||
|
module_param(zfs_arc_p_aggressive_disable, int, 0644);
|
||||||
|
MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow");
|
||||||
|
|
||||||
|
module_param(zfs_arc_p_dampener_disable, int, 0644);
|
||||||
|
MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener");
|
||||||
|
|
||||||
module_param(zfs_arc_shrink_shift, int, 0644);
|
module_param(zfs_arc_shrink_shift, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
|
MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)");
|
||||||
|
|
||||||
module_param(zfs_arc_p_min_shift, int, 0644);
|
|
||||||
MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p");
|
|
||||||
|
|
||||||
module_param(zfs_disable_dup_eviction, int, 0644);
|
module_param(zfs_disable_dup_eviction, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
|
MODULE_PARM_DESC(zfs_disable_dup_eviction, "disable duplicate buffer eviction");
|
||||||
|
|
||||||
|
@ -342,7 +342,7 @@ zpl_nr_cached_objects(struct super_block *sb)
|
|||||||
static void
|
static void
|
||||||
zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
|
zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
|
||||||
{
|
{
|
||||||
arc_adjust_meta(nr_to_scan * sizeof (znode_t), B_FALSE);
|
/* noop */
|
||||||
}
|
}
|
||||||
#endif /* HAVE_FREE_CACHED_OBJECTS */
|
#endif /* HAVE_FREE_CACHED_OBJECTS */
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user