diff --git a/include/sys/arc.h b/include/sys/arc.h index 8f0f6cb55..d8a85e830 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -134,7 +134,9 @@ typedef enum arc_space_type { ARC_SPACE_META, ARC_SPACE_HDRS, ARC_SPACE_L2HDRS, - ARC_SPACE_OTHER, + ARC_SPACE_DBUF, + ARC_SPACE_DNODE, + ARC_SPACE_BONUS, ARC_SPACE_NUMTYPES } arc_space_type_t; diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index cd92851de..41fc20deb 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -381,6 +381,37 @@ Min time before an active prefetch stream can be reclaimed Default value: \fB2\fR. .RE +.sp +.ne 2 +.na +\fBzfs_arc_dnode_limit\fR (ulong) +.ad +.RS 12n +When the number of bytes consumed by dnodes in the ARC exceeds this number of +bytes, try to unpin some of it in response to demand for non-metadata. This +value acts as a floor to the amount of dnode metadata. + +See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used +when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather +than in response to overall demand for non-metadata. + +.sp +Default value: \fB10% of zfs_arc_meta_limit\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_arc_dnode_reduce_percent\fR (ulong) +.ad +.RS 12n +Percentage of ARC dnodes to try to scan in response to demand for non-metadata +when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR. + +.sp +Default value: \fB10% of the number of dnodes in the ARC\fR. 
+.RE + .sp .ne 2 .na diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 2dbca8da9..6d8bd48a3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -231,6 +231,8 @@ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_min = 0; +unsigned long zfs_arc_dnode_limit = 0; +unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; @@ -328,13 +330,17 @@ typedef struct arc_stats { */ kstat_named_t arcstat_metadata_size; /* - * Number of bytes consumed by various buffers and structures - * not actually backed with ARC buffers. This includes bonus - * buffers (allocated directly via zio_buf_* functions), - * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t - * cache), and dnode_t structures (allocated via dnode_t cache). + * Number of bytes consumed by dmu_buf_impl_t objects. */ - kstat_named_t arcstat_other_size; + kstat_named_t arcstat_dbuf_size; + /* + * Number of bytes consumed by dnode_t objects. + */ + kstat_named_t arcstat_dnode_size; + /* + * Number of bytes consumed by bonus buffers. + */ + kstat_named_t arcstat_bonus_size; /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. 
This includes *all* buffers in the arc_anon @@ -473,6 +479,7 @@ typedef struct arc_stats { kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_sync_wait_for_async; @@ -517,7 +524,9 @@ static arc_stats_t arc_stats = { { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, - { "other_size", KSTAT_DATA_UINT64 }, + { "dbuf_size", KSTAT_DATA_UINT64 }, + { "dnode_size", KSTAT_DATA_UINT64 }, + { "bonus_size", KSTAT_DATA_UINT64 }, { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, @@ -570,6 +579,7 @@ static arc_stats_t arc_stats = { { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "sync_wait_for_async", KSTAT_DATA_UINT64 }, @@ -641,9 +651,13 @@ static arc_state_t *arc_l2c_only; #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ +#define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */ +#define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */ +#define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ #define arc_sys_free 
ARCSTAT(arcstat_sys_free) /* target system free bytes */ @@ -803,6 +817,7 @@ static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); static void arc_tuning_update(void); +static void arc_prune_async(int64_t); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -1680,8 +1695,14 @@ arc_space_consume(uint64_t space, arc_space_type_t type) case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; - case ARC_SPACE_OTHER: - ARCSTAT_INCR(arcstat_other_size, space); + case ARC_SPACE_BONUS: + ARCSTAT_INCR(arcstat_bonus_size, space); + break; + case ARC_SPACE_DNODE: + ARCSTAT_INCR(arcstat_dnode_size, space); + break; + case ARC_SPACE_DBUF: + ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); @@ -1711,8 +1732,14 @@ arc_space_return(uint64_t space, arc_space_type_t type) case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; - case ARC_SPACE_OTHER: - ARCSTAT_INCR(arcstat_other_size, -space); + case ARC_SPACE_BONUS: + ARCSTAT_INCR(arcstat_bonus_size, -space); + break; + case ARC_SPACE_DNODE: + ARCSTAT_INCR(arcstat_dnode_size, -space); + break; + case ARC_SPACE_DBUF: + ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); @@ -2599,6 +2626,18 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; + + /* + * Try to reduce pinned dnodes with a floor of arc_dnode_limit. + * Request that 10% of the LRUs be scanned by the superblock + * shrinker. 
+ */ + if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit) + arc_prune_async((arc_dnode_size - arc_dnode_limit) / + sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); + /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all @@ -2606,9 +2645,6 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ - int sublist_idx = multilist_get_random_index(ml); - uint64_t scan_evicted = 0; - for (i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; @@ -5329,6 +5365,7 @@ arc_tuning_update(void) arc_c = arc_c_max; arc_p = (arc_c >> 1); arc_meta_limit = MIN(arc_meta_limit, (3 * arc_c_max) / 4); + arc_dnode_limit = arc_meta_limit / 10; } /* Valid range: 32M - */ @@ -5345,6 +5382,7 @@ arc_tuning_update(void) (zfs_arc_meta_min <= arc_c_max)) { arc_meta_min = zfs_arc_meta_min; arc_meta_limit = MAX(arc_meta_limit, arc_meta_min); + arc_dnode_limit = arc_meta_limit / 10; } /* Valid range: - */ @@ -5353,6 +5391,12 @@ arc_tuning_update(void) (zfs_arc_meta_limit <= arc_c_max)) arc_meta_limit = zfs_arc_meta_limit; + /* Valid range: - */ + if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) && + (zfs_arc_dnode_limit >= zfs_arc_meta_min) && + (zfs_arc_dnode_limit <= arc_c_max)) + arc_dnode_limit = zfs_arc_dnode_limit; + /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; @@ -5451,6 +5495,8 @@ arc_init(void) arc_meta_max = 0; /* Set limit to 3/4 of arc_c_max with a floor of arc_meta_min */ arc_meta_limit = MAX((3 * arc_c_max) / 4, arc_meta_min); + /* Default dnode limit is 10% of overall meta limit */ + arc_dnode_limit = arc_meta_limit / 10; /* Apply user specified tunings */ arc_tuning_update(); @@ -7204,4 +7250,11 @@ MODULE_PARM_DESC(zfs_arc_lotsfree_percent, module_param(zfs_arc_sys_free, ulong, 0644); MODULE_PARM_DESC(zfs_arc_sys_free, "System free 
memory target size in bytes"); +module_param(zfs_arc_dnode_limit, ulong, 0644); +MODULE_PARM_DESC(zfs_arc_dnode_limit, "Minimum bytes of dnodes in arc"); + +module_param(zfs_arc_dnode_reduce_percent, ulong, 0644); +MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent, + "Percentage of excess dnodes to try to unpin"); + #endif diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 61cc83e41..af2f20d63 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -738,7 +738,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(max_bonuslen); - arc_space_consume(max_bonuslen, ARC_SPACE_OTHER); + arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); if (bonuslen) @@ -969,7 +969,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = zio_buf_alloc(bonuslen); - arc_space_consume(bonuslen, ARC_SPACE_OTHER); + arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -1867,7 +1867,7 @@ dbuf_clear(dmu_buf_impl_t *db) int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); zio_buf_free(db->db.db_data, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_OTHER); + arc_space_return(bonuslen, ARC_SPACE_BONUS); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; @@ -2032,7 +2032,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? 
@@ -2066,7 +2066,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -2143,7 +2143,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); } typedef struct dbuf_prefetch_arg { @@ -2983,7 +2983,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); zio_buf_free(*datap, bonuslen); - arc_space_return(bonuslen, ARC_SPACE_OTHER); + arc_space_return(bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; drp = &db->db_last_dirty; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 975bd5fb8..8015f54ed 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -475,7 +475,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); - arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); + arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); return (dn); } @@ -531,7 +531,7 @@ dnode_destroy(dnode_t *dn) dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); - arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); if (complete_os_eviction) dmu_objset_evict_done(os);