From 37c22948e5b1ce9f9f11c66676ecca15caf1f264 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 31 Mar 2020 13:46:48 -0400 Subject: [PATCH] Reset l2ad_hand and l2ad_first in l2arc_evict Increasing l2arc_write_size or l2arc_write_boost can result in l2arc_write_buffers() not having enough space to perform its writes and panic zio_write_phys(). Instead of resetting l2ad_hand to l2ad_start at the end of l2arc_write_buffers() and not taking into account a possible user-mediated increase of l2arc_write_max, we do this in l2arc_evict(), right after l2arc_write_size() has run. If there is not enough space to evict (ie we will exceed l2ad_end) we evict to the end of the device, reset l2ad_hand to l2ad_start, set l2ad_first to 0 and iterate l2arc_evict(). We avoid infinite iteration of l2arc_evict() by making sure in l2arc_write_size() that l2ad_start + size does not exceed l2ad_end. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: George Amanakis Closes #10154 --- module/zfs/arc.c | 77 ++++++++---- tests/runfiles/common.run | 2 +- tests/zfs-tests/include/libtest.shlib | 13 +++ tests/zfs-tests/include/tunables.cfg | 3 + .../tests/functional/cache/Makefile.am | 3 +- .../tests/functional/cache/cache_012_pos.ksh | 110 ++++++++++++++++++ 6 files changed, 181 insertions(+), 27 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8a0c1a4a7..9c5ee9829 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7467,9 +7467,9 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) } static uint64_t -l2arc_write_size(void) +l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size; + uint64_t size, dev_size; /* * Make sure our globals have meaningful values in case the user @@ -7486,6 +7486,23 @@ l2arc_write_size(void) if (arc_warm == B_FALSE) size += l2arc_write_boost; + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. + */ + dev_size = dev->l2ad_end - dev->l2ad_start; + if (size >= dev_size) { + cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " + "exceeds the size of the cache device (guid %llu), " + "resetting them to the default (%d)", + dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + } + return (size); } @@ -8008,22 +8025,22 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; + boolean_t rerun; buflist = &dev->l2ad_buflist; - if (!all && dev->l2ad_first) { +top: + rerun = B_FALSE; + if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } - - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { - /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. + * When there is no space to accomodate upcoming writes, + * evict to the end. Then bump the write hand to the start + * and iterate. This iteration does not happen indefinitely + * as we make sure in l2arc_write_size() that when l2ad_hand + * is reset, the write size does not exceed the end of the + * device. */ + rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; @@ -8031,7 +8048,15 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); -top: + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + goto out; + } + +retry: mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); @@ -8052,7 +8077,7 @@ top: mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto retry; } /* @@ -8101,6 +8126,17 @@ top: mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); + +out: + if (rerun) { + /* + * Bump device hand to the device start if it is approaching the + * end. l2arc_evict() has already evicted ahead for this case. + */ + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + goto top; + } } /* @@ -8448,15 +8484,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); ARCSTAT_INCR(arcstat_l2_psize, write_psize); - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } - dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; @@ -8539,7 +8566,7 @@ l2arc_feed_thread(void *unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(); + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 96700becb..84ea70f07 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -49,7 +49,7 @@ post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', - 'cache_009_pos', 'cache_010_neg', 'cache_011_pos'] + 'cache_009_pos', 'cache_010_neg', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 1f24b4271..a641a0b7a 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -4053,3 +4053,16 @@ function ls_xattr # path ;; esac } + +function get_arcstat # stat +{ + if is_linux; then + typeset stat=$1 + typeset zfs_arcstats="/proc/spl/kstat/zfs/arcstats" + [[ -f "$zfs_arcstats" ]] || return 1 + grep $stat $zfs_arcstats | awk '{print $3}' + return $? + else + return 1 + fi +} diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 01176c781..62d335abe 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -35,6 +35,9 @@ DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export +L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch +L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost +L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel LIVELIST_CONDENSE_SYNC_PAUSE livelist.condense.sync_pause zfs_livelist_condense_sync_pause diff --git a/tests/zfs-tests/tests/functional/cache/Makefile.am b/tests/zfs-tests/tests/functional/cache/Makefile.am index 18dd9c198..406a92817 100644 --- a/tests/zfs-tests/tests/functional/cache/Makefile.am +++ b/tests/zfs-tests/tests/functional/cache/Makefile.am @@ -12,7 +12,8 @@ dist_pkgdata_SCRIPTS = \ cache_008_neg.ksh \ cache_009_pos.ksh \ cache_010_neg.ksh \ - cache_011_pos.ksh + cache_011_pos.ksh \ + cache_012_pos.ksh dist_pkgdata_DATA = \ cache.cfg \ diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh new file mode 100755 index 000000000..edefe9c1b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -0,0 +1,110 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/tests/functional/cache/cache.cfg +. $STF_SUITE/tests/functional/cache/cache.kshlib + +# +# DESCRIPTION: +# Looping around a cache device with l2arc_write_size exceeding +# the device size succeeds. +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Set l2arc_write_max to a value larger than the cache device. +# 3. Create a file larger than the cache device and random read +# for 10 sec. +# 4. Verify that l2arc_write_max is set back to the default. +# 5. Set l2arc_write_max to a value less than the cache device size but +# larger than the default (64MB). +# 6. Record the l2_size. +# 7. Random read for 1 sec. +# 8. Record the l2_size again. +# 9. If (6) <= (8) then we have not looped around yet. +# 10. If (6) > (8) then we looped around. Break out of the loop and test. +# 11. Destroy pool. +# + +verify_runnable "global" + +log_assert "Looping around a cache device succeeds." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_WRITE_MAX $write_max + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch +} +log_onexit cleanup + +typeset write_max=$(get_tunable L2ARC_WRITE_MAX) +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 0 + +typeset VDEV="$VDIR/vdev.disk" +typeset VDEV_SZ=$(( 4 * 1024 * 1024 * 1024 )) +typeset VCACHE="$VDIR/vdev.cache" +typeset VCACHE_SZ=$(( $VDEV_SZ / 2 )) + +typeset fill_mb=$(( floor($VDEV_SZ * 3 / 4 ) )) +export DIRECTORY=/$TESTPOOL +export NUMJOBS=4 +export RUNTIME=10 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) )) + +log_must set_tunable32 L2ARC_WRITE_MAX $(( $VCACHE_SZ * 2 )) + +log_must truncate -s $VCACHE_SZ $VCACHE +log_must truncate -s $VDEV_SZ $VDEV + +log_must zpool create -f $TESTPOOL $VDEV cache $VCACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +typeset write_max2=$(get_tunable L2ARC_WRITE_MAX) + +log_must test $write_max2 -eq $write_max + +log_must set_tunable32 L2ARC_WRITE_MAX $(( 64 * 1024 * 1024 )) +export RUNTIME=1 + +typeset do_once=true +while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do + typeset l2_size1=$(get_arcstat l2_size) + log_must fio $FIO_SCRIPTS/random_reads.fio + typeset l2_size2=$(get_arcstat l2_size) + do_once=false +done + +log_must test $l2_size1 -gt $l2_size2 + +log_must zpool destroy $TESTPOOL + +log_pass "Looping around a cache device succeeds."