From 009ff83548200c277f135e6c3a0d484acbe53a9f Mon Sep 17 00:00:00 2001 From: Mark Roper Date: Thu, 12 Mar 2020 13:24:43 -0400 Subject: [PATCH] Prevent deadlock in arc_read in Linux memory reclaim callback Using zfs with Lustre, an arc_read can trigger kernel memory allocation that in turn leads to a memory reclaim callback and a deadlock within a single zfs process. This change uses spl_fstrans_mark and spl_fstrans_unmark to prevent the reclaim attempt and the deadlock (https://zfsonlinux.topicbox.com/groups/zfs-devel/T4db2c705ec1804ba). The stack trace observed is: __schedule at ffffffff81610f2e schedule at ffffffff81611558 schedule_preempt_disabled at ffffffff8161184a __mutex_lock at ffffffff816131e8 arc_buf_destroy at ffffffffa0bf37d7 [zfs] dbuf_destroy at ffffffffa0bfa6fe [zfs] dbuf_evict_one at ffffffffa0bfaa96 [zfs] dbuf_rele_and_unlock at ffffffffa0bfa561 [zfs] dbuf_rele_and_unlock at ffffffffa0bfa32b [zfs] osd_object_delete at ffffffffa0b64ecc [osd_zfs] lu_object_free at ffffffffa06d6a74 [obdclass] lu_site_purge_objects at ffffffffa06d7fc1 [obdclass] lu_cache_shrink_scan at ffffffffa06d81b8 [obdclass] shrink_slab at ffffffff811ca9d8 shrink_node at ffffffff811cfd94 do_try_to_free_pages at ffffffff811cfe63 try_to_free_pages at ffffffff811d01c4 __alloc_pages_slowpath at ffffffff811be7f2 __alloc_pages_nodemask at ffffffff811bf3ed new_slab at ffffffff81226304 ___slab_alloc at ffffffff812272ab __slab_alloc at ffffffff8122740c kmem_cache_alloc at ffffffff81227578 spl_kmem_cache_alloc at ffffffffa048a1fd [spl] arc_buf_alloc_impl at ffffffffa0befba2 [zfs] arc_read at ffffffffa0bf0924 [zfs] dbuf_read at ffffffffa0bf9083 [zfs] dmu_buf_hold_by_dnode at ffffffffa0c04869 [zfs] Reviewed-by: Brian Behlendorf Signed-off-by: Mark Roper Closes #9987 --- module/zfs/arc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index a16689dc6..ceb1e7a9d 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6178,6 +6178,17 @@ 
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); + /* + * Normally SPL_FSTRANS will already be set since kernel threads which + * expect to call the DMU interfaces will set it when created. System + * calls are similarly handled by setting/cleaning the bit in the + * registered callback (module/os/.../zfs/zpl_*). + * + * External consumers such as Lustre which call the exported DMU + * interfaces may not have set SPL_FSTRANS. To avoid a deadlock + * on the hash_lock always set and clear the bit. + */ + fstrans_cookie_t cookie = spl_fstrans_mark(); top: if (!embedded_bp) { /* @@ -6636,6 +6647,7 @@ out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); + spl_fstrans_unmark(cookie); return (rc); }