From 74962917c23c2721593f8f73b8d8280eadeb1759 Mon Sep 17 00:00:00 2001
From: Thomas Lamprecht <t.lamprecht@proxmox.com>
Date: Tue, 30 Nov 2021 15:55:43 +0100
Subject: [PATCH] switch from impish to jammy as base ubuntu kernel

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
 .gitmodules                                   |   6 +-
 Makefile                                      |   2 +-
 ...sce-host-bridge-contiguous-apertures.patch |   2 +-
 ...ove-the-ERESTARTSYS-handling-in-blkd.patch |   6 +-
 .../0009-io-wq-fix-queue-stalling-race.patch  |  69 ---
 ...ded-and-unbounded-work-into-separate.patch | 412 ------------------
 submodules/ubuntu-jammy                       |   1 +
 7 files changed, 9 insertions(+), 489 deletions(-)
 delete mode 100644 patches/kernel/0009-io-wq-fix-queue-stalling-race.patch
 delete mode 100644 patches/kernel/0010-io-wq-split-bounded-and-unbounded-work-into-separate.patch
 create mode 160000 submodules/ubuntu-jammy

diff --git a/.gitmodules b/.gitmodules
index 019094c..b69f70c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "submodules/zfsonlinux"]
 	path = submodules/zfsonlinux
 	url = ../zfsonlinux
-[submodule "submodules/ubuntu-impish"]
-	path = submodules/ubuntu-impish
-	url = ../mirror_ubuntu-impish-kernel
+[submodule "submodules/ubuntu-jammy"]
+	path = submodules/ubuntu-jammy
+	url = ../mirror_ubuntu-jammy-kernel
diff --git a/Makefile b/Makefile
index 03242cd..09f294e 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ SKIPABI=0
 
 BUILD_DIR=build
 
-KERNEL_SRC=ubuntu-impish
+KERNEL_SRC=ubuntu-jammy
 KERNEL_SRC_SUBMODULE=submodules/$(KERNEL_SRC)
 
 KERNEL_CFG_ORG=config-${KERNEL_VER}.org
diff --git a/patches/kernel/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch b/patches/kernel/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch
index 9b339e4..632427d 100644
--- a/patches/kernel/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch
+++ b/patches/kernel/0006-Revert-PCI-Coalesce-host-bridge-contiguous-apertures.patch
@@ -23,9 +23,9 @@ index f6046a16dac1..275204646c68 100644
 --- a/drivers/pci/probe.c
 +++ b/drivers/pci/probe.c
 @@ -19,7 +19,6 @@
-  #include
   #include
   #include
+  #include
  -#include
   #include "pci.h"
 
diff --git a/patches/kernel/0008-Revert-block-remove-the-ERESTARTSYS-handling-in-blkd.patch b/patches/kernel/0008-Revert-block-remove-the-ERESTARTSYS-handling-in-blkd.patch
index 03d3b25..2b2a6b0 100644
--- a/patches/kernel/0008-Revert-block-remove-the-ERESTARTSYS-handling-in-blkd.patch
+++ b/patches/kernel/0008-Revert-block-remove-the-ERESTARTSYS-handling-in-blkd.patch
@@ -11,13 +11,13 @@ https://github.com/openzfs/zfs/issues/12301#issuecomment-873303739
 
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
- fs/block_dev.c | 6 ++++++
+ block/bdev.c | 6 ++++++
  1 file changed, 6 insertions(+)
 
 diff --git a/fs/block_dev.c b/fs/block_dev.c
 index e4a80bd4ddf1..4a14d28dd923 100644
---- a/fs/block_dev.c
-+++ b/fs/block_dev.c
+--- a/block/bdev.c
++++ b/block/bdev.c
 @@ -1436,6 +1436,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
  	if (ret)
  		return ERR_PTR(ret);
diff --git a/patches/kernel/0009-io-wq-fix-queue-stalling-race.patch b/patches/kernel/0009-io-wq-fix-queue-stalling-race.patch
deleted file mode 100644
index 05fb6bc..0000000
--- a/patches/kernel/0009-io-wq-fix-queue-stalling-race.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jens Axboe <axboe@kernel.dk>
-Date: Tue, 31 Aug 2021 13:53:00 -0600
-Subject: [PATCH] io-wq: fix queue stalling race
-
-We need to set the stalled bit early, before we drop the lock for adding
-us to the stall hash queue. If not, then we can race with new work being
-queued between adding us to the stall hash and io_worker_handle_work()
-marking us stalled.
-
-Signed-off-by: Jens Axboe <axboe@kernel.dk>
-[backport]
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
----
- fs/io-wq.c | 15 +++++++--------
- 1 file changed, 7 insertions(+), 8 deletions(-)
-
-diff --git a/fs/io-wq.c b/fs/io-wq.c
-index 6612d0aa497e..33678185f3bc 100644
---- a/fs/io-wq.c
-+++ b/fs/io-wq.c
-@@ -437,8 +437,7 @@ static bool io_worker_can_run_work(struct io_worker *worker,
- }
- 
- static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
--					   struct io_worker *worker,
--					   bool *stalled)
-+					   struct io_worker *worker)
- 	__must_hold(wqe->lock)
- {
- 	struct io_wq_work_node *node, *prev;
-@@ -476,10 +475,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- 	}
- 
- 	if (stall_hash != -1U) {
-+		/*
-+		 * Set this before dropping the lock to avoid racing with new
-+		 * work being added and clearing the stalled bit.
-+		 */
-+		wqe->flags |= IO_WQE_FLAG_STALLED;
- 		raw_spin_unlock(&wqe->lock);
- 		io_wait_on_hash(wqe, stall_hash);
- 		raw_spin_lock(&wqe->lock);
--		*stalled = true;
- 	}
- 
- 	return NULL;
-@@ -519,7 +522,6 @@ static void io_worker_handle_work(struct io_worker *worker)
- 
- 	do {
- 		struct io_wq_work *work;
--		bool stalled;
- get_next:
- 		/*
- 		 * If we got some work, mark us as busy. If we didn't, but
-@@ -528,12 +530,9 @@ static void io_worker_handle_work(struct io_worker *worker)
- 		 * can't make progress, any work completion or insertion will
- 		 * clear the stalled flag.
- 		 */
--		stalled = false;
--		work = io_get_next_work(wqe, worker, &stalled);
-+		work = io_get_next_work(wqe, worker);
- 		if (work)
- 			__io_worker_busy(wqe, worker, work);
--		else if (stalled)
--			wqe->flags |= IO_WQE_FLAG_STALLED;
- 
- 		raw_spin_unlock_irq(&wqe->lock);
- 		if (!work)
diff --git a/patches/kernel/0010-io-wq-split-bounded-and-unbounded-work-into-separate.patch b/patches/kernel/0010-io-wq-split-bounded-and-unbounded-work-into-separate.patch
deleted file mode 100644
index 789b199..0000000
--- a/patches/kernel/0010-io-wq-split-bounded-and-unbounded-work-into-separate.patch
+++ /dev/null
@@ -1,412 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jens Axboe <axboe@kernel.dk>
-Date: Tue, 31 Aug 2021 13:57:32 -0600
-Subject: [PATCH] io-wq: split bounded and unbounded work into separate lists
-
-We've got a few issues that all boil down to the fact that we have one
-list of pending work items, yet two different types of workers to
-serve them. This causes some oddities around workers switching type and
-even hashed work vs regular work on the same bounded list.
-
-Just separate them out cleanly, similarly to how we already do
-accounting of what is running. That provides a clean separation and
-removes some corner cases that can cause stalls when handling IO
-that is punted to io-wq.
-
-Fixes: ecc53c48c13d ("io-wq: check max_worker limits if a worker transitions bound state")
-Signed-off-by: Jens Axboe <axboe@kernel.dk>
-[backport]
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
----
- fs/io-wq.c | 156 +++++++++++++++++++++++------------------------
- 1 file changed, 68 insertions(+), 88 deletions(-)
-
-diff --git a/fs/io-wq.c b/fs/io-wq.c
-index 33678185f3bc..2496d8781ea1 100644
---- a/fs/io-wq.c
-+++ b/fs/io-wq.c
-@@ -34,7 +34,7 @@ enum {
- };
- 
- enum {
--	IO_WQE_FLAG_STALLED = 1,	/* stalled on hash */
-+	IO_ACCT_STALLED_BIT = 0,	/* stalled on hash */
- };
- 
- /*
-@@ -73,25 +73,24 @@ struct io_wqe_acct {
- 	unsigned max_workers;
- 	int index;
- 	atomic_t nr_running;
-+	struct io_wq_work_list work_list;
-+	unsigned long flags;
- };
- 
- enum {
- 	IO_WQ_ACCT_BOUND,
- 	IO_WQ_ACCT_UNBOUND,
-+	IO_WQ_ACCT_NR,
- };
- 
- /*
-  * Per-node worker thread pool
-  */
- struct io_wqe {
--	struct {
--		raw_spinlock_t lock;
--		struct io_wq_work_list work_list;
--		unsigned flags;
--	} ____cacheline_aligned_in_smp;
-+	raw_spinlock_t lock;
-+	struct io_wqe_acct acct[2];
- 
- 	int node;
--	struct io_wqe_acct acct[2];
- 
- 	struct hlist_nulls_head free_list;
- 	struct list_head all_list;
-@@ -196,11 +195,10 @@ static void io_worker_exit(struct io_worker *worker)
- 	do_exit(0);
- }
- 
--static inline bool io_wqe_run_queue(struct io_wqe *wqe)
--	__must_hold(wqe->lock)
-+static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
- {
--	if (!wq_list_empty(&wqe->work_list) &&
--	    !(wqe->flags & IO_WQE_FLAG_STALLED))
-+	if (!wq_list_empty(&acct->work_list) &&
-+	    !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- 		return true;
- 	return false;
- }
-@@ -209,7 +207,8 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
-  * Check head of free list for an available worker. If one isn't available,
-  * caller must create one.
-  */
--static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
-+static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
-+					struct io_wqe_acct *acct)
- 	__must_hold(RCU)
- {
- 	struct hlist_nulls_node *n;
-@@ -223,6 +222,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
- 	hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
- 		if (!io_worker_get(worker))
- 			continue;
-+		if (io_wqe_get_acct(worker) != acct) {
-+			io_worker_release(worker);
-+			continue;
-+		}
- 		if (wake_up_process(worker->task)) {
- 			io_worker_release(worker);
- 			return true;
-@@ -341,7 +344,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
- 	if (!(worker->flags & IO_WORKER_F_UP))
- 		return;
- 
--	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
-+	if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- 		atomic_inc(&acct->nr_running);
- 		atomic_inc(&wqe->wq->worker_refs);
- 		io_queue_worker_create(wqe, worker, acct);
-@@ -356,29 +359,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
- 			     struct io_wq_work *work)
- 	__must_hold(wqe->lock)
- {
--	bool worker_bound, work_bound;
--
--	BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1);
--
- 	if (worker->flags & IO_WORKER_F_FREE) {
- 		worker->flags &= ~IO_WORKER_F_FREE;
- 		hlist_nulls_del_init_rcu(&worker->nulls_node);
- 	}
--
--	/*
--	 * If worker is moving from bound to unbound (or vice versa), then
--	 * ensure we update the running accounting.
--	 */
--	worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
--	work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
--	if (worker_bound != work_bound) {
--		int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND;
--		io_wqe_dec_running(worker);
--		worker->flags ^= IO_WORKER_F_BOUND;
--		wqe->acct[index].nr_workers--;
--		wqe->acct[index ^ 1].nr_workers++;
--		io_wqe_inc_running(worker);
--	}
- }
- 
- /*
-@@ -417,44 +401,23 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
- 	spin_unlock(&wq->hash->wait.lock);
- }
- 
--/*
-- * We can always run the work if the worker is currently the same type as
-- * the work (eg both are bound, or both are unbound). If they are not the
-- * same, only allow it if incrementing the worker count would be allowed.
-- */
--static bool io_worker_can_run_work(struct io_worker *worker,
--				   struct io_wq_work *work)
--{
--	struct io_wqe_acct *acct;
--
--	if (!(worker->flags & IO_WORKER_F_BOUND) !=
--	    !(work->flags & IO_WQ_WORK_UNBOUND))
--		return true;
--
--	/* not the same type, check if we'd go over the limit */
--	acct = io_work_get_acct(worker->wqe, work);
--	return acct->nr_workers < acct->max_workers;
--}
--
--static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
-+static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
- 					   struct io_worker *worker)
- 	__must_hold(wqe->lock)
- {
- 	struct io_wq_work_node *node, *prev;
- 	struct io_wq_work *work, *tail;
- 	unsigned int stall_hash = -1U;
-+	struct io_wqe *wqe = worker->wqe;
- 
--	wq_list_for_each(node, prev, &wqe->work_list) {
-+	wq_list_for_each(node, prev, &acct->work_list) {
- 		unsigned int hash;
- 
- 		work = container_of(node, struct io_wq_work, list);
- 
--		if (!io_worker_can_run_work(worker, work))
--			break;
--
- 		/* not hashed, can run anytime */
- 		if (!io_wq_is_hashed(work)) {
--			wq_list_del(&wqe->work_list, node, prev);
-+			wq_list_del(&acct->work_list, node, prev);
- 			return work;
- 		}
- 
-@@ -465,7 +428,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- 		/* hashed, can run if not already running */
- 		if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
- 			wqe->hash_tail[hash] = NULL;
--			wq_list_cut(&wqe->work_list, &tail->list, prev);
-+			wq_list_cut(&acct->work_list, &tail->list, prev);
- 			return work;
- 		}
- 		if (stall_hash == -1U)
-@@ -479,7 +442,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- 		 * Set this before dropping the lock to avoid racing with new
- 		 * work being added and clearing the stalled bit.
- 		 */
--		wqe->flags |= IO_WQE_FLAG_STALLED;
-+		set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- 		raw_spin_unlock(&wqe->lock);
- 		io_wait_on_hash(wqe, stall_hash);
- 		raw_spin_lock(&wqe->lock);
-@@ -516,6 +479,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
- static void io_worker_handle_work(struct io_worker *worker)
- 	__releases(wqe->lock)
- {
-+	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- 	struct io_wqe *wqe = worker->wqe;
- 	struct io_wq *wq = wqe->wq;
- 	bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
-@@ -530,7 +494,7 @@ static void io_worker_handle_work(struct io_worker *worker)
- 		 * can't make progress, any work completion or insertion will
- 		 * clear the stalled flag.
- 		 */
--		work = io_get_next_work(wqe, worker);
-+		work = io_get_next_work(acct, worker);
- 		if (work)
- 			__io_worker_busy(wqe, worker, work);
- 
-@@ -564,10 +528,10 @@ static void io_worker_handle_work(struct io_worker *worker)
- 
- 		if (hash != -1U && !next_hashed) {
- 			clear_bit(hash, &wq->hash->map);
-+			clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- 			if (wq_has_sleeper(&wq->hash->wait))
- 				wake_up(&wq->hash->wait);
- 			raw_spin_lock_irq(&wqe->lock);
--			wqe->flags &= ~IO_WQE_FLAG_STALLED;
- 			/* skip unnecessary unlock-lock wqe->lock */
- 			if (!work)
- 				goto get_next;
-@@ -582,6 +546,7 @@ static int io_wqe_worker(void *data)
- {
- 	struct io_worker *worker = data;
-+	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- 	struct io_wqe *wqe = worker->wqe;
- 	struct io_wq *wq = wqe->wq;
- 	char buf[TASK_COMM_LEN];
-@@ -597,7 +562,7 @@ static int io_wqe_worker(void *data)
- 		set_current_state(TASK_INTERRUPTIBLE);
- loop:
- 		raw_spin_lock_irq(&wqe->lock);
--		if (io_wqe_run_queue(wqe)) {
-+		if (io_acct_run_queue(acct)) {
- 			io_worker_handle_work(worker);
- 			goto loop;
- 		}
-@@ -623,7 +588,7 @@ static int io_wqe_worker(void *data)
- 
- 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- 		raw_spin_lock_irq(&wqe->lock);
--		if (!wq_list_empty(&wqe->work_list))
-+		if (!wq_list_empty(&acct->work_list))
- 			io_worker_handle_work(worker);
- 		else
- 			raw_spin_unlock_irq(&wqe->lock);
-@@ -769,12 +734,13 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
- 
- static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
- {
-+	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- 	unsigned int hash;
- 	struct io_wq_work *tail;
- 
- 	if (!io_wq_is_hashed(work)) {
- append:
--		wq_list_add_tail(&work->list, &wqe->work_list);
-+		wq_list_add_tail(&work->list, &acct->work_list);
- 		return;
- 	}
- 
-@@ -784,7 +750,7 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
- 	if (!tail)
- 		goto append;
- 
--	wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
-+	wq_list_add_after(&work->list, &tail->list, &acct->work_list);
- }
- 
- static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
-@@ -806,10 +772,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
- 
- 	raw_spin_lock_irqsave(&wqe->lock, flags);
- 	io_wqe_insert_work(wqe, work);
--	wqe->flags &= ~IO_WQE_FLAG_STALLED;
-+	clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- 
- 	rcu_read_lock();
--	do_create = !io_wqe_activate_free_worker(wqe);
-+	do_create = !io_wqe_activate_free_worker(wqe, acct);
- 	rcu_read_unlock();
- 
- 	raw_spin_unlock_irqrestore(&wqe->lock, flags);
-@@ -862,6 +828,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
- 					 struct io_wq_work *work,
- 					 struct io_wq_work_node *prev)
- {
-+	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- 	unsigned int hash = io_get_work_hash(work);
- 	struct io_wq_work *prev_work = NULL;
- 
-@@ -873,7 +840,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
- 		else
- 			wqe->hash_tail[hash] = NULL;
- 	}
--	wq_list_del(&wqe->work_list, &work->list, prev);
-+	wq_list_del(&acct->work_list, &work->list, prev);
- }
- 
- static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
-@@ -882,22 +849,27 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
- 	struct io_wq_work_node *node, *prev;
- 	struct io_wq_work *work;
- 	unsigned long flags;
-+	int i;
- 
- retry:
- 	raw_spin_lock_irqsave(&wqe->lock, flags);
--	wq_list_for_each(node, prev, &wqe->work_list) {
--		work = container_of(node, struct io_wq_work, list);
--		if (!match->fn(work, match->data))
--			continue;
--		io_wqe_remove_pending(wqe, work, prev);
--		raw_spin_unlock_irqrestore(&wqe->lock, flags);
--		io_run_cancel(work, wqe);
--		match->nr_pending++;
--		if (!match->cancel_all)
--			return;
-+	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
-+		struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
- 
--		/* not safe to continue after unlock */
--		goto retry;
-+		wq_list_for_each(node, prev, &acct->work_list) {
-+			work = container_of(node, struct io_wq_work, list);
-+			if (!match->fn(work, match->data))
-+				continue;
-+			io_wqe_remove_pending(wqe, work, prev);
-+			raw_spin_unlock_irqrestore(&wqe->lock, flags);
-+			io_run_cancel(work, wqe);
-+			match->nr_pending++;
-+			if (!match->cancel_all)
-+				return;
-+
-+			/* not safe to continue after unlock */
-+			goto retry;
-+		}
- 	}
- 	raw_spin_unlock_irqrestore(&wqe->lock, flags);
- }
-@@ -958,18 +930,24 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
- 			    int sync, void *key)
- {
- 	struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
-+	int i;
- 
- 	list_del_init(&wait->entry);
- 
- 	rcu_read_lock();
--	io_wqe_activate_free_worker(wqe);
-+	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
-+		struct io_wqe_acct *acct = &wqe->acct[i];
-+
-+		if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
-+			io_wqe_activate_free_worker(wqe, acct);
-+	}
- 	rcu_read_unlock();
- 	return 1;
- }
- 
- struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
- {
--	int ret = -ENOMEM, node;
-+	int ret, node, i;
- 	struct io_wq *wq;
- 
- 	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
-@@ -1006,18 +984,20 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
- 			goto err;
- 		wq->wqes[node] = wqe;
- 		wqe->node = alloc_node;
--		wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
--		wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND;
- 		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
--		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
- 		wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
- 					task_rlimit(current, RLIMIT_NPROC);
--		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
--		wqe->wait.func = io_wqe_hash_wake;
- 		INIT_LIST_HEAD(&wqe->wait.entry);
-+		wqe->wait.func = io_wqe_hash_wake;
-+		for (i = 0; i < IO_WQ_ACCT_NR; i++) {
-+			struct io_wqe_acct *acct = &wqe->acct[i];
-+
-+			acct->index = i;
-+			atomic_set(&acct->nr_running, 0);
-+			INIT_WQ_LIST(&acct->work_list);
-+		}
- 		wqe->wq = wq;
- 		raw_spin_lock_init(&wqe->lock);
--		INIT_WQ_LIST(&wqe->work_list);
- 		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
- 		INIT_LIST_HEAD(&wqe->all_list);
- 	}
diff --git a/submodules/ubuntu-jammy b/submodules/ubuntu-jammy
new file mode 160000
index 0000000..d5d0348
--- /dev/null
+++ b/submodules/ubuntu-jammy
@@ -0,0 +1 @@
+Subproject commit d5d03485a8a6a91f4406f08c8c0375dbdc84de7e