mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2026-05-24 03:08:51 +03:00
Compare commits
76 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| baa5031456 | |||
| cd42e992b5 | |||
| c60df6a801 | |||
| d8fa32a79d | |||
| 88a5ee0706 | |||
| 0465fbecd7 | |||
| a99a37991e | |||
| 2ca1515374 | |||
| bce36e21ca | |||
| 86492e3c96 | |||
| 07f0465742 | |||
| 0f9457d1dd | |||
| 859f906a4b | |||
| 84a9861536 | |||
| 9d64d1bfad | |||
| ce22dc2589 | |||
| 8479a45abe | |||
| 8156099cf2 | |||
| 11ad6124c3 | |||
| 11de432c8b | |||
| 464747ffd3 | |||
| 92a8af0f8b | |||
| 4fa84563b8 | |||
| 6961d4fb57 | |||
| 3a36797ad6 | |||
| ac6500389b | |||
| 1f055436f3 | |||
| 0172ee525b | |||
| 33174af151 | |||
| 6f27c4cadd | |||
| dd5de55eba | |||
| b5835ed137 | |||
| ef08cb26da | |||
| 9ad205ecde | |||
| 14cce09a65 | |||
| 9835255f5d | |||
| 4d2f7f9839 | |||
| 25c4271d2f | |||
| c950c5d369 | |||
| 13ccbbb47a | |||
| ba3c7692cd | |||
| 27cc6df760 | |||
| d06c8de748 | |||
| 2a2e358475 | |||
| bc42d96d66 | |||
| 88686213c3 | |||
| 21f66db674 | |||
| 3ca305f873 | |||
| 96cad4ca4c | |||
| 5668411713 | |||
| 32cd2da551 | |||
| fa2480f5b3 | |||
| ad8c8c1e31 | |||
| f14a62ebbe | |||
| dfdac38afb | |||
| 08da054005 | |||
| bb401c02fc | |||
| da9da6aea6 | |||
| 97f1eb8052 | |||
| 7d8e2a7f73 | |||
| 3ea3649755 | |||
| 0342c4a6b2 | |||
| d7bf0e5259 | |||
| c24a039042 | |||
| f4e2aed42a | |||
| 54ef0fdf60 | |||
| 2eab4f7b39 | |||
| 4c0fbd8d6d | |||
| fa4b1a404e | |||
| 4c484d66b7 | |||
| 41f2a9c81f | |||
| 6724746596 | |||
| 938d1588eb | |||
| 0f1e8ba2f8 | |||
| b474dfad0d | |||
| 9edf6af4ae |
@@ -77,7 +77,10 @@ Yanping Gao <yanping.gao@xtaotech.com>
|
||||
Youzhong Yang <youzhong@gmail.com>
|
||||
|
||||
# Signed-off-by: overriding Author:
|
||||
Ryan <errornointernet@envs.net> <error.nointernet@gmail.com>
|
||||
Qiuhao Chen <chenqiuhao1997@gmail.com> <haohao0924@126.com>
|
||||
Yuxin Wang <yuxinwang9999@gmail.com> <Bi11gates9999@gmail.com>
|
||||
Zhenlei Huang <zlei@FreeBSD.org> <zlei.huang@gmail.com>
|
||||
|
||||
# Commits from strange places, long ago
|
||||
Brian Behlendorf <behlendorf1@llnl.gov> <behlendo@7e1ea52c-4ff2-0310-8f11-9dd32ca42a1c>
|
||||
@@ -95,6 +98,7 @@ Alek Pinchuk <apinchuk@axcient.com> <alek-p@users.noreply.github.com>
|
||||
Alexander Lobakin <alobakin@pm.me> <solbjorn@users.noreply.github.com>
|
||||
Alexey Smirnoff <fling@member.fsf.org> <fling-@users.noreply.github.com>
|
||||
Allen Holl <allen.m.holl@gmail.com> <65494904+allen-4@users.noreply.github.com>
|
||||
Alphan Yılmaz <alphanyilmaz@gmail.com> <a1ea321@users.noreply.github.com>
|
||||
Ameer Hamza <ahamza@ixsystems.com> <106930537+ixhamza@users.noreply.github.com>
|
||||
Andrew J. Hesford <ajh@sideband.org> <48421688+ahesford@users.noreply.github.com>
|
||||
Andrew Sun <me@andrewsun.com> <as-com@users.noreply.github.com>
|
||||
@@ -102,6 +106,7 @@ Aron Xu <happyaron.xu@gmail.com> <happyaron@users.noreply.github.com>
|
||||
Arun KV <arun.kv@datacore.com> <65647132+arun-kv@users.noreply.github.com>
|
||||
Ben Wolsieffer <benwolsieffer@gmail.com> <lopsided98@users.noreply.github.com>
|
||||
bernie1995 <bernie.pikes@gmail.com> <42413912+bernie1995@users.noreply.github.com>
|
||||
Bojan Novković <bnovkov@FreeBSD.org> <72801811+bnovkov@users.noreply.github.com>
|
||||
Boris Protopopov <boris.protopopov@actifio.com> <bprotopopov@users.noreply.github.com>
|
||||
Brad Forschinger <github@bnjf.id.au> <bnjf@users.noreply.github.com>
|
||||
Brandon Thetford <brandon@dodecatec.com> <dodexahedron@users.noreply.github.com>
|
||||
@@ -193,6 +198,7 @@ Stefan Lendl <s.lendl@proxmox.com> <1321542+stfl@users.noreply.github.com>
|
||||
Thomas Bertschinger <bertschinger@lanl.gov> <101425190+bertschinger@users.noreply.github.com>
|
||||
Thomas Geppert <geppi@digitx.de> <geppi@users.noreply.github.com>
|
||||
Tim Crawford <tcrawford@datto.com> <crawfxrd@users.noreply.github.com>
|
||||
Todd Seidelmann <18294602+seidelma@users.noreply.github.com>
|
||||
Tom Matthews <tom@axiom-partners.com> <tomtastic@users.noreply.github.com>
|
||||
Tony Perkins <tperkins@datto.com> <62951051+tony-zfs@users.noreply.github.com>
|
||||
Torsten Wörtwein <twoertwein@gmail.com> <twoertwein@users.noreply.github.com>
|
||||
|
||||
@@ -46,6 +46,7 @@ CONTRIBUTORS:
|
||||
Alex Zhuravlev <alexey.zhuravlev@intel.com>
|
||||
Allan Jude <allanjude@freebsd.org>
|
||||
Allen Holl <allen.m.holl@gmail.com>
|
||||
Alphan Yılmaz <alphanyilmaz@gmail.com>
|
||||
alteriks <alteriks@gmail.com>
|
||||
Alyssa Ross <hi@alyssa.is>
|
||||
Ameer Hamza <ahamza@ixsystems.com>
|
||||
@@ -99,6 +100,7 @@ CONTRIBUTORS:
|
||||
bernie1995 <bernie.pikes@gmail.com>
|
||||
Bill McGonigle <bill-github.com-public1@bfccomputing.com>
|
||||
Bill Pijewski <wdp@joyent.com>
|
||||
Bojan Novković <bnovkov@FreeBSD.org>
|
||||
Boris Protopopov <boris.protopopov@nexenta.com>
|
||||
Brad Forschinger <github@bnjf.id.au>
|
||||
Brad Lewis <brad.lewis@delphix.com>
|
||||
@@ -168,6 +170,7 @@ CONTRIBUTORS:
|
||||
Daniel Hoffman <dj.hoffman@delphix.com>
|
||||
Daniel Kobras <d.kobras@science-computing.de>
|
||||
Daniel Kolesa <daniel@octaforge.org>
|
||||
Daniel Perry <dtperry@amazon.com>
|
||||
Daniel Reichelt <hacking@nachtgeist.net>
|
||||
Daniel Stevenson <bot@dstev.net>
|
||||
Daniel Verite <daniel@verite.pro>
|
||||
@@ -187,6 +190,7 @@ CONTRIBUTORS:
|
||||
Dennis R. Friedrichsen <dennis.r.friedrichsen@gmail.com>
|
||||
Denys Rtveliashvili <denys@rtveliashvili.name>
|
||||
Derek Dai <daiderek@gmail.com>
|
||||
Derek Schrock <dereks@lifeofadishwasher.com>
|
||||
Dex Wood <slash2314@gmail.com>
|
||||
DHE <git@dehacked.net>
|
||||
Didier Roche <didrocks@ubuntu.com>
|
||||
@@ -245,6 +249,7 @@ CONTRIBUTORS:
|
||||
Gionatan Danti <g.danti@assyoma.it>
|
||||
Giuseppe Di Natale <guss80@gmail.com>
|
||||
Glenn Washburn <development@efficientek.com>
|
||||
glibg10b <glibg10b@users.noreply.github.com>
|
||||
gofaster <felix.gofaster@gmail.com>
|
||||
Gordan Bobic <gordan@redsleeve.org>
|
||||
Gordon Bergling <gbergling@googlemail.com>
|
||||
@@ -410,6 +415,7 @@ CONTRIBUTORS:
|
||||
Mart Frauenlob <allkind@fastest.cc>
|
||||
Martin Matuska <mm@FreeBSD.org>
|
||||
Martin Rüegg <martin.rueegg@metaworx.ch>
|
||||
Martin Wagner <martin.wagner.dev@gmail.com>
|
||||
Massimo Maggi <me@massimo-maggi.eu>
|
||||
Mateusz Guzik <mjguzik@gmail.com>
|
||||
Mateusz Piotrowski <0mp@FreeBSD.org>
|
||||
@@ -488,6 +494,7 @@ CONTRIBUTORS:
|
||||
Peng <peng.hse@xtaotech.com>
|
||||
Peter Ashford <ashford@accs.com>
|
||||
Peter Dave Hello <hsu@peterdavehello.org>
|
||||
Peter Doherty <peterd@acranox.org>
|
||||
Peter Levine <plevine457@gmail.com>
|
||||
Peter Wirdemo <peter.wirdemo@gmail.com>
|
||||
Petros Koutoupis <petros@petroskoutoupis.com>
|
||||
@@ -501,6 +508,7 @@ CONTRIBUTORS:
|
||||
Prasad Joshi <prasadjoshi124@gmail.com>
|
||||
privb0x23 <privb0x23@users.noreply.github.com>
|
||||
P.SCH <p88@yahoo.com>
|
||||
Qiuhao Chen <chenqiuhao1997@gmail.com>
|
||||
Quartz <yyhran@163.com>
|
||||
Quentin Zdanis <zdanisq@gmail.com>
|
||||
Rafael Kitover <rkitover@gmail.com>
|
||||
@@ -532,6 +540,7 @@ CONTRIBUTORS:
|
||||
Roman Strashkin <roman.strashkin@nexenta.com>
|
||||
Ross Williams <ross@ross-williams.net>
|
||||
Ruben Kerkhof <ruben@rubenkerkhof.com>
|
||||
Ryan <errornointernet@envs.net>
|
||||
Ryan Hirasaki <ryanhirasaki@gmail.com>
|
||||
Ryan Lahfa <masterancpp@gmail.com>
|
||||
Ryan Libby <rlibby@FreeBSD.org>
|
||||
@@ -556,6 +565,7 @@ CONTRIBUTORS:
|
||||
Sen Haerens <sen@senhaerens.be>
|
||||
Serapheim Dimitropoulos <serapheim@delphix.com>
|
||||
Seth Forshee <seth.forshee@canonical.com>
|
||||
Seth Troisi <sethtroisi@google.com>
|
||||
Shaan Nobee <sniper111@gmail.com>
|
||||
Shampavman <sham.pavman@nexenta.com>
|
||||
Shaun Tancheff <shaun@aeonazure.com>
|
||||
@@ -602,6 +612,7 @@ CONTRIBUTORS:
|
||||
Tim Schumacher <timschumi@gmx.de>
|
||||
Tino Reichardt <milky-zfs@mcmilk.de>
|
||||
Tobin Harding <me@tobin.cc>
|
||||
Todd Seidelmann <seidelma@users.noreply.github.com>
|
||||
Tom Caputi <tcaputi@datto.com>
|
||||
Tom Matthews <tom@axiom-partners.com>
|
||||
Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
|
||||
@@ -653,6 +664,8 @@ CONTRIBUTORS:
|
||||
Zachary Bedell <zac@thebedells.org>
|
||||
Zach Dykstra <dykstra.zachary@gmail.com>
|
||||
zgock <zgock@nuc.base.zgock-lab.net>
|
||||
Zhao Yongming <zym@apache.org>
|
||||
Zhenlei Huang <zlei@FreeBSD.org>
|
||||
Zhu Chuang <chuang@melty.land>
|
||||
Érico Nogueira <erico.erc@gmail.com>
|
||||
Đoàn Trần Công Danh <congdanhqx@gmail.com>
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
Meta: 1
|
||||
Name: zfs
|
||||
Branch: 1.0
|
||||
Version: 2.2.4
|
||||
Version: 2.2.6
|
||||
Release: 1
|
||||
Release-Tags: relext
|
||||
License: CDDL
|
||||
Author: OpenZFS
|
||||
Linux-Maximum: 6.8
|
||||
Linux-Maximum: 6.10
|
||||
Linux-Minimum: 3.10
|
||||
|
||||
+83
-10
@@ -48,6 +48,7 @@
|
||||
#include <sys/spa_impl.h>
|
||||
#include <sys/dmu.h>
|
||||
#include <sys/zap.h>
|
||||
#include <sys/zap_impl.h>
|
||||
#include <sys/fs/zfs.h>
|
||||
#include <sys/zfs_znode.h>
|
||||
#include <sys/zfs_sa.h>
|
||||
@@ -84,6 +85,9 @@
|
||||
#include <sys/brt_impl.h>
|
||||
#include <zfs_comutil.h>
|
||||
#include <sys/zstd/zstd.h>
|
||||
#if (__GLIBC__ && !__UCLIBC__)
|
||||
#include <execinfo.h> /* for backtrace() */
|
||||
#endif
|
||||
|
||||
#include <libnvpair.h>
|
||||
#include <libzutil.h>
|
||||
@@ -926,11 +930,41 @@ usage(void)
|
||||
static void
|
||||
dump_debug_buffer(void)
|
||||
{
|
||||
if (dump_opt['G']) {
|
||||
(void) printf("\n");
|
||||
(void) fflush(stdout);
|
||||
zfs_dbgmsg_print("zdb");
|
||||
}
|
||||
ssize_t ret __attribute__((unused));
|
||||
|
||||
if (!dump_opt['G'])
|
||||
return;
|
||||
/*
|
||||
* We use write() instead of printf() so that this function
|
||||
* is safe to call from a signal handler.
|
||||
*/
|
||||
ret = write(STDOUT_FILENO, "\n", 1);
|
||||
zfs_dbgmsg_print("zdb");
|
||||
}
|
||||
|
||||
#define BACKTRACE_SZ 100
|
||||
|
||||
static void sig_handler(int signo)
|
||||
{
|
||||
struct sigaction action;
|
||||
#if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */
|
||||
int nptrs;
|
||||
void *buffer[BACKTRACE_SZ];
|
||||
|
||||
nptrs = backtrace(buffer, BACKTRACE_SZ);
|
||||
backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
|
||||
#endif
|
||||
dump_debug_buffer();
|
||||
|
||||
/*
|
||||
* Restore default action and re-raise signal so SIGSEGV and
|
||||
* SIGABRT can trigger a core dump.
|
||||
*/
|
||||
action.sa_handler = SIG_DFL;
|
||||
sigemptyset(&action.sa_mask);
|
||||
action.sa_flags = 0;
|
||||
(void) sigaction(signo, &action, NULL);
|
||||
raise(signo);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1199,16 +1233,33 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
|
||||
for (zap_cursor_init(&zc, os, object);
|
||||
zap_cursor_retrieve(&zc, &attr) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
(void) printf("\t\t%s = ", attr.za_name);
|
||||
boolean_t key64 =
|
||||
!!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);
|
||||
|
||||
if (key64)
|
||||
(void) printf("\t\t0x%010lx = ",
|
||||
*(uint64_t *)attr.za_name);
|
||||
else
|
||||
(void) printf("\t\t%s = ", attr.za_name);
|
||||
|
||||
if (attr.za_num_integers == 0) {
|
||||
(void) printf("\n");
|
||||
continue;
|
||||
}
|
||||
prop = umem_zalloc(attr.za_num_integers *
|
||||
attr.za_integer_length, UMEM_NOFAIL);
|
||||
(void) zap_lookup(os, object, attr.za_name,
|
||||
attr.za_integer_length, attr.za_num_integers, prop);
|
||||
if (attr.za_integer_length == 1) {
|
||||
|
||||
if (key64)
|
||||
(void) zap_lookup_uint64(os, object,
|
||||
(const uint64_t *)attr.za_name, 1,
|
||||
attr.za_integer_length, attr.za_num_integers,
|
||||
prop);
|
||||
else
|
||||
(void) zap_lookup(os, object, attr.za_name,
|
||||
attr.za_integer_length, attr.za_num_integers,
|
||||
prop);
|
||||
|
||||
if (attr.za_integer_length == 1 && !key64) {
|
||||
if (strcmp(attr.za_name,
|
||||
DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
|
||||
strcmp(attr.za_name,
|
||||
@@ -1227,6 +1278,10 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
|
||||
} else {
|
||||
for (i = 0; i < attr.za_num_integers; i++) {
|
||||
switch (attr.za_integer_length) {
|
||||
case 1:
|
||||
(void) printf("%u ",
|
||||
((uint8_t *)prop)[i]);
|
||||
break;
|
||||
case 2:
|
||||
(void) printf("%u ",
|
||||
((uint16_t *)prop)[i]);
|
||||
@@ -5217,7 +5272,7 @@ dump_label(const char *dev)
|
||||
sizeof (cksum_record_t), offsetof(cksum_record_t, link));
|
||||
|
||||
psize = statbuf.st_size;
|
||||
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
|
||||
psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
|
||||
ashift = SPA_MINBLOCKSHIFT;
|
||||
|
||||
/*
|
||||
@@ -8934,9 +8989,27 @@ main(int argc, char **argv)
|
||||
char *spa_config_path_env, *objset_str;
|
||||
boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
|
||||
nvlist_t *cfg = NULL;
|
||||
struct sigaction action;
|
||||
|
||||
dprintf_setup(&argc, argv);
|
||||
|
||||
/*
|
||||
* Set up signal handlers, so if we crash due to bad on-disk data we
|
||||
* can get more info. Unlike ztest, we don't bail out if we can't set
|
||||
* up signal handlers, because zdb is very useful without them.
|
||||
*/
|
||||
action.sa_handler = sig_handler;
|
||||
sigemptyset(&action.sa_mask);
|
||||
action.sa_flags = 0;
|
||||
if (sigaction(SIGSEGV, &action, NULL) < 0) {
|
||||
(void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
if (sigaction(SIGABRT, &action, NULL) < 0) {
|
||||
(void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
/*
|
||||
* If there is an environment variable SPA_CONFIG_PATH it overrides
|
||||
* default spa_config_path setting. If -U flag is specified it will
|
||||
|
||||
@@ -186,7 +186,7 @@ static void
|
||||
zfs_redup_stream(int infd, int outfd, boolean_t verbose)
|
||||
{
|
||||
int bufsz = SPA_MAXBLOCKSIZE;
|
||||
dmu_replay_record_t thedrr = { 0 };
|
||||
dmu_replay_record_t thedrr;
|
||||
dmu_replay_record_t *drr = &thedrr;
|
||||
redup_table_t rdt;
|
||||
zio_cksum_t stream_cksum;
|
||||
@@ -194,6 +194,8 @@ zfs_redup_stream(int infd, int outfd, boolean_t verbose)
|
||||
uint64_t num_records = 0;
|
||||
uint64_t num_write_byref_records = 0;
|
||||
|
||||
memset(&thedrr, 0, sizeof (dmu_replay_record_t));
|
||||
|
||||
#ifdef _ILP32
|
||||
uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
|
||||
#else
|
||||
|
||||
+7
-5
@@ -2448,7 +2448,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
|
||||
ASSERT3P(zio, !=, NULL);
|
||||
size = doi.doi_data_block_size;
|
||||
if (ISP2(size)) {
|
||||
offset = P2ALIGN(offset, size);
|
||||
offset = P2ALIGN_TYPED(offset, size, uint64_t);
|
||||
} else {
|
||||
ASSERT3U(offset, <, size);
|
||||
offset = 0;
|
||||
@@ -4668,7 +4668,8 @@ ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
|
||||
*/
|
||||
mutex_enter(&os->os_obj_lock);
|
||||
object = ztest_random(os->os_obj_next_chunk);
|
||||
os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
|
||||
os->os_obj_next_chunk = P2ALIGN_TYPED(object, dnodes_per_chunk,
|
||||
uint64_t);
|
||||
mutex_exit(&os->os_obj_lock);
|
||||
}
|
||||
|
||||
@@ -6284,7 +6285,8 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
|
||||
* the end of the disk (vdev_psize) is aligned to
|
||||
* sizeof (vdev_label_t).
|
||||
*/
|
||||
uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
|
||||
uint64_t psize = P2ALIGN_TYPED(fsize, sizeof (vdev_label_t),
|
||||
uint64_t);
|
||||
if ((leaf & 1) == 1 &&
|
||||
offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
|
||||
continue;
|
||||
@@ -6600,8 +6602,8 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
|
||||
size_t inc = 64 * ztest_random(size / 67);
|
||||
/* sometimes add few bytes to test non-simd */
|
||||
if (ztest_random(100) < 10)
|
||||
inc += P2ALIGN(ztest_random(64),
|
||||
sizeof (uint32_t));
|
||||
inc += P2ALIGN_TYPED(ztest_random(64),
|
||||
sizeof (uint32_t), uint64_t);
|
||||
|
||||
if (inc > (size - pos))
|
||||
inc = size - pos;
|
||||
|
||||
@@ -90,8 +90,8 @@ AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [
|
||||
AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]])
|
||||
$7
|
||||
],[dnl ELSE
|
||||
AC_SUBST([$1]_CFLAGS, [])
|
||||
AC_SUBST([$1]_LIBS, [])
|
||||
AC_SUBST([$1]_CFLAGS, [""])
|
||||
AC_SUBST([$1]_LIBS, [""])
|
||||
AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations])
|
||||
$8
|
||||
])
|
||||
|
||||
@@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [
|
||||
dnl #
|
||||
dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue
|
||||
dnl # 4.12: dynamically allocated bdi in request_queue
|
||||
dnl # 6.11: bdi no longer available through request_queue, so get it from
|
||||
dnl # the gendisk attached to the queue
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [
|
||||
ZFS_LINUX_TEST_SRC([blk_queue_bdi], [
|
||||
@@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [
|
||||
ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/backing-dev.h>
|
||||
], [
|
||||
struct request_queue q;
|
||||
struct gendisk disk;
|
||||
struct backing_dev_info bdi __attribute__ ((unused));
|
||||
q.disk = &disk;
|
||||
q.disk->bdi = &bdi;
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [
|
||||
AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk])
|
||||
ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1,
|
||||
[backing_dev_info is available through queue gendisk])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # 5.9: added blk_queue_update_readahead(),
|
||||
dnl # 5.15: renamed to disk_update_readahead()
|
||||
@@ -332,7 +358,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [
|
||||
ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [
|
||||
AC_MSG_RESULT(yes)
|
||||
],[
|
||||
ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors])
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
|
||||
@@ -355,7 +381,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
|
||||
ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [
|
||||
AC_MSG_RESULT(yes)
|
||||
], [
|
||||
ZFS_LINUX_TEST_ERROR([blk_queue_max_segments])
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
|
||||
@@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
|
||||
ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
|
||||
ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
|
||||
ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI
|
||||
ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
|
||||
ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
|
||||
ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
|
||||
@@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
|
||||
AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_PLUG
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_BDI
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
|
||||
ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE
|
||||
|
||||
@@ -534,6 +534,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # 5.16 API change
|
||||
dnl # Added bdev_nr_bytes() helper.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES], [
|
||||
ZFS_LINUX_TEST_SRC([bdev_nr_bytes], [
|
||||
#include <linux/blkdev.h>
|
||||
],[
|
||||
struct block_device *bdev = NULL;
|
||||
loff_t nr_bytes __attribute__ ((unused)) = 0;
|
||||
nr_bytes = bdev_nr_bytes(bdev);
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES], [
|
||||
AC_MSG_CHECKING([whether bdev_nr_bytes() is available])
|
||||
ZFS_LINUX_TEST_RESULT([bdev_nr_bytes], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_BDEV_NR_BYTES, 1, [bdev_nr_bytes() is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # 5.20 API change,
|
||||
dnl # Removed bdevname(), snprintf(.., %pg) should be used.
|
||||
@@ -747,6 +771,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD
|
||||
ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
|
||||
@@ -767,6 +792,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
|
||||
ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE
|
||||
ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
|
||||
ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE
|
||||
ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES
|
||||
ZFS_AC_KERNEL_BLKDEV_BDEVNAME
|
||||
ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS
|
||||
ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD
|
||||
|
||||
@@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
|
||||
disk = blk_alloc_disk(lim, NUMA_NO_NODE);
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [
|
||||
#include <linux/blkdev.h>
|
||||
],[
|
||||
struct queue_limits *lim = NULL;
|
||||
lim->features = 0;
|
||||
])
|
||||
|
||||
ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
|
||||
#include <linux/blkdev.h>
|
||||
],[
|
||||
@@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
|
||||
|
||||
dnl #
|
||||
dnl # Linux 6.11 API change:
|
||||
dnl # struct queue_limits gains a 'features' field,
|
||||
dnl # used to set flushing options
|
||||
dnl #
|
||||
AC_MSG_CHECKING([whether struct queue_limits has a features field])
|
||||
ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1,
|
||||
[struct queue_limits has a features field])
|
||||
], [
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # 5.20 API change,
|
||||
dnl # Removed blk_cleanup_disk(), put_disk() should be used.
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
|
||||
ZFS_LINUX_TEST_SRC([page_size], [
|
||||
#include <linux/mm.h>
|
||||
],[
|
||||
unsigned long s;
|
||||
s = page_size(NULL);
|
||||
])
|
||||
])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
|
||||
AC_MSG_CHECKING([whether page_size() is available])
|
||||
ZFS_LINUX_TEST_RESULT([page_size], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
@@ -0,0 +1,36 @@
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
|
||||
ZFS_LINUX_TEST_SRC([page_size], [
|
||||
#include <linux/mm.h>
|
||||
],[
|
||||
unsigned long s;
|
||||
s = page_size(NULL);
|
||||
])
|
||||
])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
|
||||
AC_MSG_CHECKING([whether page_size() is available])
|
||||
ZFS_LINUX_TEST_RESULT([page_size], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [
|
||||
ZFS_LINUX_TEST_SRC([page_mapping], [
|
||||
#include <linux/pagemap.h>
|
||||
],[
|
||||
struct page *p = NULL;
|
||||
struct address_space *m = page_mapping(NULL);
|
||||
])
|
||||
])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [
|
||||
AC_MSG_CHECKING([whether page_mapping() is available])
|
||||
ZFS_LINUX_TEST_RESULT([page_mapping], [
|
||||
AC_MSG_RESULT(yes)
|
||||
AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available])
|
||||
],[
|
||||
AC_MSG_RESULT(no)
|
||||
])
|
||||
])
|
||||
@@ -25,3 +25,62 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer
|
||||
dnl # supply a sentinel end-of-table element. 6.6 introduces
|
||||
dnl # register_sysctl_sz() to enable callers to choose, so we use it if
|
||||
dnl # available for backward compatibility.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [
|
||||
ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [
|
||||
#include <linux/sysctl.h>
|
||||
],[
|
||||
struct ctl_table test_table[] __attribute__((unused)) = {0};
|
||||
register_sysctl_sz("", test_table, 0);
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [
|
||||
AC_MSG_CHECKING([whether register_sysctl_sz exists])
|
||||
ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1,
|
||||
[register_sysctl_sz exists])
|
||||
],[
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
|
||||
dnl #
|
||||
dnl # Linux 6.11 makes const the ctl_table arg of proc_handler
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [
|
||||
ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [
|
||||
#include <linux/sysctl.h>
|
||||
|
||||
static int test_handler(
|
||||
const struct ctl_table *ctl __attribute((unused)),
|
||||
int write __attribute((unused)),
|
||||
void *buffer __attribute((unused)),
|
||||
size_t *lenp __attribute((unused)),
|
||||
loff_t *ppos __attribute((unused)))
|
||||
{
|
||||
return (0);
|
||||
}
|
||||
], [
|
||||
proc_handler *ph __attribute((unused)) =
|
||||
&test_handler;
|
||||
])
|
||||
])
|
||||
|
||||
AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [
|
||||
AC_MSG_CHECKING([whether proc_handler ctl_table arg is const])
|
||||
ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1,
|
||||
[proc_handler ctl_table arg is const])
|
||||
], [
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
dnl #
|
||||
dnl # check if kernel provides definitions for given types
|
||||
dnl #
|
||||
|
||||
dnl _ZFS_AC_KERNEL_SRC_TYPE(type)
|
||||
AC_DEFUN([_ZFS_AC_KERNEL_SRC_TYPE], [
|
||||
ZFS_LINUX_TEST_SRC([type_$1], [
|
||||
#include <linux/types.h>
|
||||
],[
|
||||
const $1 __attribute__((unused)) x = ($1) 0;
|
||||
])
|
||||
])
|
||||
|
||||
dnl _ZFS_AC_KERNEL_TYPE(type)
|
||||
AC_DEFUN([_ZFS_AC_KERNEL_TYPE], [
|
||||
AC_MSG_CHECKING([whether kernel defines $1])
|
||||
ZFS_LINUX_TEST_RESULT([type_$1], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE([HAVE_KERNEL_]m4_quote(m4_translit([$1], [a-z], [A-Z])),
|
||||
1, [kernel defines $1])
|
||||
], [
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
])
|
||||
|
||||
dnl ZFS_AC_KERNEL_TYPES([types...])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_TYPES], [
|
||||
AC_DEFUN([ZFS_AC_KERNEL_SRC_TYPES], [
|
||||
m4_foreach_w([type], [$1], [
|
||||
_ZFS_AC_KERNEL_SRC_TYPE(type)
|
||||
])
|
||||
])
|
||||
AC_DEFUN([ZFS_AC_KERNEL_TYPES], [
|
||||
m4_foreach_w([type], [$1], [
|
||||
_ZFS_AC_KERNEL_TYPE(type)
|
||||
])
|
||||
])
|
||||
])
|
||||
|
||||
ZFS_AC_KERNEL_TYPES([intptr_t])
|
||||
@@ -37,6 +37,7 @@ dnl # only once the compilation can be done in parallel significantly
|
||||
dnl # speeding up the process.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_TYPES
|
||||
ZFS_AC_KERNEL_SRC_OBJTOOL
|
||||
ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE
|
||||
ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE
|
||||
@@ -165,9 +166,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
|
||||
ZFS_AC_KERNEL_SRC_WRITEPAGE_T
|
||||
ZFS_AC_KERNEL_SRC_RECLAIMED
|
||||
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
|
||||
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
|
||||
ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST
|
||||
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
|
||||
ZFS_AC_KERNEL_SRC_SYNC_BDEV
|
||||
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
|
||||
ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
|
||||
@@ -187,6 +191,7 @@ dnl #
|
||||
dnl # Check results of kernel interface tests.
|
||||
dnl #
|
||||
AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_TYPES
|
||||
ZFS_AC_KERNEL_ACCESS_OK_TYPE
|
||||
ZFS_AC_KERNEL_GLOBAL_PAGE_STATE
|
||||
ZFS_AC_KERNEL_OBJTOOL
|
||||
@@ -315,9 +320,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
|
||||
ZFS_AC_KERNEL_WRITEPAGE_T
|
||||
ZFS_AC_KERNEL_RECLAIMED
|
||||
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
|
||||
ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
|
||||
ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST
|
||||
ZFS_AC_KERNEL_COPY_SPLICE_READ
|
||||
ZFS_AC_KERNEL_SYNC_BDEV
|
||||
ZFS_AC_KERNEL_MM_PAGE_SIZE
|
||||
ZFS_AC_KERNEL_MM_PAGE_MAPPING
|
||||
case "$host_cpu" in
|
||||
powerpc*)
|
||||
ZFS_AC_KERNEL_CPU_HAS_FEATURE
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
dnl
|
||||
dnl backtrace(), for userspace assertions. glibc has this directly in libc.
|
||||
dnl FreeBSD and (sometimes) musl have it in a separate -lexecinfo. It's assumed
|
||||
dnl that this will also get the companion function backtrace_symbols().
|
||||
dnl
|
||||
AC_DEFUN([ZFS_AC_CONFIG_USER_BACKTRACE], [
|
||||
AX_SAVE_FLAGS
|
||||
LIBS=""
|
||||
AC_SEARCH_LIBS([backtrace], [execinfo], [
|
||||
AC_DEFINE(HAVE_BACKTRACE, 1, [backtrace() is available])
|
||||
AC_SUBST([BACKTRACE_LIBS], ["$LIBS"])
|
||||
])
|
||||
AX_RESTORE_FLAGS
|
||||
])
|
||||
@@ -0,0 +1,44 @@
|
||||
dnl
|
||||
dnl Checks for libunwind, which usually does a better job than backtrace() when
|
||||
dnl resolving symbols in the stack backtrace. Newer versions have support for
|
||||
dnl getting info about the object file the function came from, so we look for
|
||||
dnl that too and use it if found.
|
||||
dnl
|
||||
AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUNWIND], [
|
||||
AC_ARG_WITH([libunwind],
|
||||
AS_HELP_STRING([--with-libunwind],
|
||||
[use libunwind for backtraces in userspace assertions]),
|
||||
[],
|
||||
[with_libunwind=auto])
|
||||
|
||||
AS_IF([test "x$with_libunwind" != "xno"], [
|
||||
ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUNWIND, [libunwind], [libunwind.h], [], [unwind], [], [
|
||||
dnl unw_get_elf_filename() is sometimes a macro, other
|
||||
dnl times a proper symbol, so we can't just do a link
|
||||
dnl check; we need to include the header properly.
|
||||
AX_SAVE_FLAGS
|
||||
CFLAGS="$CFLAGS $LIBUNWIND_CFLAGS"
|
||||
LIBS="$LIBS $LIBUNWIND_LIBS"
|
||||
AC_MSG_CHECKING([for unw_get_elf_filename in libunwind])
|
||||
AC_LINK_IFELSE([
|
||||
AC_LANG_PROGRAM([
|
||||
#define UNW_LOCAL_ONLY
|
||||
#include <libunwind.h>
|
||||
], [
|
||||
unw_get_elf_filename(0, 0, 0, 0);
|
||||
])
|
||||
], [
|
||||
AC_MSG_RESULT([yes])
|
||||
AC_DEFINE(HAVE_LIBUNWIND_ELF, 1,
|
||||
[libunwind has unw_get_elf_filename])
|
||||
], [
|
||||
AC_MSG_RESULT([no])
|
||||
])
|
||||
AX_RESTORE_FLAGS
|
||||
], [
|
||||
AS_IF([test "x$with_libunwind" = "xyes"], [
|
||||
AC_MSG_FAILURE([--with-libunwind was given, but libunwind is not available, try installing libunwind-devel])
|
||||
])
|
||||
])
|
||||
])
|
||||
])
|
||||
+3
-1
@@ -26,12 +26,14 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
|
||||
ZFS_AC_CONFIG_USER_AIO_H
|
||||
ZFS_AC_CONFIG_USER_CLOCK_GETTIME
|
||||
ZFS_AC_CONFIG_USER_PAM
|
||||
ZFS_AC_CONFIG_USER_BACKTRACE
|
||||
ZFS_AC_CONFIG_USER_LIBUNWIND
|
||||
ZFS_AC_CONFIG_USER_RUNSTATEDIR
|
||||
ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS
|
||||
ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
|
||||
ZFS_AC_CONFIG_USER_ZFSEXEC
|
||||
|
||||
AC_CHECK_FUNCS([issetugid mlockall strlcat strlcpy])
|
||||
AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid])
|
||||
|
||||
AC_SUBST(RM)
|
||||
])
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
/zfs
|
||||
/zpool
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
nodist_bashcompletion_DATA = %D%/zfs
|
||||
SUBSTFILES += $(nodist_bashcompletion_DATA)
|
||||
nodist_bashcompletion_DATA = %D%/zfs %D%/zpool
|
||||
COMPLETION_FILES = %D%/zfs
|
||||
SUBSTFILES += $(COMPLETION_FILES)
|
||||
|
||||
SHELLCHECKSCRIPTS += $(nodist_bashcompletion_DATA)
|
||||
$(call SHELLCHECK_OPTS,$(nodist_bashcompletion_DATA)): SHELLCHECK_SHELL = bash
|
||||
SHELLCHECKSCRIPTS += $(COMPLETION_FILES)
|
||||
$(call SHELLCHECK_OPTS,$(COMPLETION_FILES)): SHELLCHECK_SHELL = bash
|
||||
|
||||
%D%/zpool: %D%/zfs
|
||||
$(LN_S) zfs $@
|
||||
|
||||
@@ -138,7 +138,8 @@ typedef int enum_t;
|
||||
#define readdir64 readdir
|
||||
#define dirent64 dirent
|
||||
#endif
|
||||
#define P2ALIGN(x, align) ((x) & -(align))
|
||||
// Deprecated. Use P2ALIGN_TYPED instead.
|
||||
// #define P2ALIGN(x, align) ((x) & -(align))
|
||||
#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
|
||||
#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)
|
||||
#define P2PHASE(x, align) ((x) & ((align) - 1))
|
||||
|
||||
@@ -31,13 +31,14 @@
|
||||
|
||||
#include_next <sys/sdt.h>
|
||||
#ifdef KDTRACE_HOOKS
|
||||
/* CSTYLED */
|
||||
/* BEGIN CSTYLED */
|
||||
SDT_PROBE_DECLARE(sdt, , , set__error);
|
||||
|
||||
#define SET_ERROR(err) \
|
||||
((sdt_sdt___set__error->id ? \
|
||||
(*sdt_probe_func)(sdt_sdt___set__error->id, \
|
||||
(uintptr_t)err, 0, 0, 0, 0) : 0), err)
|
||||
#define SET_ERROR(err) ({ \
|
||||
SDT_PROBE1(sdt, , , set__error, (uintptr_t)err); \
|
||||
err; \
|
||||
})
|
||||
/* END CSTYLED */
|
||||
#else
|
||||
#define SET_ERROR(err) (err)
|
||||
#endif
|
||||
|
||||
@@ -191,7 +191,8 @@ extern unsigned char bcd_to_byte[256];
|
||||
* eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
|
||||
* eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
|
||||
*/
|
||||
#define P2ALIGN(x, align) ((x) & -(align))
|
||||
// Deprecated. Use P2ALIGN_TYPED instead.
|
||||
// #define P2ALIGN(x, align) ((x) & -(align))
|
||||
|
||||
/*
|
||||
* return x % (mod) align
|
||||
|
||||
@@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* 6.11 API
|
||||
* Setting the flush flags directly is no longer possible; flush flags are set
|
||||
* on the queue_limits structure and passed to blk_alloc_disk(). In this case
|
||||
* we remove this function entirely.
|
||||
*
|
||||
* 4.7 API,
|
||||
* The blk_queue_write_cache() interface has replaced blk_queue_flush()
|
||||
* interface. However, the new interface is GPL-only thus we implement
|
||||
@@ -68,39 +73,43 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
|
||||
* new one is GPL-only. Thus if the GPL-only version is detected we
|
||||
* implement our own trivial helper.
|
||||
*/
|
||||
#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \
|
||||
!defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES)
|
||||
static inline void
|
||||
blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
|
||||
blk_queue_set_write_cache(struct request_queue *q, bool on)
|
||||
{
|
||||
#if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
|
||||
if (wc)
|
||||
if (on) {
|
||||
blk_queue_flag_set(QUEUE_FLAG_WC, q);
|
||||
else
|
||||
blk_queue_flag_clear(QUEUE_FLAG_WC, q);
|
||||
if (fua)
|
||||
blk_queue_flag_set(QUEUE_FLAG_FUA, q);
|
||||
else
|
||||
} else {
|
||||
blk_queue_flag_clear(QUEUE_FLAG_WC, q);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
|
||||
}
|
||||
#elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
|
||||
blk_queue_write_cache(q, wc, fua);
|
||||
blk_queue_write_cache(q, on, on);
|
||||
#elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
|
||||
if (wc)
|
||||
q->flush_flags |= REQ_FLUSH;
|
||||
if (fua)
|
||||
q->flush_flags |= REQ_FUA;
|
||||
if (on)
|
||||
q->flush_flags |= REQ_FLUSH | REQ_FUA;
|
||||
else
|
||||
q->flush_flags &= ~(REQ_FLUSH | REQ_FUA);
|
||||
#elif defined(HAVE_BLK_QUEUE_FLUSH)
|
||||
blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
|
||||
blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0);
|
||||
#else
|
||||
#error "Unsupported kernel"
|
||||
#endif
|
||||
}
|
||||
#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */
|
||||
|
||||
static inline void
|
||||
blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
|
||||
{
|
||||
#if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
|
||||
!defined(HAVE_DISK_UPDATE_READAHEAD)
|
||||
#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
|
||||
#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
|
||||
q->backing_dev_info->ra_pages = ra_pages;
|
||||
#elif defined(HAVE_BLK_QUEUE_DISK_BDI)
|
||||
q->disk->bdi->ra_pages = ra_pages;
|
||||
#else
|
||||
q->backing_dev_info.ra_pages = ra_pages;
|
||||
#endif
|
||||
|
||||
@@ -21,16 +21,23 @@
|
||||
|
||||
/*
|
||||
* Copyright (c) 2023, 2024, Klara Inc.
|
||||
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
|
||||
*/
|
||||
|
||||
#ifndef _ZFS_MM_COMPAT_H
|
||||
#define _ZFS_MM_COMPAT_H
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
|
||||
#ifndef HAVE_MM_PAGE_SIZE
|
||||
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
|
||||
#endif
|
||||
|
||||
/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */
|
||||
#ifndef HAVE_MM_PAGE_MAPPING
|
||||
#define page_mapping(p) folio_mapping(page_folio(p))
|
||||
#endif
|
||||
|
||||
#endif /* _ZFS_MM_COMPAT_H */
|
||||
|
||||
@@ -192,22 +192,25 @@ extern void spl_kmem_reap(void);
|
||||
extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache);
|
||||
extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
|
||||
|
||||
#ifndef SPL_KMEM_CACHE_IMPLEMENTING
|
||||
/*
|
||||
* Macros for the kmem_cache_* API expected by ZFS and SPL clients. We don't
|
||||
* define them inside spl-kmem-cache.c, as that uses the kernel's incompatible
|
||||
* kmem_cache_* facilities to implement ours.
|
||||
*/
|
||||
|
||||
/* Avoid conflicts with kernel names that might be implemented as macros. */
|
||||
#undef kmem_cache_alloc
|
||||
|
||||
#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \
|
||||
spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
|
||||
#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
|
||||
#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
|
||||
/*
|
||||
* This is necessary to be compatible with other kernel modules
|
||||
* or in-tree filesystems that may define kmem_cache_alloc,
|
||||
* as bcachefs now does.
|
||||
*/
|
||||
#ifdef kmem_cache_alloc
|
||||
#undef kmem_cache_alloc
|
||||
#endif
|
||||
#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
|
||||
#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
|
||||
#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc)
|
||||
#define kmem_reap() spl_kmem_reap()
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The following functions are only available for internal use.
|
||||
|
||||
@@ -159,7 +159,8 @@ makedev(unsigned int major, unsigned int minor)
|
||||
/*
|
||||
* Compatibility macros/typedefs needed for Solaris -> Linux port
|
||||
*/
|
||||
#define P2ALIGN(x, align) ((x) & -(align))
|
||||
// Deprecated. Use P2ALIGN_TYPED instead.
|
||||
// #define P2ALIGN(x, align) ((x) & -(align))
|
||||
#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
|
||||
#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)
|
||||
#define P2PHASE(x, align) ((x) & ((align) - 1))
|
||||
|
||||
@@ -38,7 +38,9 @@ typedef unsigned long ulong_t;
|
||||
typedef unsigned long long u_longlong_t;
|
||||
typedef long long longlong_t;
|
||||
|
||||
#ifndef HAVE_KERNEL_INTPTR_T
|
||||
typedef long intptr_t;
|
||||
#endif
|
||||
typedef unsigned long long rlim64_t;
|
||||
|
||||
typedef struct task_struct kthread_t;
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
snprintf(__get_str(msg), TRACE_DBUF_MSG_MAX, \
|
||||
DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS); \
|
||||
} else { \
|
||||
__assign_str(os_spa, "NULL") \
|
||||
__assign_str(os_spa, "NULL"); \
|
||||
__entry->ds_object = 0; \
|
||||
__entry->db_object = 0; \
|
||||
__entry->db_level = 0; \
|
||||
|
||||
@@ -173,6 +173,7 @@ typedef struct dsl_scan {
|
||||
dsl_scan_phys_t scn_phys; /* on disk representation of scan */
|
||||
dsl_scan_phys_t scn_phys_cached;
|
||||
avl_tree_t scn_queue; /* queue of datasets to scan */
|
||||
kmutex_t scn_queue_lock; /* serializes scn_queue inserts */
|
||||
uint64_t scn_queues_pending; /* outstanding data to issue */
|
||||
/* members needed for syncing error scrub status to disk */
|
||||
dsl_errorscrub_phys_t errorscrub_phys;
|
||||
|
||||
@@ -1175,8 +1175,8 @@ efi_use_whole_disk(int fd)
|
||||
* (for performance reasons). The alignment should match the
|
||||
* alignment used by the "zpool_label_disk" function.
|
||||
*/
|
||||
limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE,
|
||||
PARTITION_END_ALIGNMENT);
|
||||
limit = P2ALIGN_TYPED(efi_label->efi_last_lba - nblocks -
|
||||
EFI_MIN_RESV_SIZE, PARTITION_END_ALIGNMENT, diskaddr_t);
|
||||
if (data_start + data_size != limit || resv_start != limit)
|
||||
sync_needed = B_TRUE;
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
include $(srcdir)/%D%/include/Makefile.am
|
||||
|
||||
libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS)
|
||||
libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS)
|
||||
libspl_la_CFLAGS = $(libspl_assert_la_CFLAGS)
|
||||
|
||||
noinst_LTLIBRARIES += libspl_assert.la libspl.la
|
||||
@@ -43,3 +43,9 @@ libspl_la_LIBADD = \
|
||||
libspl_assert.la
|
||||
|
||||
libspl_la_LIBADD += $(LIBATOMIC_LIBS) $(LIBCLOCK_GETTIME)
|
||||
|
||||
libspl_assert_la_LIBADD = $(BACKTRACE_LIBS) $(LIBUNWIND_LIBS)
|
||||
|
||||
if BUILD_FREEBSD
|
||||
libspl_assert_la_LIBADD += -lpthread
|
||||
endif
|
||||
|
||||
+110
-2
@@ -22,8 +22,96 @@
|
||||
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
|
||||
* Use is subject to license terms.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <errno.h>
|
||||
#include <sys/prctl.h>
|
||||
#ifdef HAVE_GETTID
|
||||
#define libspl_gettid() gettid()
|
||||
#else
|
||||
#include <sys/syscall.h>
|
||||
#define libspl_gettid() ((pid_t)syscall(__NR_gettid))
|
||||
#endif
|
||||
#define libspl_getprogname() (program_invocation_short_name)
|
||||
#define libspl_getthreadname(buf, len) \
|
||||
prctl(PR_GET_NAME, (unsigned long)(buf), 0, 0, 0)
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#if !defined(__APPLE__)
|
||||
#include <pthread_np.h>
|
||||
#define libspl_gettid() pthread_getthreadid_np()
|
||||
#endif
|
||||
#define libspl_getprogname() getprogname()
|
||||
#define libspl_getthreadname(buf, len) \
|
||||
pthread_getname_np(pthread_self(), buf, len);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_LIBUNWIND)
|
||||
#define UNW_LOCAL_ONLY
|
||||
#include <libunwind.h>
|
||||
|
||||
static inline void
|
||||
libspl_dump_backtrace(void)
|
||||
{
|
||||
unw_context_t uc;
|
||||
unw_cursor_t cp;
|
||||
unw_word_t ip, off;
|
||||
char funcname[128];
|
||||
#ifdef HAVE_LIBUNWIND_ELF
|
||||
char objname[128];
|
||||
unw_word_t objoff;
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "Call trace:\n");
|
||||
unw_getcontext(&uc);
|
||||
unw_init_local(&cp, &uc);
|
||||
while (unw_step(&cp) > 0) {
|
||||
unw_get_reg(&cp, UNW_REG_IP, &ip);
|
||||
unw_get_proc_name(&cp, funcname, sizeof (funcname), &off);
|
||||
#ifdef HAVE_LIBUNWIND_ELF
|
||||
unw_get_elf_filename(&cp, objname, sizeof (objname), &objoff);
|
||||
fprintf(stderr, " [0x%08lx] %s+0x%2lx (in %s +0x%2lx)\n",
|
||||
ip, funcname, off, objname, objoff);
|
||||
#else
|
||||
fprintf(stderr, " [0x%08lx] %s+0x%2lx\n", ip, funcname, off);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#elif defined(HAVE_BACKTRACE)
|
||||
#include <execinfo.h>
|
||||
|
||||
static inline void
|
||||
libspl_dump_backtrace(void)
|
||||
{
|
||||
void *btptrs[100];
|
||||
size_t nptrs = backtrace(btptrs, 100);
|
||||
char **bt = backtrace_symbols(btptrs, nptrs);
|
||||
fprintf(stderr, "Call trace:\n");
|
||||
for (size_t i = 0; i < nptrs; i++)
|
||||
fprintf(stderr, " %s\n", bt[i]);
|
||||
free(bt);
|
||||
}
|
||||
#else
|
||||
#define libspl_dump_backtrace()
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
static inline uint64_t
|
||||
libspl_gettid(void)
|
||||
{
|
||||
uint64_t tid;
|
||||
|
||||
if (pthread_threadid_np(NULL, &tid) != 0)
|
||||
tid = 0;
|
||||
|
||||
return (tid);
|
||||
}
|
||||
#endif
|
||||
|
||||
static boolean_t libspl_assert_ok = B_FALSE;
|
||||
|
||||
@@ -33,21 +121,41 @@ libspl_set_assert_ok(boolean_t val)
|
||||
libspl_assert_ok = val;
|
||||
}
|
||||
|
||||
static pthread_mutex_t assert_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
||||
/* printf version of libspl_assert */
|
||||
void
|
||||
libspl_assertf(const char *file, const char *func, int line,
|
||||
const char *format, ...)
|
||||
{
|
||||
pthread_mutex_lock(&assert_lock);
|
||||
|
||||
va_list args;
|
||||
char tname[64];
|
||||
|
||||
libspl_getthreadname(tname, sizeof (tname));
|
||||
|
||||
fprintf(stderr, "ASSERT at %s:%d:%s()\n", file, line, func);
|
||||
|
||||
va_start(args, format);
|
||||
vfprintf(stderr, format, args);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
|
||||
va_end(args);
|
||||
|
||||
fprintf(stderr, "\n"
|
||||
" PID: %-8u COMM: %s\n"
|
||||
#if defined(__APPLE__)
|
||||
" TID: %-8" PRIu64 " NAME: %s\n",
|
||||
#else
|
||||
" TID: %-8u NAME: %s\n",
|
||||
#endif
|
||||
getpid(), libspl_getprogname(),
|
||||
libspl_gettid(), tname);
|
||||
|
||||
libspl_dump_backtrace();
|
||||
|
||||
#if !__has_feature(attribute_analyzer_noreturn) && !defined(__COVERITY__)
|
||||
if (libspl_assert_ok) {
|
||||
pthread_mutex_unlock(&assert_lock);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -57,6 +57,8 @@
|
||||
extern size_t spl_pagesize(void);
|
||||
#define PAGESIZE (spl_pagesize())
|
||||
|
||||
#ifndef HAVE_EXECVPE
|
||||
extern int execvpe(const char *name, char * const argv[], char * const envp[]);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -52,7 +52,8 @@
|
||||
/*
|
||||
* Compatibility macros/typedefs needed for Solaris -> Linux port
|
||||
*/
|
||||
#define P2ALIGN(x, align) ((x) & -(align))
|
||||
// Deprecated. Use P2ALIGN_TYPED instead.
|
||||
// #define P2ALIGN(x, align) ((x) & -(align))
|
||||
#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
|
||||
#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)
|
||||
#define P2BOUNDARY(off, len, align) \
|
||||
|
||||
@@ -2170,7 +2170,8 @@ out:
|
||||
static int
|
||||
send_conclusion_record(int fd, zio_cksum_t *zc)
|
||||
{
|
||||
dmu_replay_record_t drr = { 0 };
|
||||
dmu_replay_record_t drr;
|
||||
memset(&drr, 0, sizeof (dmu_replay_record_t));
|
||||
drr.drr_type = DRR_END;
|
||||
if (zc != NULL)
|
||||
drr.drr_u.drr_end.drr_checksum = *zc;
|
||||
@@ -2272,7 +2273,8 @@ send_prelim_records(zfs_handle_t *zhp, const char *from, int fd,
|
||||
}
|
||||
|
||||
if (!dryrun) {
|
||||
dmu_replay_record_t drr = { 0 };
|
||||
dmu_replay_record_t drr;
|
||||
memset(&drr, 0, sizeof (dmu_replay_record_t));
|
||||
/* write first begin record */
|
||||
drr.drr_type = DRR_BEGIN;
|
||||
drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
|
||||
|
||||
@@ -38,7 +38,8 @@
|
||||
#define ZFS_KMOD "openzfs"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef HAVE_EXECVPE
|
||||
/* FreeBSD prior to 15 lacks execvpe */
|
||||
static int
|
||||
execvPe(const char *name, const char *path, char * const *argv,
|
||||
char * const *envp)
|
||||
@@ -192,6 +193,7 @@ execvpe(const char *name, char * const argv[], char * const envp[])
|
||||
|
||||
return (execvPe(name, path, argv, envp));
|
||||
}
|
||||
#endif /* !HAVE_EXECVPE */
|
||||
|
||||
static __thread char errbuf[ERRBUFLEN];
|
||||
|
||||
|
||||
@@ -268,7 +268,8 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name)
|
||||
if (start_block == MAXOFFSET_T)
|
||||
start_block = NEW_START_BLOCK;
|
||||
slice_size -= start_block;
|
||||
slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT);
|
||||
slice_size = P2ALIGN_TYPED(slice_size, PARTITION_END_ALIGNMENT,
|
||||
uint64_t);
|
||||
|
||||
vtoc->efi_parts[0].p_start = start_block;
|
||||
vtoc->efi_parts[0].p_size = slice_size;
|
||||
|
||||
+16
-6
@@ -121,20 +121,26 @@ Controls whether buffers present on special vdevs are eligible for caching
|
||||
into L2ARC.
|
||||
If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
|
||||
.
|
||||
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int
|
||||
.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
|
||||
Controls whether only MFU metadata and data are cached from ARC into L2ARC.
|
||||
This may be desired to avoid wasting space on L2ARC when reading/writing large
|
||||
amounts of data that are not expected to be accessed more than once.
|
||||
.Pp
|
||||
The default is off,
|
||||
The default is 0,
|
||||
meaning both MRU and MFU data and metadata are cached.
|
||||
When turning off this feature, some MRU buffers will still be present
|
||||
in ARC and eventually cached on L2ARC.
|
||||
When turning off this feature (setting it to 0), some MRU buffers will
|
||||
still be present in ARC and eventually cached on L2ARC.
|
||||
.No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
|
||||
some prefetched buffers will be cached to L2ARC, and those might later
|
||||
transition to MRU, in which case the
|
||||
.Sy l2arc_mru_asize No arcstat will not be Sy 0 .
|
||||
.Pp
|
||||
Setting it to 1 means to L2 cache only MFU data and metadata.
|
||||
.Pp
|
||||
Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
|
||||
only MFU data (i.e. MRU data are not cached). This can be the right setting
|
||||
to cache as much metadata as possible even when having high data turnover.
|
||||
.Pp
|
||||
Regardless of
|
||||
.Sy l2arc_noprefetch ,
|
||||
some MFU buffers might be evicted from ARC,
|
||||
@@ -2324,8 +2330,8 @@ Prioritize requeued I/O.
|
||||
.
|
||||
.It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint
|
||||
Percentage of online CPUs which will run a worker thread for I/O.
|
||||
These workers are responsible for I/O work such as compression and
|
||||
checksum calculations.
|
||||
These workers are responsible for I/O work such as compression, encryption,
|
||||
checksum and parity calculations.
|
||||
Fractional number of CPUs will be rounded down.
|
||||
.Pp
|
||||
The default value of
|
||||
@@ -2333,6 +2339,7 @@ The default value of
|
||||
was chosen to avoid using all CPUs which can result in
|
||||
latency issues and inconsistent application performance,
|
||||
especially when slower compression and/or checksumming is enabled.
|
||||
A new value only applies to pools imported or created after it is set.
|
||||
.
|
||||
.It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
|
||||
Number of worker threads per taskq.
|
||||
@@ -2342,16 +2349,19 @@ while higher reduces lock contention.
|
||||
If
|
||||
.Sy 0 ,
|
||||
generate a system-dependent value close to 6 threads per taskq.
|
||||
A new value only applies to pools imported or created after it is set.
|
||||
.
|
||||
.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
|
||||
Set the queue and thread configuration for the IO read queues.
|
||||
This is an advanced debugging parameter.
|
||||
Don't change this unless you understand what it does.
|
||||
New values only apply to pools imported or created after they are set.
|
||||
.
|
||||
.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
|
||||
Set the queue and thread configuration for the IO write queues.
|
||||
This is an advanced debugging parameter.
|
||||
Don't change this unless you understand what it does.
|
||||
New values only apply to pools imported or created after they are set.
|
||||
.
|
||||
.It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
|
||||
Do not create zvol device nodes.
|
||||
|
||||
@@ -154,7 +154,7 @@ defaults to the current kernel release.
|
||||
.
|
||||
.It Sy bootfs.rollback Ns Op Sy = Ns Ar snapshot-name
|
||||
Execute
|
||||
.Nm zfs Cm snapshot Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name
|
||||
.Nm zfs Cm rollback Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name
|
||||
before pivoting to the real root.
|
||||
.Ar snapshot-name
|
||||
defaults to the current kernel release.
|
||||
|
||||
+4
-2
@@ -16,8 +16,8 @@ src = @abs_srcdir@
|
||||
obj = @abs_builddir@
|
||||
else
|
||||
zfs_include = $(srctree)/include/zfs
|
||||
icp_include = $(srctree)/$(src)/icp/include
|
||||
zstd_include = $(srctree)/$(src)/zstd/include
|
||||
icp_include = $(src)/icp/include
|
||||
zstd_include = $(src)/zstd/include
|
||||
ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
|
||||
endif
|
||||
|
||||
@@ -492,6 +492,8 @@ zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
|
||||
UBSAN_SANITIZE_zap_leaf.o := n
|
||||
UBSAN_SANITIZE_zap_micro.o := n
|
||||
UBSAN_SANITIZE_sa.o := n
|
||||
UBSAN_SANITIZE_zfs/zap_micro.o := n
|
||||
UBSAN_SANITIZE_zfs/sa.o := n
|
||||
|
||||
# Suppress incorrect warnings from versions of objtool which are not
|
||||
# aware of x86 EVEX prefix instructions used for AVX512.
|
||||
|
||||
@@ -118,7 +118,15 @@ const sha256_ops_t sha256_shani_impl = {
|
||||
};
|
||||
#endif
|
||||
|
||||
#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6)
|
||||
#elif defined(__aarch64__) || defined(__arm__)
|
||||
extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
|
||||
const sha256_ops_t sha256_armv7_impl = {
|
||||
.is_supported = sha2_is_supported,
|
||||
.transform = zfs_sha256_block_armv7,
|
||||
.name = "armv7"
|
||||
};
|
||||
|
||||
#if __ARM_ARCH > 6
|
||||
static boolean_t sha256_have_neon(void)
|
||||
{
|
||||
return (kfpu_allowed() && zfs_neon_available());
|
||||
@@ -129,13 +137,6 @@ static boolean_t sha256_have_armv8ce(void)
|
||||
return (kfpu_allowed() && zfs_sha256_available());
|
||||
}
|
||||
|
||||
extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
|
||||
const sha256_ops_t sha256_armv7_impl = {
|
||||
.is_supported = sha2_is_supported,
|
||||
.transform = zfs_sha256_block_armv7,
|
||||
.name = "armv7"
|
||||
};
|
||||
|
||||
TF(zfs_sha256_block_neon, tf_sha256_neon);
|
||||
const sha256_ops_t sha256_neon_impl = {
|
||||
.is_supported = sha256_have_neon,
|
||||
@@ -149,6 +150,7 @@ const sha256_ops_t sha256_armv8_impl = {
|
||||
.transform = tf_sha256_armv8ce,
|
||||
.name = "armv8-ce"
|
||||
};
|
||||
#endif
|
||||
|
||||
#elif defined(__PPC64__)
|
||||
static boolean_t sha256_have_isa207(void)
|
||||
@@ -192,11 +194,13 @@ static const sha256_ops_t *const sha256_impls[] = {
|
||||
#if defined(__x86_64) && defined(HAVE_SSE4_1)
|
||||
&sha256_shani_impl,
|
||||
#endif
|
||||
#if defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6)
|
||||
#if defined(__aarch64__) || defined(__arm__)
|
||||
&sha256_armv7_impl,
|
||||
#if __ARM_ARCH > 6
|
||||
&sha256_neon_impl,
|
||||
&sha256_armv8_impl,
|
||||
#endif
|
||||
#endif
|
||||
#if defined(__PPC64__)
|
||||
&sha256_ppc_impl,
|
||||
&sha256_power8_impl,
|
||||
|
||||
@@ -88,7 +88,7 @@ const sha512_ops_t sha512_avx2_impl = {
|
||||
};
|
||||
#endif
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
#elif defined(__aarch64__) || defined(__arm__)
|
||||
extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
|
||||
const sha512_ops_t sha512_armv7_impl = {
|
||||
.is_supported = sha2_is_supported,
|
||||
@@ -96,6 +96,7 @@ const sha512_ops_t sha512_armv7_impl = {
|
||||
.name = "armv7"
|
||||
};
|
||||
|
||||
#if defined(__aarch64__)
|
||||
static boolean_t sha512_have_armv8ce(void)
|
||||
{
|
||||
return (kfpu_allowed() && zfs_sha512_available());
|
||||
@@ -107,15 +108,9 @@ const sha512_ops_t sha512_armv8_impl = {
|
||||
.transform = tf_sha512_armv8ce,
|
||||
.name = "armv8-ce"
|
||||
};
|
||||
#endif
|
||||
|
||||
#elif defined(__arm__) && __ARM_ARCH > 6
|
||||
extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
|
||||
const sha512_ops_t sha512_armv7_impl = {
|
||||
.is_supported = sha2_is_supported,
|
||||
.transform = zfs_sha512_block_armv7,
|
||||
.name = "armv7"
|
||||
};
|
||||
|
||||
#if defined(__arm__) && __ARM_ARCH > 6
|
||||
static boolean_t sha512_have_neon(void)
|
||||
{
|
||||
return (kfpu_allowed() && zfs_neon_available());
|
||||
@@ -127,6 +122,7 @@ const sha512_ops_t sha512_neon_impl = {
|
||||
.transform = tf_sha512_neon,
|
||||
.name = "neon"
|
||||
};
|
||||
#endif
|
||||
|
||||
#elif defined(__PPC64__)
|
||||
TF(zfs_sha512_ppc, tf_sha512_ppc);
|
||||
@@ -164,14 +160,15 @@ static const sha512_ops_t *const sha512_impls[] = {
|
||||
#if defined(__x86_64) && defined(HAVE_AVX2)
|
||||
&sha512_avx2_impl,
|
||||
#endif
|
||||
#if defined(__aarch64__)
|
||||
#if defined(__aarch64__) || defined(__arm__)
|
||||
&sha512_armv7_impl,
|
||||
#if defined(__aarch64__)
|
||||
&sha512_armv8_impl,
|
||||
#endif
|
||||
#if defined(__arm__) && __ARM_ARCH > 6
|
||||
&sha512_armv7_impl,
|
||||
&sha512_neon_impl,
|
||||
#endif
|
||||
#endif
|
||||
#if defined(__PPC64__)
|
||||
&sha512_ppc_impl,
|
||||
&sha512_power8_impl,
|
||||
|
||||
@@ -21,8 +21,11 @@
|
||||
|
||||
#if defined(__arm__)
|
||||
|
||||
#define __ARM_ARCH__ 7
|
||||
#define __ARM_MAX_ARCH__ 7
|
||||
#ifndef __ARM_ARCH
|
||||
# define __ARM_ARCH__ 7
|
||||
#else
|
||||
# define __ARM_ARCH__ __ARM_ARCH
|
||||
#endif
|
||||
|
||||
#if defined(__thumb2__)
|
||||
.syntax unified
|
||||
@@ -1834,6 +1837,7 @@ zfs_sha256_block_armv7:
|
||||
#endif
|
||||
.size zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
|
||||
|
||||
#if __ARM_ARCH__ >= 7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
@@ -2766,4 +2770,5 @@ zfs_sha256_block_armv8:
|
||||
bx lr @ bx lr
|
||||
.size zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
|
||||
|
||||
#endif
|
||||
#endif // #if __ARM_ARCH__ >= 7
|
||||
#endif // #if defined(__arm__)
|
||||
|
||||
@@ -21,8 +21,11 @@
|
||||
|
||||
#if defined(__arm__)
|
||||
|
||||
#define __ARM_ARCH__ 7
|
||||
#define __ARM_MAX_ARCH__ 7
|
||||
#ifndef __ARM_ARCH
|
||||
# define __ARM_ARCH__ 7
|
||||
#else
|
||||
# define __ARM_ARCH__ __ARM_ARCH
|
||||
#endif
|
||||
|
||||
#ifndef __KERNEL__
|
||||
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
|
||||
@@ -490,6 +493,7 @@ zfs_sha512_block_armv7:
|
||||
#endif
|
||||
.size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
|
||||
|
||||
#if __ARM_ARCH__ >= 7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
@@ -1819,4 +1823,5 @@ zfs_sha512_block_neon:
|
||||
VFP_ABI_POP
|
||||
bx lr @ .word 0xe12fff1e
|
||||
.size zfs_sha512_block_neon,.-zfs_sha512_block_neon
|
||||
#endif
|
||||
#endif // #if __ARM_ARCH__ >= 7
|
||||
#endif // #if defined(__arm__)
|
||||
|
||||
+6
-2
@@ -832,12 +832,14 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism,
|
||||
crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext,
|
||||
crypto_spi_ctx_template_t template)
|
||||
{
|
||||
aes_ctx_t aes_ctx = {{{{0}}}};
|
||||
aes_ctx_t aes_ctx;
|
||||
off_t saved_offset;
|
||||
size_t saved_length;
|
||||
size_t length_needed;
|
||||
int ret;
|
||||
|
||||
memset(&aes_ctx, 0, sizeof (aes_ctx_t));
|
||||
|
||||
ASSERT(ciphertext != NULL);
|
||||
|
||||
/*
|
||||
@@ -956,12 +958,14 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism,
|
||||
crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext,
|
||||
crypto_spi_ctx_template_t template)
|
||||
{
|
||||
aes_ctx_t aes_ctx = {{{{0}}}};
|
||||
aes_ctx_t aes_ctx;
|
||||
off_t saved_offset;
|
||||
size_t saved_length;
|
||||
size_t length_needed;
|
||||
int ret;
|
||||
|
||||
memset(&aes_ctx, 0, sizeof (aes_ctx_t));
|
||||
|
||||
ASSERT(plaintext != NULL);
|
||||
|
||||
/*
|
||||
|
||||
@@ -457,7 +457,7 @@ vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
|
||||
ZFS_LOG(1, "Reading config from %s...", pp->name);
|
||||
|
||||
psize = pp->mediasize;
|
||||
psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
|
||||
psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
|
||||
|
||||
size = sizeof (*vdev_lists[0]) + pp->sectorsize -
|
||||
((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
|
||||
|
||||
@@ -543,6 +543,7 @@ zfs_rmnode(znode_t *zp)
|
||||
dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
|
||||
|
||||
zfs_znode_delete(zp, tx);
|
||||
zfs_znode_free(zp);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
|
||||
|
||||
@@ -1169,10 +1169,25 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
|
||||
return (error);
|
||||
}
|
||||
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
|
||||
|
||||
error = zfs_link_create(dzp, name, zp, tx, ZNEW);
|
||||
if (error != 0) {
|
||||
/*
|
||||
* Since we failed to add the directory entry for it,
|
||||
* delete the newly created dnode.
|
||||
*/
|
||||
zfs_znode_delete(zp, tx);
|
||||
VOP_UNLOCK1(ZTOV(zp));
|
||||
zrele(zp);
|
||||
zfs_acl_ids_free(&acl_ids);
|
||||
dmu_tx_commit(tx);
|
||||
getnewvnode_drop_reserve();
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (fuid_dirtied)
|
||||
zfs_fuid_sync(zfsvfs, tx);
|
||||
|
||||
(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
|
||||
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
|
||||
zfs_log_create(zilog, tx, txtype, dzp, zp, name,
|
||||
vsecp, acl_ids.z_fuidp, vap);
|
||||
@@ -1520,13 +1535,19 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
|
||||
*/
|
||||
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
|
||||
|
||||
if (fuid_dirtied)
|
||||
zfs_fuid_sync(zfsvfs, tx);
|
||||
|
||||
/*
|
||||
* Now put new name in parent dir.
|
||||
*/
|
||||
(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
|
||||
error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
|
||||
if (error != 0) {
|
||||
zfs_znode_delete(zp, tx);
|
||||
VOP_UNLOCK1(ZTOV(zp));
|
||||
zrele(zp);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (fuid_dirtied)
|
||||
zfs_fuid_sync(zfsvfs, tx);
|
||||
|
||||
*zpp = zp;
|
||||
|
||||
@@ -1534,6 +1555,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
|
||||
zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
|
||||
acl_ids.z_fuidp, vap);
|
||||
|
||||
out:
|
||||
zfs_acl_ids_free(&acl_ids);
|
||||
|
||||
dmu_tx_commit(tx);
|
||||
@@ -1544,7 +1566,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
|
||||
zil_commit(zilog, 0);
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (0);
|
||||
return (error);
|
||||
}
|
||||
|
||||
#if __FreeBSD_version < 1300124
|
||||
@@ -3578,10 +3600,14 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
|
||||
/*
|
||||
* Insert the new object into the directory.
|
||||
*/
|
||||
(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
|
||||
|
||||
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
|
||||
*zpp = zp;
|
||||
error = zfs_link_create(dzp, name, zp, tx, ZNEW);
|
||||
if (error != 0) {
|
||||
zfs_znode_delete(zp, tx);
|
||||
VOP_UNLOCK1(ZTOV(zp));
|
||||
zrele(zp);
|
||||
} else {
|
||||
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
|
||||
}
|
||||
|
||||
zfs_acl_ids_free(&acl_ids);
|
||||
|
||||
@@ -3589,8 +3615,12 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
|
||||
|
||||
getnewvnode_drop_reserve();
|
||||
|
||||
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
||||
zil_commit(zilog, 0);
|
||||
if (error == 0) {
|
||||
*zpp = zp;
|
||||
|
||||
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
||||
zil_commit(zilog, 0);
|
||||
}
|
||||
|
||||
zfs_exit(zfsvfs, FTAG);
|
||||
return (error);
|
||||
@@ -6238,7 +6268,6 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
|
||||
struct vnode *invp = ap->a_invp;
|
||||
struct vnode *outvp = ap->a_outvp;
|
||||
struct mount *mp;
|
||||
struct uio io;
|
||||
int error;
|
||||
uint64_t len = *ap->a_lenp;
|
||||
|
||||
@@ -6286,12 +6315,6 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
|
||||
goto out_locked;
|
||||
#endif
|
||||
|
||||
io.uio_offset = *ap->a_outoffp;
|
||||
io.uio_resid = *ap->a_lenp;
|
||||
error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd);
|
||||
if (error != 0)
|
||||
goto out_locked;
|
||||
|
||||
error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
|
||||
ap->a_outoffp, &len, ap->a_outcred);
|
||||
if (error == EXDEV || error == EAGAIN || error == EINVAL ||
|
||||
|
||||
@@ -1234,7 +1234,6 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
|
||||
VERIFY0(dmu_object_free(os, obj, tx));
|
||||
zfs_znode_dmu_fini(zp);
|
||||
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
|
||||
zfs_znode_free(zp);
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -21,6 +21,8 @@
|
||||
* with the SPL. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#define SPL_KMEM_CACHE_IMPLEMENTING
|
||||
|
||||
#include <linux/percpu_compat.h>
|
||||
#include <sys/kmem.h>
|
||||
#include <sys/kmem_cache.h>
|
||||
@@ -33,16 +35,6 @@
|
||||
#include <linux/swap.h>
|
||||
#include <linux/prefetch.h>
|
||||
|
||||
/*
|
||||
* Within the scope of spl-kmem.c file the kmem_cache_* definitions
|
||||
* are removed to allow access to the real Linux slab allocator.
|
||||
*/
|
||||
#undef kmem_cache_destroy
|
||||
#undef kmem_cache_create
|
||||
#undef kmem_cache_alloc
|
||||
#undef kmem_cache_free
|
||||
|
||||
|
||||
/*
|
||||
* Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
|
||||
* with smp_mb__{before,after}_atomic() because they were redundant. This is
|
||||
|
||||
@@ -22,6 +22,9 @@
|
||||
*
|
||||
* Solaris Porting Layer (SPL) Proc Implementation.
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
|
||||
*/
|
||||
|
||||
#include <sys/systeminfo.h>
|
||||
#include <sys/kstat.h>
|
||||
@@ -43,6 +46,12 @@ typedef struct ctl_table __no_const spl_ctl_table;
|
||||
typedef struct ctl_table spl_ctl_table;
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
|
||||
#define CONST_CTL_TABLE const struct ctl_table
|
||||
#else
|
||||
#define CONST_CTL_TABLE struct ctl_table
|
||||
#endif
|
||||
|
||||
static unsigned long table_min = 0;
|
||||
static unsigned long table_max = ~0;
|
||||
|
||||
@@ -60,7 +69,7 @@ struct proc_dir_entry *proc_spl_kstat = NULL;
|
||||
|
||||
#ifdef DEBUG_KMEM
|
||||
static int
|
||||
proc_domemused(struct ctl_table *table, int write,
|
||||
proc_domemused(CONST_CTL_TABLE *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int rc = 0;
|
||||
@@ -88,7 +97,7 @@ proc_domemused(struct ctl_table *table, int write,
|
||||
#endif /* DEBUG_KMEM */
|
||||
|
||||
static int
|
||||
proc_doslab(struct ctl_table *table, int write,
|
||||
proc_doslab(CONST_CTL_TABLE *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int rc = 0;
|
||||
@@ -135,7 +144,7 @@ proc_doslab(struct ctl_table *table, int write,
|
||||
}
|
||||
|
||||
static int
|
||||
proc_dohostid(struct ctl_table *table, int write,
|
||||
proc_dohostid(CONST_CTL_TABLE *table, int write,
|
||||
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
char *end, str[32];
|
||||
@@ -688,6 +697,37 @@ static void spl_proc_cleanup(void)
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef HAVE_REGISTER_SYSCTL_TABLE
|
||||
|
||||
/*
|
||||
* Traditionally, struct ctl_table arrays have been terminated by an "empty"
|
||||
* sentinel element (specifically, one with .procname == NULL).
|
||||
*
|
||||
* Linux 6.6 began migrating away from this, adding register_sysctl_sz() so
|
||||
* that callers could provide the size directly, and redefining
|
||||
* register_sysctl() to just call register_sysctl_sz() with the array size. It
|
||||
* retained support for the terminating element so that existing callers would
|
||||
* continue to work.
|
||||
*
|
||||
* Linux 6.11 removed support for the terminating element, instead interpreting
|
||||
* it as a real malformed element, and rejecting it.
|
||||
*
|
||||
* In order to continue supporting older kernels, we retain the terminating
|
||||
* sentinel element for our sysctl tables, but instead detect availability of
|
||||
* register_sysctl_sz(). If it exists, we pass it the array size -1, stopping
|
||||
* the kernel from trying to process the terminator. For pre-6.6 kernels that
|
||||
* don't have register_sysctl_sz(), we just use register_sysctl(), which can
|
||||
* handle the terminating element as it always has.
|
||||
*/
|
||||
#ifdef HAVE_REGISTER_SYSCTL_SZ
|
||||
#define spl_proc_register_sysctl(p, t) \
|
||||
register_sysctl_sz(p, t, ARRAY_SIZE(t)-1)
|
||||
#else
|
||||
#define spl_proc_register_sysctl(p, t) \
|
||||
register_sysctl(p, t)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
int
|
||||
spl_proc_init(void)
|
||||
{
|
||||
@@ -698,16 +738,17 @@ spl_proc_init(void)
|
||||
if (spl_header == NULL)
|
||||
return (-EUNATCH);
|
||||
#else
|
||||
spl_header = register_sysctl("kernel/spl", spl_table);
|
||||
spl_header = spl_proc_register_sysctl("kernel/spl", spl_table);
|
||||
if (spl_header == NULL)
|
||||
return (-EUNATCH);
|
||||
|
||||
spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table);
|
||||
spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table);
|
||||
if (spl_kmem == NULL) {
|
||||
rc = -EUNATCH;
|
||||
goto out;
|
||||
}
|
||||
spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table);
|
||||
spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat",
|
||||
spl_kstat_table);
|
||||
if (spl_kstat == NULL) {
|
||||
rc = -EUNATCH;
|
||||
goto out;
|
||||
|
||||
@@ -1015,10 +1015,50 @@ abd_cache_reap_now(void)
|
||||
}
|
||||
|
||||
#if defined(_KERNEL)
|
||||
|
||||
/*
|
||||
* Yield the next page struct and data offset and size within it, without
|
||||
* This is abd_iter_page(), the function underneath abd_iterate_page_func().
|
||||
* It yields the next page struct and data offset and size within it, without
|
||||
* mapping it into the address space.
|
||||
*/
|
||||
|
||||
/*
|
||||
* "Compound pages" are a group of pages that can be referenced from a single
|
||||
* struct page *. It's organised as a "head" page, followed by a series of
|
||||
* "tail" pages.
|
||||
*
|
||||
* In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
|
||||
* get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
|
||||
* great many of the IO buffers we get are going to be of this type.
|
||||
*
|
||||
* The tail pages are just regular PAGESIZE pages, and can be safely used
|
||||
* as-is. However, the head page has length covering itself and all the tail
|
||||
* pages. If the ABD chunk spans multiple pages, then we can use the head page
|
||||
* and a >PAGESIZE length, which is far more efficient.
|
||||
*
|
||||
* Before kernel 4.5 however, compound page heads were refcounted separately
|
||||
* from tail pages, such that moving back to the head page would require us to
|
||||
* take a reference to it and releasing it once we're completely finished with
|
||||
* it. In practice, that means when our caller is done with the ABD, which we
|
||||
* have no insight into from here. Rather than contort this API to track head
|
||||
* page references on such ancient kernels, we disable this special compound
|
||||
* page handling before 4.5, instead just treating each page within it as a
|
||||
* regular PAGESIZE page (which it is). This is slightly less efficient, but
|
||||
* makes everything far simpler.
|
||||
*
|
||||
* The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
|
||||
* special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
|
||||
* understand compound pages, or not, as required.
|
||||
*/
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
|
||||
#define ABD_ITER_COMPOUND_PAGES 1
|
||||
#define ABD_ITER_PAGE_SIZE(page) \
|
||||
(PageCompound(page) ? page_size(page) : PAGESIZE)
|
||||
#else
|
||||
#undef ABD_ITER_COMPOUND_PAGES
|
||||
#define ABD_ITER_PAGE_SIZE(page) (PAGESIZE)
|
||||
#endif
|
||||
|
||||
void
|
||||
abd_iter_page(struct abd_iter *aiter)
|
||||
{
|
||||
@@ -1032,6 +1072,12 @@ abd_iter_page(struct abd_iter *aiter)
|
||||
struct page *page;
|
||||
size_t doff, dsize;
|
||||
|
||||
/*
|
||||
* Find the page, and the start of the data within it. This is computed
|
||||
* differently for linear and scatter ABDs; linear is referenced by
|
||||
* virtual memory location, while scatter is referenced by page
|
||||
* pointer.
|
||||
*/
|
||||
if (abd_is_linear(aiter->iter_abd)) {
|
||||
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
|
||||
|
||||
@@ -1044,57 +1090,24 @@ abd_iter_page(struct abd_iter *aiter)
|
||||
|
||||
/* offset of address within the page */
|
||||
doff = offset_in_page(paddr);
|
||||
|
||||
/* total data remaining in abd from this position */
|
||||
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
|
||||
} else {
|
||||
ASSERT(!abd_is_gang(aiter->iter_abd));
|
||||
|
||||
/* current scatter page */
|
||||
page = sg_page(aiter->iter_sg);
|
||||
page = nth_page(sg_page(aiter->iter_sg),
|
||||
aiter->iter_offset >> PAGE_SHIFT);
|
||||
|
||||
/* position within page */
|
||||
doff = aiter->iter_offset;
|
||||
|
||||
/* remaining data in scatterlist */
|
||||
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
|
||||
aiter->iter_abd->abd_size - aiter->iter_pos);
|
||||
doff = aiter->iter_offset & (PAGESIZE - 1);
|
||||
}
|
||||
ASSERT(page);
|
||||
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
|
||||
#ifdef ABD_ITER_COMPOUND_PAGES
|
||||
if (PageTail(page)) {
|
||||
/*
|
||||
* This page is part of a "compound page", which is a group of
|
||||
* pages that can be referenced from a single struct page *.
|
||||
* Its organised as a "head" page, followed by a series of
|
||||
* "tail" pages.
|
||||
*
|
||||
* In OpenZFS, compound pages are allocated using the
|
||||
* __GFP_COMP flag, which we get from scatter ABDs and SPL
|
||||
* vmalloc slabs (ie >16K allocations). So a great many of the
|
||||
* IO buffers we get are going to be of this type.
|
||||
*
|
||||
* The tail pages are just regular PAGE_SIZE pages, and can be
|
||||
* safely used as-is. However, the head page has length
|
||||
* covering itself and all the tail pages. If this ABD chunk
|
||||
* spans multiple pages, then we can use the head page and a
|
||||
* >PAGE_SIZE length, which is far more efficient.
|
||||
*
|
||||
* To do this, we need to adjust the offset to be counted from
|
||||
* the head page. struct page for compound pages are stored
|
||||
* contiguously, so we can just adjust by a simple offset.
|
||||
*
|
||||
* Before kernel 4.5, compound page heads were refcounted
|
||||
* separately, such that moving back to the head page would
|
||||
* require us to take a reference to it and releasing it once
|
||||
* we're completely finished with it. In practice, that means
|
||||
* when our caller is done with the ABD, which we have no
|
||||
* insight into from here. Rather than contort this API to
|
||||
* track head page references on such ancient kernels, we just
|
||||
* compile this block out and use the tail pages directly. This
|
||||
* is slightly less efficient, but makes everything far
|
||||
* simpler.
|
||||
* If this is a compound tail page, move back to the head, and
|
||||
* adjust the offset to match. This may let us yield a much
|
||||
* larger amount of data from a single logical page, and so
|
||||
* leave our caller with fewer pages to process.
|
||||
*/
|
||||
struct page *head = compound_head(page);
|
||||
doff += ((page - head) * PAGESIZE);
|
||||
@@ -1102,12 +1115,27 @@ abd_iter_page(struct abd_iter *aiter)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* final page and position within it */
|
||||
ASSERT(page);
|
||||
|
||||
/*
|
||||
* Compute the maximum amount of data we can take from this page. This
|
||||
* is the smaller of:
|
||||
* - the remaining space in the page
|
||||
* - the remaining space in this scatterlist entry (which may not cover
|
||||
* the entire page)
|
||||
* - the remaining space in the abd (which may not cover the entire
|
||||
* scatterlist entry)
|
||||
*/
|
||||
dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
|
||||
aiter->iter_abd->abd_size - aiter->iter_pos);
|
||||
if (!abd_is_linear(aiter->iter_abd))
|
||||
dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
|
||||
ASSERT3U(dsize, >, 0);
|
||||
|
||||
/* final iterator outputs */
|
||||
aiter->iter_page = page;
|
||||
aiter->iter_page_doff = doff;
|
||||
|
||||
/* amount of data in the chunk, up to the end of the page */
|
||||
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
|
||||
aiter->iter_page_dsize = dsize;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -150,7 +150,11 @@ vdev_bdev_mode(spa_mode_t smode)
|
||||
/*
 * Return the capacity of the given block device. Kernels that provide
 * bdev_nr_bytes() (HAVE_BDEV_NR_BYTES) are asked directly; otherwise the
 * size is read from the device's bd_inode via i_size_read().
 */
static uint64_t
|
||||
bdev_capacity(struct block_device *bdev)
|
||||
{
|
||||
#ifdef HAVE_BDEV_NR_BYTES
|
||||
return (bdev_nr_bytes(bdev));
|
||||
#else
|
||||
return (i_size_read(bdev->bd_inode));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined(HAVE_BDEV_WHOLE)
|
||||
@@ -209,7 +213,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
|
||||
* "reserved" EFI partition: in such cases return the device
|
||||
* usable capacity.
|
||||
*/
|
||||
available = i_size_read(bdev_whole(bdev)->bd_inode) -
|
||||
available = bdev_capacity(bdev_whole(bdev)) -
|
||||
((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
|
||||
PARTITION_END_ALIGNMENT) << SECTOR_BITS);
|
||||
psize = MAX(available, bdev_capacity(bdev));
|
||||
@@ -916,12 +920,12 @@ vdev_disk_io_rw(zio_t *zio)
|
||||
/*
|
||||
* Accessing outside the block device is never allowed.
|
||||
*/
|
||||
if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
|
||||
if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
|
||||
vdev_dbgmsg(zio->io_vd,
|
||||
"Illegal access %llu size %llu, device size %llu",
|
||||
(u_longlong_t)zio->io_offset,
|
||||
(u_longlong_t)zio->io_size,
|
||||
(u_longlong_t)i_size_read(bdev->bd_inode));
|
||||
(u_longlong_t)bdev_capacity(bdev));
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
|
||||
@@ -1116,12 +1120,12 @@ vdev_classic_physio(zio_t *zio)
|
||||
/*
|
||||
* Accessing outside the block device is never allowed.
|
||||
*/
|
||||
if (io_offset + io_size > bdev->bd_inode->i_size) {
|
||||
if (io_offset + io_size > bdev_capacity(bdev)) {
|
||||
vdev_dbgmsg(zio->io_vd,
|
||||
"Illegal access %llu size %llu, device size %llu",
|
||||
(u_longlong_t)io_offset,
|
||||
(u_longlong_t)io_size,
|
||||
(u_longlong_t)i_size_read(bdev->bd_inode));
|
||||
(u_longlong_t)bdev_capacity(bdev));
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
|
||||
|
||||
@@ -110,8 +110,17 @@ zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
|
||||
}
|
||||
|
||||
/* kobject_put() will call zfs_kobj_release() to release memory */
|
||||
kobject_del(&zkobj->zko_kobj);
|
||||
kobject_put(&zkobj->zko_kobj);
|
||||
/*
|
||||
* Special case note:
|
||||
*
|
||||
* We have to check for 'zkobj->zko_kobj.name != NULL' as
|
||||
* a workaround for #16249 which was added to zfs-2.2.4
|
||||
* and fixed (with this change) in zfs-2.2.5.
|
||||
*/
|
||||
if (zkobj->zko_kobj.name != NULL) {
|
||||
kobject_del(&zkobj->zko_kobj);
|
||||
kobject_put(&zkobj->zko_kobj);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -69,6 +69,7 @@
|
||||
#include <sys/zpl.h>
|
||||
#include <sys/zil.h>
|
||||
#include <sys/sa_impl.h>
|
||||
#include <linux/mm_compat.h>
|
||||
|
||||
/*
|
||||
* Programming rules.
|
||||
|
||||
+278
-97
@@ -20,6 +20,7 @@
|
||||
*/
|
||||
/*
|
||||
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
|
||||
* Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
|
||||
*/
|
||||
|
||||
#include <sys/dataset_kstats.h>
|
||||
@@ -41,6 +42,7 @@
|
||||
|
||||
#include <linux/blkdev_compat.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#ifdef HAVE_BLK_MQ
|
||||
#include <linux/blk-mq.h>
|
||||
@@ -384,7 +386,7 @@ zvol_discard(zv_request_t *zvr)
|
||||
*/
|
||||
if (!io_is_secure_erase(bio, rq)) {
|
||||
start = P2ROUNDUP(start, zv->zv_volblocksize);
|
||||
end = P2ALIGN(end, zv->zv_volblocksize);
|
||||
end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
|
||||
size = end - start;
|
||||
}
|
||||
|
||||
@@ -729,7 +731,7 @@ retry:
|
||||
#endif
|
||||
if (zv == NULL) {
|
||||
rw_exit(&zvol_state_lock);
|
||||
return (SET_ERROR(-ENXIO));
|
||||
return (-SET_ERROR(ENXIO));
|
||||
}
|
||||
|
||||
mutex_enter(&zv->zv_state_lock);
|
||||
@@ -793,10 +795,10 @@ retry:
|
||||
|
||||
#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
|
||||
schedule();
|
||||
return (SET_ERROR(-ERESTARTSYS));
|
||||
return (-SET_ERROR(ERESTARTSYS));
|
||||
#else
|
||||
if ((gethrtime() - start) > timeout)
|
||||
return (SET_ERROR(-ERESTARTSYS));
|
||||
return (-SET_ERROR(ERESTARTSYS));
|
||||
|
||||
schedule_timeout(MSEC_TO_TICK(10));
|
||||
goto retry;
|
||||
@@ -818,7 +820,7 @@ retry:
|
||||
if (zv->zv_open_count == 0)
|
||||
zvol_last_close(zv);
|
||||
|
||||
error = SET_ERROR(-EROFS);
|
||||
error = -SET_ERROR(EROFS);
|
||||
} else {
|
||||
zv->zv_open_count++;
|
||||
}
|
||||
@@ -1073,8 +1075,159 @@ static const struct block_device_operations zvol_ops = {
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* Since 6.9, Linux has been removing queue limit setters in favour of an
|
||||
* initial queue_limits struct applied when the device is open. Since 6.11,
|
||||
* queue_limits is being extended to allow more things to be applied when the
|
||||
* device is open. Setters are also being removed for this.
|
||||
*
|
||||
* For OpenZFS, this means that depending on kernel version, some options may
|
||||
* be set up before the device is open, and some applied to an open device
|
||||
* (queue) after the fact.
|
||||
*
|
||||
* We manage this complexity by having our own limits struct,
|
||||
* zvol_queue_limits_t, in which we carry any queue config that we're
|
||||
* interested in setting. This structure is the same on all kernels.
|
||||
*
|
||||
* These limits are then applied to the queue at device open time by the most
|
||||
* appropriate method for the kernel.
|
||||
*
|
||||
* zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
|
||||
* blk_alloc_disk() exists). This converts our limits struct to a proper Linux
|
||||
* struct queue_limits, and passes it in. Any fields added in later kernels are
|
||||
* (obviously) not set up here.
|
||||
*
|
||||
* zvol_queue_limits_apply() is called on all kernel versions after the queue
|
||||
* is created, and applies any remaining config. Before 6.9 that will be
|
||||
* everything, via setter methods. After 6.9 that will be whatever couldn't be
|
||||
* put into struct queue_limits. (This implies that zvol_queue_limits_apply()
|
||||
* will always be a no-op on the latest kernel we support).
|
||||
*/
|
||||
/*
 * Kernel-independent carrier for the queue settings of a zvol. It is filled
 * in by zvol_queue_limits_init() and consumed by zvol_queue_limits_convert()
 * (copied field-by-field into a struct queue_limits) and/or
 * zvol_queue_limits_apply() (passed to the blk_queue_*() setters).
 * zql_max_discard_sectors is used for both max_discard_sectors and
 * max_hw_discard_sectors when converting.
 */
typedef struct zvol_queue_limits {
|
||||
unsigned int zql_max_hw_sectors;
|
||||
unsigned short zql_max_segments;
|
||||
unsigned int zql_max_segment_size;
|
||||
unsigned int zql_io_opt;
|
||||
unsigned int zql_physical_block_size;
|
||||
unsigned int zql_max_discard_sectors;
|
||||
unsigned int zql_discard_granularity;
|
||||
} zvol_queue_limits_t;
|
||||
|
||||
/*
 * Fill in *limits for the given zvol. Every field of the structure is
 * assigned here. Only zv->zv_volblocksize is read from zv, so it must be set
 * before this is called. use_blk_mq selects between segment limits derived
 * from zvol_blk_mq_blocks_per_thread and "max everything out".
 */
static void
|
||||
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
|
||||
boolean_t use_blk_mq)
|
||||
{
|
||||
limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
|
||||
|
||||
if (use_blk_mq) {
|
||||
/*
|
||||
* IO requests can be really big (1MB). When an IO request
|
||||
* comes in, it is passed off to zvol_read() or zvol_write()
|
||||
* in a new thread, where it is chunked up into 'volblocksize'
|
||||
* sized pieces and processed. So for example, if the request
|
||||
* is a 1MB write and your volblocksize is 128k, one zvol_write
|
||||
* thread will take that request and sequentially do eight 128k
|
||||
* IOs. This is due to the fact that the thread needs to lock
|
||||
* each volblocksize sized block. So you might be wondering:
|
||||
* "instead of passing the whole 1MB request to one thread,
|
||||
* why not pass eight individual 128k chunks to eight threads and
|
||||
* process the whole write in parallel?" The short answer is
|
||||
* that there's a sweet spot number of chunks that balances
|
||||
* the greater parallelism with the added overhead of more
|
||||
* threads. The sweet spot can be different depending on if you
|
||||
* have a read or write heavy workload. Writes typically want
|
||||
* high chunk counts while reads typically want lower ones. On
|
||||
* a test pool with 6 NVMe drives in a 3x 2-disk mirror
|
||||
* configuration, with volblocksize=8k, the sweet spot for good
|
||||
* sequential reads and writes was at 8 chunks.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Below we tell the kernel how big we want our requests
|
||||
* to be. You would think that blk_queue_io_opt() would be
|
||||
* used to do this since it is used to "set optimal request
|
||||
* size for the queue", but that doesn't seem to do
|
||||
* anything - the kernel still gives you huge requests
|
||||
* with tons of little PAGE_SIZE segments contained within them.
|
||||
*
|
||||
* Knowing that the kernel will just give you PAGE_SIZE segments
|
||||
* no matter what, you can say "ok, I want PAGE_SIZE byte
|
||||
* segments, and I want 'N' of them per request", where N is
|
||||
* the correct number of segments for the volblocksize and
|
||||
* number of chunks you want.
|
||||
*/
|
||||
#ifdef HAVE_BLK_MQ
|
||||
if (zvol_blk_mq_blocks_per_thread != 0) {
|
||||
unsigned int chunks;
|
||||
chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
|
||||
|
||||
limits->zql_max_segment_size = PAGE_SIZE;
|
||||
/*
 * NOTE(review): zql_max_segments is an unsigned short, so this
 * store narrows the result. A product larger than
 * UINT16_MAX * PAGE_SIZE would be truncated, and a product
 * smaller than PAGE_SIZE yields 0 - confirm the inputs are
 * bounded so neither can happen.
 */
limits->zql_max_segments =
|
||||
(zv->zv_volblocksize * chunks) / PAGE_SIZE;
|
||||
} else {
|
||||
/*
|
||||
* Special case: zvol_blk_mq_blocks_per_thread = 0
|
||||
* Max everything out.
|
||||
*/
|
||||
limits->zql_max_segments = UINT16_MAX;
|
||||
limits->zql_max_segment_size = UINT_MAX;
|
||||
}
|
||||
} else {
|
||||
/*
 * Without HAVE_BLK_MQ everything from the #ifdef above to this #endif
 * is compiled out, so the two assignments below become the body of
 * the if (use_blk_mq) block instead of its else branch.
 */
#endif
|
||||
limits->zql_max_segments = UINT16_MAX;
|
||||
limits->zql_max_segment_size = UINT_MAX;
|
||||
}
|
||||
|
||||
limits->zql_io_opt = zv->zv_volblocksize;
|
||||
|
||||
limits->zql_physical_block_size = zv->zv_volblocksize;
|
||||
limits->zql_max_discard_sectors =
|
||||
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
|
||||
limits->zql_discard_granularity = zv->zv_volblocksize;
|
||||
}
|
||||
|
||||
#ifdef HAVE_BLK_ALLOC_DISK_2ARG
|
||||
/*
 * Translate our zvol_queue_limits_t into a Linux struct queue_limits.
 * *qlimits is zeroed first, so any field not assigned below is left as 0.
 * The features word is only set when HAVE_BLKDEV_QUEUE_LIMITS_FEATURES is
 * defined.
 */
static void
|
||||
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
|
||||
struct queue_limits *qlimits)
|
||||
{
|
||||
memset(qlimits, 0, sizeof (struct queue_limits));
|
||||
qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
|
||||
qlimits->max_segments = limits->zql_max_segments;
|
||||
qlimits->max_segment_size = limits->zql_max_segment_size;
|
||||
qlimits->io_opt = limits->zql_io_opt;
|
||||
qlimits->physical_block_size = limits->zql_physical_block_size;
|
||||
/* the same discard limit is used for both the soft and hw fields */
qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
|
||||
qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
|
||||
qlimits->discard_granularity = limits->zql_discard_granularity;
|
||||
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
|
||||
qlimits->features =
|
||||
BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Apply to an existing request queue whatever part of *limits could not be
 * supplied through struct queue_limits at allocation time. Each group of
 * setters is compiled in only when the corresponding queue_limits support
 * (HAVE_BLK_ALLOC_DISK_2ARG, HAVE_BLKDEV_QUEUE_LIMITS_FEATURES) is absent;
 * with both present this function does nothing.
 */
static void
|
||||
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
|
||||
struct request_queue *queue)
|
||||
{
|
||||
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
|
||||
blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
|
||||
blk_queue_max_segments(queue, limits->zql_max_segments);
|
||||
blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
|
||||
blk_queue_io_opt(queue, limits->zql_io_opt);
|
||||
blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
|
||||
blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
|
||||
blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
|
||||
#endif
|
||||
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
|
||||
/* these are not taken from *limits; they are always enabled */
blk_queue_set_write_cache(queue, B_TRUE);
|
||||
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
|
||||
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
|
||||
{
|
||||
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
|
||||
#if defined(HAVE_BLK_ALLOC_DISK)
|
||||
@@ -1085,7 +1238,9 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
|
||||
zso->zvo_disk->minors = ZVOL_MINORS;
|
||||
zso->zvo_queue = zso->zvo_disk->queue;
|
||||
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
|
||||
struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
|
||||
struct queue_limits qlimits;
|
||||
zvol_queue_limits_convert(limits, &qlimits);
|
||||
struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
|
||||
if (IS_ERR(disk)) {
|
||||
zso->zvo_disk = NULL;
|
||||
return (1);
|
||||
@@ -1094,6 +1249,7 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
|
||||
zso->zvo_disk = disk;
|
||||
zso->zvo_disk->minors = ZVOL_MINORS;
|
||||
zso->zvo_queue = zso->zvo_disk->queue;
|
||||
|
||||
#else
|
||||
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
|
||||
if (zso->zvo_queue == NULL)
|
||||
@@ -1120,12 +1276,15 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
|
||||
|
||||
zso->zvo_disk->queue = zso->zvo_queue;
|
||||
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
|
||||
|
||||
zvol_queue_limits_apply(limits, zso->zvo_queue);
|
||||
|
||||
return (0);
|
||||
|
||||
}
|
||||
|
||||
static int
|
||||
zvol_alloc_blk_mq(zvol_state_t *zv)
|
||||
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
|
||||
{
|
||||
#ifdef HAVE_BLK_MQ
|
||||
struct zvol_state_os *zso = zv->zv_zso;
|
||||
@@ -1143,7 +1302,9 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
|
||||
zso->zvo_queue = zso->zvo_disk->queue;
|
||||
zso->zvo_disk->minors = ZVOL_MINORS;
|
||||
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
|
||||
struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
|
||||
struct queue_limits qlimits;
|
||||
zvol_queue_limits_convert(limits, &qlimits);
|
||||
struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
|
||||
if (IS_ERR(disk)) {
|
||||
zso->zvo_disk = NULL;
|
||||
blk_mq_free_tag_set(&zso->tag_set);
|
||||
@@ -1169,9 +1330,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
|
||||
|
||||
/* Our queue is now created, assign it to our disk */
|
||||
zso->zvo_disk->queue = zso->zvo_queue;
|
||||
#endif
|
||||
|
||||
zvol_queue_limits_apply(limits, zso->zvo_queue);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -1180,7 +1343,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv)
|
||||
* request queue and generic disk structures for the block device.
|
||||
*/
|
||||
static zvol_state_t *
|
||||
zvol_alloc(dev_t dev, const char *name)
|
||||
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
|
||||
{
|
||||
zvol_state_t *zv;
|
||||
struct zvol_state_os *zso;
|
||||
@@ -1200,6 +1363,7 @@ zvol_alloc(dev_t dev, const char *name)
|
||||
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
|
||||
zv->zv_zso = zso;
|
||||
zv->zv_volmode = volmode;
|
||||
zv->zv_volblocksize = volblocksize;
|
||||
|
||||
list_link_init(&zv->zv_next);
|
||||
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
@@ -1208,6 +1372,9 @@ zvol_alloc(dev_t dev, const char *name)
|
||||
zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
|
||||
#endif
|
||||
|
||||
zvol_queue_limits_t limits;
|
||||
zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
|
||||
|
||||
/*
|
||||
* The block layer has 3 interfaces for getting BIOs:
|
||||
*
|
||||
@@ -1224,17 +1391,15 @@ zvol_alloc(dev_t dev, const char *name)
|
||||
* disk and the queue separately. (5.13 kernel or older)
|
||||
*/
|
||||
if (zv->zv_zso->use_blk_mq) {
|
||||
ret = zvol_alloc_blk_mq(zv);
|
||||
ret = zvol_alloc_blk_mq(zv, &limits);
|
||||
zso->zvo_disk->fops = &zvol_ops_blk_mq;
|
||||
} else {
|
||||
ret = zvol_alloc_non_blk_mq(zso);
|
||||
ret = zvol_alloc_non_blk_mq(zso, &limits);
|
||||
zso->zvo_disk->fops = &zvol_ops;
|
||||
}
|
||||
if (ret != 0)
|
||||
goto out_kmem;
|
||||
|
||||
blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
|
||||
|
||||
/* Limit read-ahead to a single page to prevent over-prefetching. */
|
||||
blk_queue_set_read_ahead(zso->zvo_queue, 1);
|
||||
|
||||
@@ -1243,9 +1408,6 @@ zvol_alloc(dev_t dev, const char *name)
|
||||
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
|
||||
}
|
||||
|
||||
/* Enable /proc/diskstats */
|
||||
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
|
||||
|
||||
zso->zvo_queue->queuedata = zv;
|
||||
zso->zvo_dev = dev;
|
||||
zv->zv_open_count = 0;
|
||||
@@ -1337,6 +1499,101 @@ zvol_wait_close(zvol_state_t *zv)
|
||||
{
|
||||
}
|
||||
|
||||
/*
 * Work item used to run add_disk() from a workqueue: 'disk' is the input,
 * 'error' receives the result of __zvol_os_add_disk(), and 'work' is the
 * delayed_work the handler recovers this structure from via container_of().
 */
struct add_disk_work {
|
||||
struct delayed_work work;
|
||||
struct gendisk *disk;
|
||||
int error;
|
||||
};
|
||||
|
||||
/*
 * Call add_disk() on the given disk. When the kernel's add_disk() returns a
 * value (HAVE_ADD_DISK_RET) that value is returned; otherwise the call's
 * outcome is not observable and 0 is returned.
 */
static int
|
||||
__zvol_os_add_disk(struct gendisk *disk)
|
||||
{
|
||||
int error = 0;
|
||||
#ifdef HAVE_ADD_DISK_RET
|
||||
error = add_disk(disk);
|
||||
#else
|
||||
add_disk(disk);
|
||||
#endif
|
||||
return (error);
|
||||
}
|
||||
|
||||
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
|
||||
/*
 * Workqueue handler: recover the enclosing struct add_disk_work from the
 * work_struct, call __zvol_os_add_disk() on its disk, and store the result
 * in its 'error' field for the submitter to read.
 */
static void
|
||||
zvol_os_add_disk_work(struct work_struct *work)
|
||||
{
|
||||
struct add_disk_work *add_disk_work;
|
||||
/* 'work' is the work_struct embedded in the delayed_work member */
add_disk_work = container_of(work, struct add_disk_work, work.work);
|
||||
add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* SPECIAL CASE:
|
||||
*
|
||||
* This function basically calls add_disk() from a workqueue. You may be
|
||||
* thinking: why not just call add_disk() directly?
|
||||
*
|
||||
* When you call add_disk(), the zvol appears to the world. When this happens,
|
||||
* the kernel calls disk_scan_partitions() on the zvol, which behaves
|
||||
* differently on the 6.9+ kernels:
|
||||
*
|
||||
* - 6.8 and older kernels -
|
||||
* disk_scan_partitions()
|
||||
* handle = bdev_open_by_dev()
|
||||
* zvol_open()
|
||||
* bdev_release(handle);
|
||||
* zvol_release()
|
||||
*
|
||||
*
|
||||
* - 6.9+ kernels -
|
||||
* disk_scan_partitions()
|
||||
* file = bdev_file_open_by_dev()
|
||||
* zvol_open()
|
||||
* fput(file)
|
||||
* < wait for return to userspace >
|
||||
* zvol_release()
|
||||
*
|
||||
* The difference is that the bdev_release() from the 6.8 kernel is synchronous
|
||||
* while the fput() from the 6.9 kernel is async. Or more specifically it's
|
||||
* async that has to wait until we return to userspace (since it adds the fput
|
||||
* into the caller's work queue with the TWA_RESUME flag set). This is not the
|
||||
* behavior we want, since we want to do things like create+destroy a zvol within
|
||||
* a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
|
||||
* reference to the zvol while we're in the IOCTL, which can't wait until we
|
||||
* return to userspace.
|
||||
*
|
||||
* We can get around this since fput() has a special codepath for when it's
|
||||
* running in a kernel thread or interrupt. In those cases, it just puts the
|
||||
* fput into the system workqueue, which we can force to run with
|
||||
* __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
|
||||
* runs from a kernel thread and "tricks" the fput() codepaths.
|
||||
*
|
||||
* Note that __flush_workqueue() is slowly getting deprecated. This may be ok
|
||||
* though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
|
||||
* fput) to happen, which it eventually, naturally, will from the system_wq
|
||||
* without us explicitly calling __flush_workqueue().
|
||||
*/
|
||||
static int
|
||||
zvol_os_add_disk(struct gendisk *disk)
|
||||
{
|
||||
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */
|
||||
/*
 * NOTE(review): this work item lives on the stack but is initialised
 * with INIT_DELAYED_WORK rather than an *_ONSTACK variant - confirm
 * this is acceptable on kernels built with work debug objects.
 */
struct add_disk_work add_disk_work;
|
||||
|
||||
INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
|
||||
add_disk_work.disk = disk;
|
||||
add_disk_work.error = 0;
|
||||
|
||||
/* Use *_delayed_work functions since they're not GPL'd */
|
||||
schedule_delayed_work(&add_disk_work.work, 0);
|
||||
/*
 * Wait for zvol_os_add_disk_work() to finish, so that .error is valid
 * and the on-stack work item is no longer in use when we return.
 */
flush_delayed_work(&add_disk_work.work);
|
||||
|
||||
/* flush system_wq as well; the comment above this function explains why */
__flush_workqueue(system_wq);
|
||||
return (add_disk_work.error);
|
||||
#else /* <= 6.8 kernel */
|
||||
/* no workqueue detour on these kernels; call add_disk() directly */
return (__zvol_os_add_disk(disk));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a block device minor node and setup the linkage between it
|
||||
* and the specified volume. Once this function returns the block
|
||||
@@ -1393,7 +1650,8 @@ zvol_os_create_minor(const char *name)
|
||||
if (error)
|
||||
goto out_dmu_objset_disown;
|
||||
|
||||
zv = zvol_alloc(MKDEV(zvol_major, minor), name);
|
||||
zv = zvol_alloc(MKDEV(zvol_major, minor), name,
|
||||
doi->doi_data_block_size);
|
||||
if (zv == NULL) {
|
||||
error = SET_ERROR(EAGAIN);
|
||||
goto out_dmu_objset_disown;
|
||||
@@ -1403,84 +1661,11 @@ zvol_os_create_minor(const char *name)
|
||||
if (dmu_objset_is_snapshot(os))
|
||||
zv->zv_flags |= ZVOL_RDONLY;
|
||||
|
||||
zv->zv_volblocksize = doi->doi_data_block_size;
|
||||
zv->zv_volsize = volsize;
|
||||
zv->zv_objset = os;
|
||||
|
||||
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
|
||||
|
||||
blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
|
||||
(DMU_MAX_ACCESS / 4) >> 9);
|
||||
|
||||
if (zv->zv_zso->use_blk_mq) {
|
||||
/*
|
||||
* IO requests can be really big (1MB). When an IO request
|
||||
* comes in, it is passed off to zvol_read() or zvol_write()
|
||||
* in a new thread, where it is chunked up into 'volblocksize'
|
||||
* sized pieces and processed. So for example, if the request
|
||||
* is a 1MB write and your volblocksize is 128k, one zvol_write
|
||||
* thread will take that request and sequentially do eight 128k
|
||||
* IOs. This is due to the fact that the thread needs to lock
|
||||
* each volblocksize sized block. So you might be wondering:
|
||||
* "instead of passing the whole 1MB request to one thread,
|
||||
* why not pass eight individual 128k chunks to eight threads and
|
||||
* process the whole write in parallel?" The short answer is
|
||||
* that there's a sweet spot number of chunks that balances
|
||||
* the greater parallelism with the added overhead of more
|
||||
* threads. The sweet spot can be different depending on if you
|
||||
* have a read or write heavy workload. Writes typically want
|
||||
* high chunk counts while reads typically want lower ones. On
|
||||
* a test pool with 6 NVMe drives in a 3x 2-disk mirror
|
||||
* configuration, with volblocksize=8k, the sweet spot for good
|
||||
* sequential reads and writes was at 8 chunks.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Below we tell the kernel how big we want our requests
|
||||
* to be. You would think that blk_queue_io_opt() would be
|
||||
* used to do this since it is used to "set optimal request
|
||||
* size for the queue", but that doesn't seem to do
|
||||
* anything - the kernel still gives you huge requests
|
||||
* with tons of little PAGE_SIZE segments contained within it.
|
||||
*
|
||||
* Knowing that the kernel will just give you PAGE_SIZE segments
|
||||
* no matter what, you can say "ok, I want PAGE_SIZE byte
|
||||
* segments, and I want 'N' of them per request", where N is
|
||||
* the correct number of segments for the volblocksize and
|
||||
* number of chunks you want.
|
||||
*/
|
||||
#ifdef HAVE_BLK_MQ
|
||||
if (zvol_blk_mq_blocks_per_thread != 0) {
|
||||
unsigned int chunks;
|
||||
chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
|
||||
|
||||
blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
|
||||
PAGE_SIZE);
|
||||
blk_queue_max_segments(zv->zv_zso->zvo_queue,
|
||||
(zv->zv_volblocksize * chunks) / PAGE_SIZE);
|
||||
} else {
|
||||
/*
|
||||
* Special case: zvol_blk_mq_blocks_per_thread = 0
|
||||
* Max everything out.
|
||||
*/
|
||||
blk_queue_max_segments(zv->zv_zso->zvo_queue,
|
||||
UINT16_MAX);
|
||||
blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
|
||||
UINT_MAX);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
|
||||
blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
|
||||
}
|
||||
|
||||
blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
|
||||
zv->zv_volblocksize);
|
||||
blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
|
||||
blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
|
||||
(zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
|
||||
blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
|
||||
zv->zv_volblocksize);
|
||||
#ifdef QUEUE_FLAG_DISCARD
|
||||
blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
|
||||
#endif
|
||||
@@ -1541,11 +1726,7 @@ out_doi:
|
||||
rw_enter(&zvol_state_lock, RW_WRITER);
|
||||
zvol_insert(zv);
|
||||
rw_exit(&zvol_state_lock);
|
||||
#ifdef HAVE_ADD_DISK_RET
|
||||
error = add_disk(zv->zv_zso->zvo_disk);
|
||||
#else
|
||||
add_disk(zv->zv_zso->zvo_disk);
|
||||
#endif
|
||||
error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
|
||||
} else {
|
||||
ida_simple_remove(&zvol_ida, idx);
|
||||
}
|
||||
|
||||
@@ -471,7 +471,8 @@ fletcher_4_native(const void *buf, uint64_t size,
|
||||
const void *ctx_template, zio_cksum_t *zcp)
|
||||
{
|
||||
(void) ctx_template;
|
||||
const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
|
||||
const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE,
|
||||
uint64_t);
|
||||
|
||||
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
|
||||
|
||||
@@ -519,7 +520,8 @@ fletcher_4_byteswap(const void *buf, uint64_t size,
|
||||
const void *ctx_template, zio_cksum_t *zcp)
|
||||
{
|
||||
(void) ctx_template;
|
||||
const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
|
||||
const uint64_t p2size = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE,
|
||||
uint64_t);
|
||||
|
||||
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
|
||||
|
||||
@@ -878,7 +880,7 @@ abd_fletcher_4_iter(void *data, size_t size, void *private)
|
||||
fletcher_4_ctx_t *ctx = cdp->acd_ctx;
|
||||
fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
|
||||
boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
|
||||
uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
|
||||
uint64_t asize = P2ALIGN_TYPED(size, FLETCHER_MIN_SIMD_SIZE, uint64_t);
|
||||
|
||||
ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
|
||||
|
||||
|
||||
+29
-43
@@ -8879,7 +8879,7 @@ out:
|
||||
* assertions may be violated without functional consequences
|
||||
* as the device is about to be removed.
|
||||
*/
|
||||
ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
|
||||
ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
|
||||
if (!dev->l2ad_first)
|
||||
ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
|
||||
}
|
||||
@@ -8895,7 +8895,6 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
|
||||
abd_t **abd_out)
|
||||
{
|
||||
int ret;
|
||||
void *tmp = NULL;
|
||||
abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
|
||||
enum zio_compress compress = HDR_GET_COMPRESS(hdr);
|
||||
uint64_t psize = HDR_GET_PSIZE(hdr);
|
||||
@@ -8916,12 +8915,11 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
|
||||
* and copy the data. This may be done to eliminate a dependency on a
|
||||
* shared buffer or to reallocate the buffer to match asize.
|
||||
*/
|
||||
if (HDR_HAS_RABD(hdr) && asize != psize) {
|
||||
ASSERT3U(asize, >=, psize);
|
||||
if (HDR_HAS_RABD(hdr)) {
|
||||
ASSERT3U(asize, >, psize);
|
||||
to_write = abd_alloc_for_io(asize, ismd);
|
||||
abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
|
||||
if (psize != asize)
|
||||
abd_zero_off(to_write, psize, asize - psize);
|
||||
abd_zero_off(to_write, psize, asize - psize);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -8930,48 +8928,31 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
|
||||
ASSERT3U(size, ==, psize);
|
||||
to_write = abd_alloc_for_io(asize, ismd);
|
||||
abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
|
||||
if (size != asize)
|
||||
if (asize > size)
|
||||
abd_zero_off(to_write, size, asize - size);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
|
||||
/*
|
||||
* In some cases, we can wind up with size > asize, so
|
||||
* we need to opt for the larger allocation option here.
|
||||
*
|
||||
* (We also need abd_return_buf_copy in all cases because
|
||||
* it's an ASSERT() to modify the buffer before returning it
|
||||
* with arc_return_buf(), and all the compressors
|
||||
* write things before deciding to fail compression in nearly
|
||||
* every case.)
|
||||
*/
|
||||
uint64_t bufsize = MAX(size, asize);
|
||||
cabd = abd_alloc_for_io(bufsize, ismd);
|
||||
tmp = abd_borrow_buf(cabd, bufsize);
|
||||
|
||||
psize = zio_compress_data(compress, to_write, &tmp, size,
|
||||
hdr->b_complevel);
|
||||
|
||||
if (psize >= asize) {
|
||||
psize = HDR_GET_PSIZE(hdr);
|
||||
abd_return_buf_copy(cabd, tmp, bufsize);
|
||||
HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
|
||||
to_write = cabd;
|
||||
abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize);
|
||||
if (psize != asize)
|
||||
abd_zero_off(to_write, psize, asize - psize);
|
||||
goto encrypt;
|
||||
size_t bufsize = MAX(size, asize);
|
||||
void *buf = zio_buf_alloc(bufsize);
|
||||
uint64_t csize = zio_compress_data(compress, to_write, &buf,
|
||||
size, hdr->b_complevel);
|
||||
if (csize > psize) {
|
||||
/*
|
||||
* We can't re-compress the block into the original
|
||||
* psize. Even if it fits into asize, it does not
|
||||
* matter, since the checksum will never match on read.
|
||||
*/
|
||||
zio_buf_free(buf, bufsize);
|
||||
return (SET_ERROR(EIO));
|
||||
}
|
||||
ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
|
||||
if (psize < asize)
|
||||
memset((char *)tmp + psize, 0, bufsize - psize);
|
||||
psize = HDR_GET_PSIZE(hdr);
|
||||
abd_return_buf_copy(cabd, tmp, bufsize);
|
||||
to_write = cabd;
|
||||
if (asize > csize)
|
||||
memset((char *)buf + csize, 0, asize - csize);
|
||||
to_write = cabd = abd_get_from_buf(buf, bufsize);
|
||||
abd_take_ownership_of_buf(cabd, B_TRUE);
|
||||
}
|
||||
|
||||
encrypt:
|
||||
if (HDR_ENCRYPTED(hdr)) {
|
||||
eabd = abd_alloc_for_io(asize, ismd);
|
||||
|
||||
@@ -9074,12 +9055,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
|
||||
*/
|
||||
for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
|
||||
/*
|
||||
* If pass == 1 or 3, we cache MRU metadata and data
|
||||
* respectively.
|
||||
* pass == 0: MFU meta
|
||||
* pass == 1: MRU meta
|
||||
* pass == 2: MFU data
|
||||
* pass == 3: MRU data
|
||||
*/
|
||||
if (l2arc_mfuonly) {
|
||||
if (l2arc_mfuonly == 1) {
|
||||
if (pass == 1 || pass == 3)
|
||||
continue;
|
||||
} else if (l2arc_mfuonly > 1) {
|
||||
if (pass == 3)
|
||||
continue;
|
||||
}
|
||||
|
||||
uint64_t passed_sz = 0;
|
||||
|
||||
+1
-1
@@ -218,7 +218,7 @@ zfs_btree_create_custom(zfs_btree_t *tree,
|
||||
zfs_btree_find_in_buf : bt_find_in_buf;
|
||||
tree->bt_elem_size = size;
|
||||
tree->bt_leaf_size = lsize;
|
||||
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
|
||||
tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
|
||||
tree->bt_height = -1;
|
||||
tree->bt_bulk = NULL;
|
||||
}
|
||||
|
||||
@@ -201,6 +201,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
|
||||
void
|
||||
dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
|
||||
{
|
||||
if (dk->dk_kstats == NULL)
|
||||
return;
|
||||
|
||||
dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
|
||||
char *ds_name;
|
||||
|
||||
|
||||
+109
-114
@@ -161,13 +161,13 @@ struct {
|
||||
} dbuf_sums;
|
||||
|
||||
#define DBUF_STAT_INCR(stat, val) \
|
||||
wmsum_add(&dbuf_sums.stat, val);
|
||||
wmsum_add(&dbuf_sums.stat, val)
|
||||
#define DBUF_STAT_DECR(stat, val) \
|
||||
DBUF_STAT_INCR(stat, -(val));
|
||||
DBUF_STAT_INCR(stat, -(val))
|
||||
#define DBUF_STAT_BUMP(stat) \
|
||||
DBUF_STAT_INCR(stat, 1);
|
||||
DBUF_STAT_INCR(stat, 1)
|
||||
#define DBUF_STAT_BUMPDOWN(stat) \
|
||||
DBUF_STAT_INCR(stat, -1);
|
||||
DBUF_STAT_INCR(stat, -1)
|
||||
#define DBUF_STAT_MAX(stat, v) { \
|
||||
uint64_t _m; \
|
||||
while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
|
||||
@@ -177,7 +177,6 @@ struct {
|
||||
|
||||
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
|
||||
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
|
||||
static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
|
||||
|
||||
/*
|
||||
* Global data structures and functions for the dbuf cache.
|
||||
@@ -1403,13 +1402,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
|
||||
* a decrypted block. Otherwise success.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
|
||||
{
|
||||
int bonuslen, max_bonuslen, err;
|
||||
|
||||
err = dbuf_read_verify_dnode_crypt(db, flags);
|
||||
if (err)
|
||||
return (err);
|
||||
int bonuslen, max_bonuslen;
|
||||
|
||||
bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
|
||||
max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
|
||||
@@ -1494,32 +1489,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
|
||||
* decrypt / authenticate them when we need to read an encrypted bonus buffer.
|
||||
*/
|
||||
static int
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
|
||||
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
|
||||
{
|
||||
int err = 0;
|
||||
objset_t *os = db->db_objset;
|
||||
arc_buf_t *dnode_abuf;
|
||||
dnode_t *dn;
|
||||
dmu_buf_impl_t *dndb;
|
||||
arc_buf_t *dnbuf;
|
||||
zbookmark_phys_t zb;
|
||||
|
||||
ASSERT(MUTEX_HELD(&db->db_mtx));
|
||||
int err;
|
||||
|
||||
if ((flags & DB_RF_NO_DECRYPT) != 0 ||
|
||||
!os->os_encrypted || os->os_raw_receive)
|
||||
!os->os_encrypted || os->os_raw_receive ||
|
||||
(dndb = dn->dn_dbuf) == NULL)
|
||||
return (0);
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
dn = DB_DNODE(db);
|
||||
dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
|
||||
|
||||
if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
|
||||
DB_DNODE_EXIT(db);
|
||||
dnbuf = dndb->db_buf;
|
||||
if (!arc_is_encrypted(dnbuf))
|
||||
return (0);
|
||||
}
|
||||
|
||||
mutex_enter(&dndb->db_mtx);
|
||||
|
||||
/*
|
||||
* Since dnode buffer is modified by sync process, there can be only
|
||||
* one copy of it. It means we can not modify (decrypt) it while it
|
||||
* is being written. I don't see how this may happen now, since
|
||||
* encrypted dnode writes by receive should be completed before any
|
||||
* plain-text reads due to txg wait, but better be safe than sorry.
|
||||
*/
|
||||
while (1) {
|
||||
if (!arc_is_encrypted(dnbuf)) {
|
||||
mutex_exit(&dndb->db_mtx);
|
||||
return (0);
|
||||
}
|
||||
dbuf_dirty_record_t *dr = dndb->db_data_pending;
|
||||
if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
|
||||
break;
|
||||
cv_wait(&dndb->db_changed, &dndb->db_mtx);
|
||||
};
|
||||
|
||||
SET_BOOKMARK(&zb, dmu_objset_id(os),
|
||||
DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
|
||||
err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
|
||||
DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
|
||||
err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
|
||||
|
||||
/*
|
||||
* An error code of EACCES tells us that the key is still not
|
||||
@@ -1532,7 +1541,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
|
||||
!DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
|
||||
err = 0;
|
||||
|
||||
DB_DNODE_EXIT(db);
|
||||
mutex_exit(&dndb->db_mtx);
|
||||
|
||||
return (err);
|
||||
}
|
||||
@@ -1558,7 +1567,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
|
||||
|
||||
if (db->db_blkid == DMU_BONUS_BLKID) {
|
||||
err = dbuf_read_bonus(db, dn, flags);
|
||||
err = dbuf_read_bonus(db, dn);
|
||||
goto early_unlock;
|
||||
}
|
||||
|
||||
@@ -1619,10 +1628,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
|
||||
goto early_unlock;
|
||||
}
|
||||
|
||||
err = dbuf_read_verify_dnode_crypt(db, flags);
|
||||
if (err != 0)
|
||||
goto early_unlock;
|
||||
|
||||
db->db_state = DB_READ;
|
||||
DTRACE_SET_STATE(db, "read issued");
|
||||
mutex_exit(&db->db_mtx);
|
||||
@@ -1738,19 +1743,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
|
||||
int
|
||||
dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
{
|
||||
int err = 0;
|
||||
boolean_t prefetch;
|
||||
dnode_t *dn;
|
||||
boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* We don't have to hold the mutex to check db_state because it
|
||||
* can't be freed while we have a hold on the buffer.
|
||||
*/
|
||||
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
|
||||
|
||||
DB_DNODE_ENTER(db);
|
||||
dn = DB_DNODE(db);
|
||||
|
||||
/*
|
||||
* Ensure that this block's dnode has been decrypted if the caller
|
||||
* has requested decrypted data.
|
||||
*/
|
||||
err = dbuf_read_verify_dnode_crypt(db, dn, flags);
|
||||
if (err != 0)
|
||||
goto done;
|
||||
|
||||
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
|
||||
(flags & DB_RF_NOPREFETCH) == 0;
|
||||
|
||||
@@ -1759,13 +1768,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
db->db_partial_read = B_TRUE;
|
||||
else if (!(flags & DB_RF_PARTIAL_MORE))
|
||||
db->db_partial_read = B_FALSE;
|
||||
if (db->db_state == DB_CACHED) {
|
||||
/*
|
||||
* Ensure that this block's dnode has been decrypted if
|
||||
* the caller has requested decrypted data.
|
||||
*/
|
||||
err = dbuf_read_verify_dnode_crypt(db, flags);
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
|
||||
if (db->db_state == DB_READ || db->db_state == DB_FILL) {
|
||||
/*
|
||||
* Another reader came in while the dbuf was in flight between
|
||||
* UNCACHED and CACHED. Either a writer will finish filling
|
||||
* the buffer, sending the dbuf to CACHED, or the first reader's
|
||||
* request will reach the read_done callback and send the dbuf
|
||||
* to CACHED. Otherwise, a failure occurred and the dbuf will
|
||||
* be sent to UNCACHED.
|
||||
*/
|
||||
if (flags & DB_RF_NEVERWAIT) {
|
||||
mutex_exit(&db->db_mtx);
|
||||
DB_DNODE_EXIT(db);
|
||||
goto done;
|
||||
}
|
||||
do {
|
||||
ASSERT(db->db_state == DB_READ ||
|
||||
(flags & DB_RF_HAVESTRUCT) == 0);
|
||||
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
|
||||
zio_t *, pio);
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
} while (db->db_state == DB_READ || db->db_state == DB_FILL);
|
||||
if (db->db_state == DB_UNCACHED) {
|
||||
err = SET_ERROR(EIO);
|
||||
mutex_exit(&db->db_mtx);
|
||||
DB_DNODE_EXIT(db);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
if (db->db_state == DB_CACHED) {
|
||||
/*
|
||||
* If the arc buf is compressed or encrypted and the caller
|
||||
* requested uncompressed data, we need to untransform it
|
||||
@@ -1773,8 +1807,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
* unauthenticated blocks, which will verify their MAC if
|
||||
* the key is now available.
|
||||
*/
|
||||
if (err == 0 && db->db_buf != NULL &&
|
||||
(flags & DB_RF_NO_DECRYPT) == 0 &&
|
||||
if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
|
||||
(arc_is_encrypted(db->db_buf) ||
|
||||
arc_is_unauthenticated(db->db_buf) ||
|
||||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
|
||||
@@ -1788,17 +1821,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
dbuf_set_data(db, db->db_buf);
|
||||
}
|
||||
mutex_exit(&db->db_mtx);
|
||||
if (err == 0 && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
|
||||
B_FALSE, flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_hits);
|
||||
} else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
|
||||
boolean_t need_wait = B_FALSE;
|
||||
|
||||
} else {
|
||||
ASSERT(db->db_state == DB_UNCACHED ||
|
||||
db->db_state == DB_NOFILL);
|
||||
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
|
||||
|
||||
if (pio == NULL && (db->db_state == DB_NOFILL ||
|
||||
(db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
|
||||
spa_t *spa = dn->dn_objset->os_spa;
|
||||
@@ -1806,65 +1832,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
|
||||
need_wait = B_TRUE;
|
||||
}
|
||||
err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
|
||||
/*
|
||||
* dbuf_read_impl has dropped db_mtx and our parent's rwlock
|
||||
* for us
|
||||
*/
|
||||
if (!err && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
|
||||
db->db_state != DB_CACHED,
|
||||
flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_misses);
|
||||
|
||||
/*
|
||||
* If we created a zio_root we must execute it to avoid
|
||||
* leaking it, even if it isn't attached to any work due
|
||||
* to an error in dbuf_read_impl().
|
||||
*/
|
||||
if (need_wait) {
|
||||
if (err == 0)
|
||||
err = zio_wait(pio);
|
||||
else
|
||||
(void) zio_wait(pio);
|
||||
pio = NULL;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Another reader came in while the dbuf was in flight
|
||||
* between UNCACHED and CACHED. Either a writer will finish
|
||||
* writing the buffer (sending the dbuf to CACHED) or the
|
||||
* first reader's request will reach the read_done callback
|
||||
* and send the dbuf to CACHED. Otherwise, a failure
|
||||
* occurred and the dbuf went to UNCACHED.
|
||||
*/
|
||||
mutex_exit(&db->db_mtx);
|
||||
if (prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
|
||||
B_TRUE, flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
DBUF_STAT_BUMP(hash_misses);
|
||||
|
||||
/* Skip the wait per the caller's request. */
|
||||
if ((flags & DB_RF_NEVERWAIT) == 0) {
|
||||
mutex_enter(&db->db_mtx);
|
||||
while (db->db_state == DB_READ ||
|
||||
db->db_state == DB_FILL) {
|
||||
ASSERT(db->db_state == DB_READ ||
|
||||
(flags & DB_RF_HAVESTRUCT) == 0);
|
||||
DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
|
||||
db, zio_t *, pio);
|
||||
cv_wait(&db->db_changed, &db->db_mtx);
|
||||
}
|
||||
if (db->db_state == DB_UNCACHED)
|
||||
err = SET_ERROR(EIO);
|
||||
mutex_exit(&db->db_mtx);
|
||||
}
|
||||
/* dbuf_read_impl drops db_mtx and parent's rwlock. */
|
||||
miss = (db->db_state != DB_CACHED);
|
||||
}
|
||||
|
||||
if (err == 0 && prefetch) {
|
||||
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
|
||||
flags & DB_RF_HAVESTRUCT);
|
||||
}
|
||||
DB_DNODE_EXIT(db);
|
||||
|
||||
/*
|
||||
* If we created a zio we must execute it to avoid leaking it, even if
|
||||
* it isn't attached to any work due to an error in dbuf_read_impl().
|
||||
*/
|
||||
if (need_wait) {
|
||||
if (err == 0)
|
||||
err = zio_wait(pio);
|
||||
else
|
||||
(void) zio_wait(pio);
|
||||
pio = NULL;
|
||||
}
|
||||
|
||||
done:
|
||||
if (miss)
|
||||
DBUF_STAT_BUMP(hash_misses);
|
||||
else
|
||||
DBUF_STAT_BUMP(hash_hits);
|
||||
if (pio && err != 0) {
|
||||
zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
|
||||
ZIO_FLAG_CANFAIL);
|
||||
@@ -2840,6 +2834,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
|
||||
failed = B_FALSE;
|
||||
} else if (failed) {
|
||||
VERIFY(!dbuf_undirty(db, tx));
|
||||
arc_buf_destroy(db->db_buf, db);
|
||||
db->db_buf = NULL;
|
||||
dbuf_clear_data(db);
|
||||
DTRACE_SET_STATE(db, "fill failed");
|
||||
|
||||
+10
-2
@@ -537,7 +537,8 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
|
||||
if (dn->dn_datablkshift) {
|
||||
int blkshift = dn->dn_datablkshift;
|
||||
nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
|
||||
P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
|
||||
P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t))
|
||||
>> blkshift;
|
||||
} else {
|
||||
if (offset + length > dn->dn_datablksz) {
|
||||
zfs_panic_recover("zfs: accessing past end of object "
|
||||
@@ -814,6 +815,13 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
|
||||
|
||||
ASSERT3U(minimum, <=, *start);
|
||||
|
||||
/* dn_nlevels == 1 means we don't have any L1 blocks */
|
||||
if (dn->dn_nlevels <= 1) {
|
||||
*l1blks = 0;
|
||||
*start = minimum;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we can free the entire range assuming that all of the
|
||||
* L1 blocks in this range have data. If we can, we use this
|
||||
@@ -854,7 +862,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
|
||||
}
|
||||
|
||||
/* set start to the beginning of this L1 indirect */
|
||||
*start = P2ALIGN(*start, iblkrange);
|
||||
*start = P2ALIGN_TYPED(*start, iblkrange, uint64_t);
|
||||
}
|
||||
if (*start < minimum)
|
||||
*start = minimum;
|
||||
|
||||
@@ -160,7 +160,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
|
||||
* is not suitably aligned.
|
||||
*/
|
||||
os->os_obj_next_chunk =
|
||||
P2ALIGN(object, dnodes_per_chunk) +
|
||||
P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
|
||||
dnodes_per_chunk;
|
||||
(void) atomic_swap_64(cpuobj, object);
|
||||
mutex_exit(&os->os_obj_lock);
|
||||
|
||||
@@ -400,10 +400,10 @@ dnode_hash(const objset_t *os, uint64_t obj)
|
||||
|
||||
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
|
||||
/*
|
||||
* The low 6 bits of the pointer don't have much entropy, because
|
||||
* the objset_t is larger than 2^6 bytes long.
|
||||
* The lower 11 bits of the pointer don't have much entropy, because
|
||||
* the objset_t is more than 1KB long and so likely aligned to 2KB.
|
||||
*/
|
||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
|
||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF];
|
||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
|
||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
|
||||
crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
|
||||
|
||||
@@ -3710,16 +3710,19 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
|
||||
spa_history_log_internal_ds(hds, "promote", tx, " ");
|
||||
|
||||
dsl_dir_rele(odd, FTAG);
|
||||
promote_rele(ddpa, FTAG);
|
||||
|
||||
/*
|
||||
* Transfer common error blocks from old head to new head.
|
||||
* Transfer common error blocks from old head to new head, before
|
||||
* calling promote_rele() on ddpa since we need to dereference
|
||||
* origin_head and hds.
|
||||
*/
|
||||
if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) {
|
||||
uint64_t old_head = origin_head->ds_object;
|
||||
uint64_t new_head = hds->ds_object;
|
||||
spa_swap_errlog(dp->dp_spa, new_head, old_head, tx);
|
||||
}
|
||||
|
||||
promote_rele(ddpa, FTAG);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -491,6 +491,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
|
||||
|
||||
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
|
||||
offsetof(scan_ds_t, sds_node));
|
||||
mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
|
||||
sizeof (scan_prefetch_issue_ctx_t),
|
||||
offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
|
||||
@@ -646,6 +647,7 @@ dsl_scan_fini(dsl_pool_t *dp)
|
||||
|
||||
scan_ds_queue_clear(scn);
|
||||
avl_destroy(&scn->scn_queue);
|
||||
mutex_destroy(&scn->scn_queue_lock);
|
||||
scan_ds_prefetch_queue_clear(scn);
|
||||
avl_destroy(&scn->scn_prefetch_queue);
|
||||
|
||||
@@ -2727,8 +2729,10 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
|
||||
return (err);
|
||||
ds = prev;
|
||||
}
|
||||
mutex_enter(&scn->scn_queue_lock);
|
||||
scan_ds_queue_insert(scn, ds->ds_object,
|
||||
dsl_dataset_phys(ds)->ds_prev_snap_txg);
|
||||
mutex_exit(&scn->scn_queue_lock);
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
return (0);
|
||||
}
|
||||
@@ -2919,8 +2923,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
|
||||
ds = prev;
|
||||
}
|
||||
|
||||
mutex_enter(&scn->scn_queue_lock);
|
||||
scan_ds_queue_insert(scn, ds->ds_object,
|
||||
dsl_dataset_phys(ds)->ds_prev_snap_txg);
|
||||
mutex_exit(&scn->scn_queue_lock);
|
||||
dsl_dataset_rele(ds, FTAG);
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -629,8 +629,8 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
|
||||
* metaslabs. We report the expandable space in terms
|
||||
* of the metaslab size since that's the unit of expansion.
|
||||
*/
|
||||
space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
|
||||
1ULL << tvd->vdev_ms_shift);
|
||||
space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize,
|
||||
1ULL << tvd->vdev_ms_shift, uint64_t);
|
||||
}
|
||||
spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
|
||||
return (space);
|
||||
@@ -640,6 +640,7 @@ void
|
||||
metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
|
||||
{
|
||||
multilist_t *ml = &mc->mc_metaslab_txg_list;
|
||||
hrtime_t now = gethrtime();
|
||||
for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
|
||||
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
|
||||
metaslab_t *msp = multilist_sublist_head(mls);
|
||||
@@ -663,8 +664,10 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
|
||||
multilist_sublist_unlock(mls);
|
||||
if (txg >
|
||||
msp->ms_selected_txg + metaslab_unload_delay &&
|
||||
gethrtime() > msp->ms_selected_time +
|
||||
(uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
|
||||
now > msp->ms_selected_time +
|
||||
MSEC2NSEC(metaslab_unload_delay_ms) &&
|
||||
(msp->ms_allocator == -1 ||
|
||||
!metaslab_preload_enabled)) {
|
||||
metaslab_evict(msp, txg);
|
||||
} else {
|
||||
/*
|
||||
|
||||
+7
-4
@@ -9939,6 +9939,9 @@ spa_sync(spa_t *spa, uint64_t txg)
|
||||
|
||||
metaslab_class_evict_old(spa->spa_normal_class, txg);
|
||||
metaslab_class_evict_old(spa->spa_log_class, txg);
|
||||
/* spa_embedded_log_class has only one metaslab per vdev. */
|
||||
metaslab_class_evict_old(spa->spa_special_class, txg);
|
||||
metaslab_class_evict_old(spa->spa_dedup_class, txg);
|
||||
|
||||
spa_sync_close_syncing_log_sm(spa);
|
||||
|
||||
@@ -10561,10 +10564,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
|
||||
ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
|
||||
"Print vdev tree to zfs_dbgmsg during pool import");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
|
||||
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
|
||||
"Percentage of CPUs to run an IO worker thread");
|
||||
|
||||
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
|
||||
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
|
||||
"Number of threads per IO worker taskqueue");
|
||||
|
||||
/* BEGIN CSTYLED */
|
||||
@@ -10595,10 +10598,10 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
|
||||
|
||||
#ifdef _KERNEL
|
||||
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
|
||||
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
|
||||
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
|
||||
"Configure IO queues for read IO");
|
||||
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
|
||||
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
|
||||
spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
|
||||
"Configure IO queues for write IO");
|
||||
#endif
|
||||
/* END CSTYLED */
|
||||
|
||||
+7
-5
@@ -347,7 +347,8 @@ vdev_get_min_asize(vdev_t *vd)
|
||||
* to the nearest metaslab.
|
||||
*/
|
||||
if (vd == vd->vdev_top)
|
||||
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
|
||||
return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
|
||||
uint64_t));
|
||||
|
||||
return (pvd->vdev_ops->vdev_op_min_asize(pvd));
|
||||
}
|
||||
@@ -2007,6 +2008,7 @@ vdev_open(vdev_t *vd)
|
||||
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
|
||||
vd->vdev_cant_read = B_FALSE;
|
||||
vd->vdev_cant_write = B_FALSE;
|
||||
vd->vdev_fault_wanted = B_FALSE;
|
||||
vd->vdev_min_asize = vdev_get_min_asize(vd);
|
||||
|
||||
/*
|
||||
@@ -2107,8 +2109,8 @@ vdev_open(vdev_t *vd)
|
||||
}
|
||||
}
|
||||
|
||||
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
|
||||
max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
|
||||
osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
|
||||
max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
|
||||
|
||||
if (vd->vdev_children == 0) {
|
||||
if (osize < SPA_MINDEVSIZE) {
|
||||
@@ -4730,9 +4732,9 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
|
||||
* can expand.
|
||||
*/
|
||||
if (vd->vdev_aux == NULL && tvd != NULL) {
|
||||
vs->vs_esize = P2ALIGN(
|
||||
vs->vs_esize = P2ALIGN_TYPED(
|
||||
vd->vdev_max_asize - vd->vdev_asize,
|
||||
1ULL << tvd->vdev_ms_shift);
|
||||
1ULL << tvd->vdev_ms_shift, uint64_t);
|
||||
}
|
||||
|
||||
vs->vs_configured_ashift = vd->vdev_top != NULL
|
||||
|
||||
+14
-13
@@ -635,6 +635,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
|
||||
uint64_t object = zap->zap_object;
|
||||
|
||||
zap_put_leaf(l);
|
||||
*lp = l = NULL;
|
||||
zap_unlockdir(zap, tag);
|
||||
err = zap_lockdir(os, object, tx, RW_WRITER,
|
||||
FALSE, FALSE, tag, &zn->zn_zap);
|
||||
@@ -844,21 +845,17 @@ retry:
|
||||
} else if (err == EAGAIN) {
|
||||
err = zap_expand_leaf(zn, l, tag, tx, &l);
|
||||
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
|
||||
if (err == 0) {
|
||||
if (err == 0)
|
||||
goto retry;
|
||||
} else if (err == ENOSPC) {
|
||||
/*
|
||||
* If we failed to expand the leaf, then bailout
|
||||
* as there is no point trying
|
||||
* zap_put_leaf_maybe_grow_ptrtbl().
|
||||
*/
|
||||
return (err);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (zap != NULL)
|
||||
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
|
||||
if (l != NULL) {
|
||||
if (err == ENOSPC)
|
||||
zap_put_leaf(l);
|
||||
else
|
||||
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
|
||||
}
|
||||
return (err);
|
||||
}
|
||||
|
||||
@@ -915,8 +912,12 @@ retry:
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (zap != NULL)
|
||||
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
|
||||
if (l != NULL) {
|
||||
if (err == ENOSPC)
|
||||
zap_put_leaf(l);
|
||||
else
|
||||
zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
|
||||
}
|
||||
return (err);
|
||||
}
|
||||
|
||||
|
||||
@@ -903,7 +903,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
|
||||
itx_t *itx;
|
||||
lr_clone_range_t *lr;
|
||||
uint64_t partlen, max_log_data;
|
||||
size_t i, partnbps;
|
||||
size_t partnbps;
|
||||
|
||||
if (zil_replaying(zilog, tx) || zp->z_unlinked)
|
||||
return;
|
||||
@@ -912,10 +912,8 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
|
||||
|
||||
while (nbps > 0) {
|
||||
partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
|
||||
partlen = 0;
|
||||
for (i = 0; i < partnbps; i++) {
|
||||
partlen += BP_GET_LSIZE(&bps[i]);
|
||||
}
|
||||
partlen = partnbps * blksz;
|
||||
ASSERT3U(partlen, <, len + blksz);
|
||||
partlen = MIN(partlen, len);
|
||||
|
||||
itx = zil_itx_create(txtype,
|
||||
|
||||
+19
-2
@@ -513,9 +513,26 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
|
||||
|
||||
for (; lrp < end; lrp += reclen) {
|
||||
lr_t *lr = (lr_t *)lrp;
|
||||
|
||||
/*
|
||||
* Are the remaining bytes large enough to hold a
|
||||
* log record?
|
||||
*/
|
||||
if ((char *)(lr + 1) > end) {
|
||||
cmn_err(CE_WARN, "zil_parse: lr_t overrun");
|
||||
error = SET_ERROR(ECKSUM);
|
||||
arc_buf_destroy(abuf, &abuf);
|
||||
goto done;
|
||||
}
|
||||
reclen = lr->lrc_reclen;
|
||||
ASSERT3U(reclen, >=, sizeof (lr_t));
|
||||
ASSERT3U(reclen, <=, end - lrp);
|
||||
if (reclen < sizeof (lr_t) || reclen > end - lrp) {
|
||||
cmn_err(CE_WARN,
|
||||
"zil_parse: lr_t has an invalid reclen");
|
||||
error = SET_ERROR(ECKSUM);
|
||||
arc_buf_destroy(abuf, &abuf);
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (lr->lrc_seq > claim_lr_seq) {
|
||||
arc_buf_destroy(abuf, &abuf);
|
||||
goto done;
|
||||
|
||||
@@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do
|
||||
%{?kernel_cc} \
|
||||
%{?kernel_ld} \
|
||||
%{?kernel_llvm}
|
||||
|
||||
# Pre-6.10 kernel builds didn't need to copy over the source files to the
|
||||
# build directory. However, we do need to do it post-6.10 due to
|
||||
# these commits:
|
||||
#
|
||||
# b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
|
||||
# directory
|
||||
#
|
||||
# 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
|
||||
# rules
|
||||
#
|
||||
# Note that kmodtool actually copies over the source into the build
|
||||
# directory, so what we're doing here is normal. For efficiency reasons
|
||||
# though we just use hardlinks instead of copying.
|
||||
#
|
||||
# See https://github.com/openzfs/zfs/issues/16439 for more info.
|
||||
cp -lR ../%{module}-%{version}/module/* module/
|
||||
|
||||
make %{?_smp_mflags}
|
||||
cd ..
|
||||
done
|
||||
|
||||
@@ -532,6 +532,7 @@ systemctl --system daemon-reload >/dev/null || true
|
||||
%attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/*
|
||||
|
||||
%config(noreplace) %{_bashcompletiondir}/zfs
|
||||
%config(noreplace) %{_bashcompletiondir}/zpool
|
||||
|
||||
%files -n libzpool5
|
||||
%{_libdir}/libzpool.so.*
|
||||
|
||||
@@ -26,6 +26,7 @@ PACKAGE_VERSION="${pkgver}"
|
||||
PACKAGE_CONFIG="${pkgcfg}"
|
||||
NO_WEAK_MODULES="yes"
|
||||
PRE_BUILD="configure
|
||||
--disable-dependency-tracking
|
||||
--prefix=/usr
|
||||
--with-config=kernel
|
||||
--with-linux=\$(
|
||||
|
||||
+22
-3
@@ -32,6 +32,7 @@ SCRIPT_COMMON=${SCRIPT_COMMON:-${0%/*}/common.sh}
|
||||
PROG=zfs-tests.sh
|
||||
VERBOSE="no"
|
||||
QUIET=""
|
||||
DEBUG=""
|
||||
CLEANUP="yes"
|
||||
CLEANUPALL="no"
|
||||
KMSG=""
|
||||
@@ -313,6 +314,7 @@ OPTIONS:
|
||||
-h Show this message
|
||||
-v Verbose zfs-tests.sh output
|
||||
-q Quiet test-runner output
|
||||
-D Debug; show all test output immediately (noisy)
|
||||
-x Remove all testpools, dm, lo, and files (unsafe)
|
||||
-k Disable cleanup after test failure
|
||||
-K Log test names to /dev/kmsg
|
||||
@@ -326,7 +328,8 @@ OPTIONS:
|
||||
-d DIR Use world-writable DIR for files and loopback devices
|
||||
-s SIZE Use vdevs of SIZE (default: 4G)
|
||||
-r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES})
|
||||
-t PATH Run single test at PATH relative to test suite
|
||||
-t PATH|NAME Run single test at PATH relative to test suite,
|
||||
or search for test by NAME
|
||||
-T TAGS Comma separated list of tags (default: 'functional')
|
||||
-u USER Run single test as USER (default: root)
|
||||
|
||||
@@ -340,6 +343,9 @@ $0 -r linux-fast
|
||||
# Run a single test
|
||||
$0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh
|
||||
|
||||
# Run a single test by name
|
||||
$0 -t zfs_bookmark_cliargs
|
||||
|
||||
# Cleanup a previous run of the test suite prior to testing, run the
|
||||
# default ($(echo "${DEFAULT_RUNFILES}" | sed 's/\.run//')) suite of tests and perform no cleanup on exit.
|
||||
$0 -x
|
||||
@@ -347,7 +353,7 @@ $0 -x
|
||||
EOF
|
||||
}
|
||||
|
||||
while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do
|
||||
while getopts 'hvqxkKfScRmn:d:Ds:r:?t:T:u:I:' OPTION; do
|
||||
case $OPTION in
|
||||
h)
|
||||
usage
|
||||
@@ -393,6 +399,9 @@ while getopts 'hvqxkKfScRmn:d:s:r:?t:T:u:I:' OPTION; do
|
||||
d)
|
||||
FILEDIR="$OPTARG"
|
||||
;;
|
||||
D)
|
||||
DEBUG="yes"
|
||||
;;
|
||||
I)
|
||||
ITERATIONS="$OPTARG"
|
||||
if [ "$ITERATIONS" -le 0 ]; then
|
||||
@@ -450,8 +459,15 @@ post_user = root
|
||||
post =
|
||||
outputdir = /var/tmp/test_results
|
||||
EOF
|
||||
SINGLETESTDIR="${SINGLETEST%/*}"
|
||||
if [ "$SINGLETEST" = "${SINGLETEST%/*}" ] ; then
|
||||
NEWSINGLETEST=$(find "$STF_SUITE" -name "$SINGLETEST*" -print -quit)
|
||||
if [ -z "$NEWSINGLETEST" ] ; then
|
||||
fail "couldn't find test matching '$SINGLETEST'"
|
||||
fi
|
||||
SINGLETEST=$NEWSINGLETEST
|
||||
fi
|
||||
|
||||
SINGLETESTDIR="${SINGLETEST%/*}"
|
||||
SETUPDIR="$SINGLETESTDIR"
|
||||
[ "${SETUPDIR#/}" = "$SETUPDIR" ] && SETUPDIR="$STF_SUITE/$SINGLETESTDIR"
|
||||
[ -x "$SETUPDIR/setup.ksh" ] && SETUPSCRIPT="setup" || SETUPSCRIPT=
|
||||
@@ -680,6 +696,7 @@ REPORT_FILE=$(mktemp_file zts-report)
|
||||
#
|
||||
msg "${TEST_RUNNER}" \
|
||||
"${QUIET:+-q}" \
|
||||
"${DEBUG:+-D}" \
|
||||
"${KMEMLEAK:+-m}" \
|
||||
"${KMSG:+-K}" \
|
||||
"-c \"${RUNFILES}\"" \
|
||||
@@ -689,6 +706,7 @@ msg "${TEST_RUNNER}" \
|
||||
{ PATH=$STF_PATH \
|
||||
${TEST_RUNNER} \
|
||||
${QUIET:+-q} \
|
||||
${DEBUG:+-D} \
|
||||
${KMEMLEAK:+-m} \
|
||||
${KMSG:+-K} \
|
||||
-c "${RUNFILES}" \
|
||||
@@ -715,6 +733,7 @@ if [ "$RESULT" -eq "2" ] && [ -n "$RERUN" ]; then
|
||||
{ PATH=$STF_PATH \
|
||||
${TEST_RUNNER} \
|
||||
${QUIET:+-q} \
|
||||
${DEBUG:+-D} \
|
||||
${KMEMLEAK:+-m} \
|
||||
-c "${RUNFILES}" \
|
||||
-T "${TAGS}" \
|
||||
|
||||
@@ -81,7 +81,8 @@ tests = ['block_cloning_clone_mmap_cached',
|
||||
'block_cloning_cross_enc_dataset',
|
||||
'block_cloning_copyfilerange_fallback_same_txg',
|
||||
'block_cloning_replay', 'block_cloning_replay_encrypted',
|
||||
'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write']
|
||||
'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
|
||||
'block_cloning_rlimit_fsize']
|
||||
tags = ['functional', 'block_cloning']
|
||||
|
||||
[tests/functional/bootfs]
|
||||
|
||||
@@ -121,7 +121,7 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
|
||||
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
|
||||
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
|
||||
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
|
||||
'scrub_after_resilver', 'zpool_status_-s']
|
||||
'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s']
|
||||
tags = ['functional', 'fault']
|
||||
|
||||
[tests/functional/features/large_dnode:Linux]
|
||||
|
||||
@@ -113,8 +113,9 @@ class Output(object):
|
||||
This class is a slightly modified version of the 'Stream' class found
|
||||
here: http://goo.gl/aSGfv
|
||||
"""
|
||||
def __init__(self, stream):
|
||||
def __init__(self, stream, debug=False):
|
||||
self.stream = stream
|
||||
self.debug = debug
|
||||
self._buf = b''
|
||||
self.lines = []
|
||||
|
||||
@@ -140,6 +141,8 @@ class Output(object):
|
||||
buf = os.read(fd, 4096)
|
||||
if not buf:
|
||||
return None
|
||||
if self.debug:
|
||||
os.write(sys.stderr.fileno(), buf)
|
||||
if b'\n' not in buf:
|
||||
self._buf += buf
|
||||
return []
|
||||
@@ -238,14 +241,14 @@ User: %s
|
||||
ret = '%s -E -u %s %s' % (SUDO, user, cmd)
|
||||
return ret.split(' ')
|
||||
|
||||
def collect_output(self, proc):
|
||||
def collect_output(self, proc, debug=False):
|
||||
"""
|
||||
Read from stdout/stderr as data becomes available, until the
|
||||
process is no longer running. Return the lines from the stdout and
|
||||
stderr Output objects.
|
||||
"""
|
||||
out = Output(proc.stdout)
|
||||
err = Output(proc.stderr)
|
||||
out = Output(proc.stdout, debug)
|
||||
err = Output(proc.stderr, debug)
|
||||
res = []
|
||||
while proc.returncode is None:
|
||||
proc.poll()
|
||||
@@ -308,7 +311,10 @@ User: %s
|
||||
|
||||
try:
|
||||
t.start()
|
||||
self.result.stdout, self.result.stderr = self.collect_output(proc)
|
||||
|
||||
out, err = self.collect_output(proc, options.debug)
|
||||
self.result.stdout = out
|
||||
self.result.stderr = err
|
||||
|
||||
if kmemleak:
|
||||
cmd = f'{SUDO} sh -c "echo scan > {KMEMLEAK_FILE}"'
|
||||
@@ -624,7 +630,7 @@ Tags: %s
|
||||
|
||||
|
||||
class TestRun(object):
|
||||
props = ['quiet', 'outputdir']
|
||||
props = ['quiet', 'outputdir', 'debug']
|
||||
|
||||
def __init__(self, options):
|
||||
self.tests = {}
|
||||
@@ -644,7 +650,8 @@ class TestRun(object):
|
||||
('post_user', ''),
|
||||
('failsafe', ''),
|
||||
('failsafe_user', ''),
|
||||
('tags', [])
|
||||
('tags', []),
|
||||
('debug', False)
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
@@ -1067,6 +1074,8 @@ def parse_args():
|
||||
help='Specify tests to run via config files.')
|
||||
parser.add_option('-d', action='store_true', default=False, dest='dryrun',
|
||||
help='Dry run. Print tests, but take no other action.')
|
||||
parser.add_option('-D', action='store_true', default=False, dest='debug',
|
||||
help='Write all test output to stdout as it arrives.')
|
||||
parser.add_option('-l', action='callback', callback=options_cb,
|
||||
default=None, dest='logfile', metavar='logfile',
|
||||
type='string',
|
||||
|
||||
@@ -182,7 +182,6 @@ if sys.platform.startswith('freebsd'):
|
||||
'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
|
||||
'cp_files/cp_files_002_pos': ['SKIP', na_reason],
|
||||
'link_count/link_count_001': ['SKIP', na_reason],
|
||||
'casenorm/mixed_create_failure': ['FAIL', 13215],
|
||||
'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
|
||||
'rsend/send_raw_ashift': ['SKIP', 14961],
|
||||
})
|
||||
@@ -331,6 +330,8 @@ elif sys.platform.startswith('linux'):
|
||||
['SKIP', cfr_reason],
|
||||
'block_cloning/block_cloning_replay_encrypted':
|
||||
['SKIP', cfr_reason],
|
||||
'block_cloning/block_cloning_rlimit_fsize':
|
||||
['SKIP', cfr_reason],
|
||||
'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
|
||||
'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
|
||||
'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
|
||||
@@ -380,6 +381,7 @@ if os.environ.get('CI') == 'true':
|
||||
'fault/auto_replace_002_pos': ['SKIP', ci_reason],
|
||||
'fault/auto_spare_ashift': ['SKIP', ci_reason],
|
||||
'fault/auto_spare_shared': ['SKIP', ci_reason],
|
||||
'fault/suspend_resume_single': ['SKIP', ci_reason],
|
||||
'procfs/pool_state': ['SKIP', ci_reason],
|
||||
})
|
||||
|
||||
|
||||
@@ -521,13 +521,15 @@ test_send_new(const char *snapshot, int fd)
|
||||
static void
|
||||
test_recv_new(const char *dataset, int fd)
|
||||
{
|
||||
dmu_replay_record_t drr = { 0 };
|
||||
dmu_replay_record_t drr;
|
||||
nvlist_t *required = fnvlist_alloc();
|
||||
nvlist_t *optional = fnvlist_alloc();
|
||||
nvlist_t *props = fnvlist_alloc();
|
||||
char snapshot[MAXNAMELEN + 32];
|
||||
ssize_t count;
|
||||
|
||||
memset(&drr, 0, sizeof (dmu_replay_record_t));
|
||||
|
||||
int cleanup_fd = open(ZFS_DEV, O_RDWR);
|
||||
if (cleanup_fd == -1) {
|
||||
(void) fprintf(stderr, "open(%s) failed: %s\n", ZFS_DEV,
|
||||
|
||||
@@ -62,11 +62,39 @@ function compare_version_gte
|
||||
}
|
||||
|
||||
# Helper function used by linux_version() and freebsd_version()
|
||||
# $1, if provided, should be a MAJOR, MAJOR.MINOR or MAJOR.MINOR.PATCH
|
||||
# version number
|
||||
function kernel_version
|
||||
{
|
||||
typeset ver="$1"
|
||||
|
||||
[ -z "$ver" ] && ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+")
|
||||
[ -z "$ver" ] && case "$UNAME" in
|
||||
Linux)
|
||||
# Linux version numbers are X.Y.Z followed by optional
|
||||
# vendor/distro specific stuff
|
||||
# RHEL7: 3.10.0-1160.108.1.el7.x86_64
|
||||
# Fedora 37: 6.5.12-100.fc37.x86_64
|
||||
# Debian 12.6: 6.1.0-22-amd64
|
||||
ver=$(uname -r | grep -Eo "^[0-9]+\.[0-9]+\.[0-9]+")
|
||||
;;
|
||||
FreeBSD)
|
||||
# FreeBSD version numbers are X.Y-BRANCH-pZ. Depending on
|
||||
# branch, -pZ may not be present, but this is typically only
|
||||
# on pre-release or true .0 releases, so can be assumed 0
|
||||
# if not present.
|
||||
# eg:
|
||||
# 13.2-RELEASE-p4
|
||||
# 14.1-RELEASE
|
||||
# 15.0-CURRENT
|
||||
ver=$(uname -r | \
|
||||
grep -Eo "[0-9]+\.[0-9]+(-[A-Z0-9]+-p[0-9]+)?" | \
|
||||
sed -E "s/-[^-]+-p/./")
|
||||
;;
|
||||
*)
|
||||
# Unknown system
|
||||
log_fail "Don't know how to get kernel version for '$UNAME'"
|
||||
;;
|
||||
esac
|
||||
|
||||
typeset version major minor _
|
||||
IFS='.' read -r version major minor _ <<<"$ver"
|
||||
|
||||
@@ -478,6 +478,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/block_cloning/block_cloning_replay.ksh \
|
||||
functional/block_cloning/block_cloning_replay_encrypted.ksh \
|
||||
functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \
|
||||
functional/block_cloning/block_cloning_rlimit_fsize.ksh \
|
||||
functional/bootfs/bootfs_001_pos.ksh \
|
||||
functional/bootfs/bootfs_002_neg.ksh \
|
||||
functional/bootfs/bootfs_003_pos.ksh \
|
||||
@@ -1476,6 +1477,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
|
||||
functional/fault/decompress_fault.ksh \
|
||||
functional/fault/decrypt_fault.ksh \
|
||||
functional/fault/scrub_after_resilver.ksh \
|
||||
functional/fault/suspend_resume_single.ksh \
|
||||
functional/fault/setup.ksh \
|
||||
functional/fault/zpool_status_-s.ksh \
|
||||
functional/features/async_destroy/async_destroy_001_pos.ksh \
|
||||
|
||||
@@ -55,7 +55,7 @@ function display_status
|
||||
((ret |= $?))
|
||||
|
||||
typeset mntpnt=$(get_prop mountpoint $pool)
|
||||
dd if=/dev/random of=$mntpnt/testfile.$$ &
|
||||
dd if=/dev/urandom of=$mntpnt/testfile.$$ &
|
||||
typeset pid=$!
|
||||
|
||||
zpool iostat -v 1 3 > /dev/null
|
||||
|
||||
+64
@@ -0,0 +1,64 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
|
||||
|
||||
#
|
||||
# DESCRIPTION:
|
||||
# When block cloning is used to implement copy_file_range(2), the
|
||||
# RLIMIT_FSIZE limit must be respected.
|
||||
#
|
||||
# STRATEGY:
|
||||
# 1. Create a pool.
|
||||
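# 2. Write a 1000-byte file of random data.
# 3. Set the file size limit to 2 blocks (ulimit -f 2) and clone the file;
#    the clone must succeed.
# 4. Lower the file size limit to 1 block (ulimit -f 1) and clone the file
#    again; the clone must fail.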
|
||||
#
|
||||
|
||||
verify_runnable "global"
|
||||
|
||||
VDIR=$TEST_BASE_DIR/disk-bclone
|
||||
VDEV="$VDIR/a"
|
||||
|
||||
# Test teardown (registered via log_onexit): destroy the test pool if it
# still exists, then remove the directory holding the file-backed vdev.
function cleanup
{
	if datasetexists $TESTPOOL; then
		destroy_pool $TESTPOOL
	fi
	rm -rf $VDIR
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_assert "Test for RLIMIT_FSIZE handling with block cloning enabled"
|
||||
|
||||
# Start from a clean directory and back the pool with a 1G file.
log_must rm -rf $VDIR
|
||||
log_must mkdir -p $VDIR
|
||||
log_must truncate -s 1G $VDEV
|
||||
|
||||
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV
|
||||
|
||||
# Source file for the clones: 1000 bytes of random data.
log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000
|
||||
|
||||
# NOTE(review): this relies on ulimit -f being counted in 512-byte blocks
# (as ksh and POSIX define it) - confirm for the shell running the test.
# Under that assumption a limit of 2 blocks (1024 bytes) leaves room for
# the 1000-byte file, so the clone is expected to succeed.
ulimit -f 2
|
||||
log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all
|
||||
# A limit of 1 block (512 bytes under the same assumption) is smaller than
# the file, so the same clone is now expected to fail.
ulimit -f 1
|
||||
log_mustnot clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file3 0 0 all
|
||||
|
||||
log_pass "copy_file_range(2) respects RLIMIT_FSIZE"
|
||||
@@ -84,7 +84,8 @@ function do_vol_test
|
||||
vol=$TESTPOOL/$TESTVOL1
|
||||
vol_b_path=$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL1
|
||||
|
||||
log_must zfs create -V $VOLSIZE -o copies=$copies $vol
|
||||
log_must zfs create -V $VOLSIZE -o compression=off -o copies=$copies \
|
||||
$vol
|
||||
log_must zfs set refreservation=none $vol
|
||||
block_device_wait $vol_b_path
|
||||
|
||||
@@ -116,31 +117,30 @@ function do_vol_test
|
||||
else
|
||||
log_must zpool create $TESTPOOL1 $vol_b_path
|
||||
fi
|
||||
log_must zfs create $TESTPOOL1/$TESTFS1
|
||||
log_must zfs create -o compression=off $TESTPOOL1/$TESTFS1
|
||||
sync_pool $TESTPOOL1
|
||||
;;
|
||||
*)
|
||||
log_unsupported "$type test not implemented"
|
||||
;;
|
||||
esac
|
||||
|
||||
((nfilesize = copies * ${FILESIZE%m}))
|
||||
sync_pool $TESTPOOL
|
||||
pre_used=$(get_prop used $vol)
|
||||
((target_size = pre_used + nfilesize))
|
||||
|
||||
if [[ $type == "zfs" ]]; then
|
||||
log_must mkfile $FILESIZE /$TESTPOOL1/$TESTFS1/$FILE
|
||||
sync_pool $TESTPOOL1
|
||||
else
|
||||
log_must mkfile $FILESIZE $mntp/$FILE
|
||||
log_must sync
|
||||
fi
|
||||
|
||||
sync_pool $TESTPOOL
|
||||
post_used=$(get_prop used $vol)
|
||||
((retries = 0))
|
||||
while ((post_used < target_size && retries++ < 42)); do
|
||||
sleep 1
|
||||
post_used=$(get_prop used $vol)
|
||||
done
|
||||
|
||||
((used = post_used - pre_used))
|
||||
((nfilesize = copies * ${FILESIZE%m}))
|
||||
if ((used < nfilesize)); then
|
||||
log_fail "The space is not charged correctly while setting" \
|
||||
"copies as $copies ($used < $nfilesize)" \
|
||||
@@ -153,5 +153,7 @@ function do_vol_test
|
||||
log_must umount $mntp
|
||||
fi
|
||||
|
||||
# Ubuntu 20.04 wants a sync here
|
||||
log_must sync
|
||||
log_must zfs destroy $vol
|
||||
}
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
#!/bin/ksh -p
|
||||
#
|
||||
# CDDL HEADER START
|
||||
#
|
||||
# The contents of this file are subject to the terms of the
|
||||
# Common Development and Distribution License (the "License").
|
||||
# You may not use this file except in compliance with the License.
|
||||
#
|
||||
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
||||
# or https://opensource.org/licenses/CDDL-1.0.
|
||||
# See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
# When distributing Covered Code, include this CDDL HEADER in each
|
||||
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
||||
# If applicable, add the following below this CDDL HEADER, with the
|
||||
# fields enclosed by brackets "[]" replaced with your own identifying
|
||||
# information: Portions Copyright [yyyy] [name of copyright owner]
|
||||
#
|
||||
# CDDL HEADER END
|
||||
#
|
||||
|
||||
#
|
||||
# Copyright (c) 2024, Klara Inc.
|
||||
#
|
||||
|
||||
. $STF_SUITE/include/libtest.shlib
|
||||
|
||||
set -x
|
||||
|
||||
DATAFILE="$TMPDIR/datafile"
|
||||
|
||||
# Test teardown (registered via log_onexit): destroy the test pool, unload
# the scsi_debug device and remove the scratch data file.
function cleanup
{
	destroy_pool $TESTPOOL
	unload_scsi_debug
	# The scratch file is named by $DATAFILE; the previous "$DATA_FILE"
	# spelling expanded to nothing, so the file was never removed.
	rm -f "$DATAFILE"
}
|
||||
|
||||
log_onexit cleanup
|
||||
|
||||
log_assert "ensure single-disk pool resumes properly after suspend and clear"
|
||||
|
||||
# create a file, and take a checksum, so we can compare later
|
||||
log_must dd if=/dev/urandom of=$DATAFILE bs=128K count=1
|
||||
# The checksum is read from a pipe rather than from a named file, so the
# result does not depend on the file's path and can later be compared with
# the checksum of the copy on the pool as a plain string.
typeset sum1=$(cat $DATAFILE | md5sum)
|
||||
|
||||
# make a debug device that we can "unplug"
|
||||
# NOTE(review): load_scsi_debug and get_debug_device are not defined in this
# file (presumably they come from libtest.shlib); confirm the meaning of the
# positional arguments there.
load_scsi_debug 100 1 1 1 '512b'
|
||||
sd=$(get_debug_device)
|
||||
|
||||
# create a single-device pool
|
||||
log_must zpool create $TESTPOOL $sd
|
||||
log_must zpool sync
|
||||
|
||||
# "pull" the disk
|
||||
log_must eval "echo offline > /sys/block/$sd/device/state"
|
||||
|
||||
# copy data onto the pool. it'll appear to succeed, but only be in memory
|
||||
log_must cp $DATAFILE /$TESTPOOL/file
|
||||
|
||||
# wait until sync starts, and the pool suspends
|
||||
log_note "waiting for pool to suspend"
|
||||
# Poll the pool's kstat state file once per second. The test fails if the
# state is still not SUSPENDED after ten one-second waits.
typeset -i tries=10
|
||||
until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
|
||||
if ((tries-- == 0)); then
|
||||
log_fail "pool didn't suspend"
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
# return the disk
|
||||
log_must eval "echo running > /sys/block/$sd/device/state"
|
||||
|
||||
# clear the error states, which should reopen the vdev, get the pool back
|
||||
# online, and replay the failed IO
|
||||
log_must zpool clear $TESTPOOL
|
||||
|
||||
# wait a while for everything to sync out. if something is going to go wrong,
|
||||
# this is where it will happen
|
||||
log_note "giving pool time to settle and complete txg"
|
||||
sleep 7
|
||||
|
||||
# if the pool suspended, then everything is bad
|
||||
if [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; then
|
||||
log_fail "pool suspended"
|
||||
fi
|
||||
|
||||
# export the pool, to make sure it exports clean, and also to clear the file
|
||||
# out of the cache
|
||||
log_must zpool export $TESTPOOL
|
||||
|
||||
# import the pool
|
||||
log_must zpool import $TESTPOOL
|
||||
|
||||
# sum the file we wrote earlier
|
||||
typeset sum2=$(cat /$TESTPOOL/file | md5sum)
|
||||
|
||||
# make sure the checksums match
|
||||
log_must test "$sum1" = "$sum2"
|
||||
|
||||
log_pass "single-disk pool resumes properly after disk suspend and clear"
|
||||
@@ -37,11 +37,7 @@ export TMP_HISTORY=$TEST_BASE_DIR/tmp_history.$$
|
||||
export NEW_HISTORY=$TEST_BASE_DIR/new_history.$$
|
||||
|
||||
export MIGRATEDPOOLNAME=${MIGRATEDPOOLNAME:-history_pool}
|
||||
if is_freebsd; then
|
||||
export TIMEZONE=${TIMEZONE:-America/Denver}
|
||||
else
|
||||
export TIMEZONE=${TIMEZONE:-US/Mountain}
|
||||
fi
|
||||
export TIMEZONE=${TIMEZONE:-America/Denver}
|
||||
|
||||
export HIST_USER="huser"
|
||||
export HIST_GROUP="hgroup"
|
||||
|
||||
@@ -41,13 +41,13 @@ verify_runnable "global"
|
||||
|
||||
|
||||
if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then
|
||||
log_unsupported "Requires io_uring support"
|
||||
log_unsupported "Requires io_uring support within Kernel"
|
||||
fi
|
||||
|
||||
if [ -e /etc/os-release ] ; then
|
||||
source /etc/os-release
|
||||
if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then
|
||||
log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'"
|
||||
if [ $PLATFORM_ID = "platform:el9" ]; then
|
||||
log_unsupported "Disabled on RHEL 9 variants: fails with 'Operation not permitted'"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ log_must zfs destroy -R $clone2
|
||||
log_must eval "zfs send -i $sendfs#book2 --redact book3 $sendfs@snap2 >$stream"
|
||||
log_must eval "zfs recv $recvfs <$stream"
|
||||
log_must mount_redacted -f $recvfs
|
||||
log_must diff <(ls $send_mnt) <(ls $recv_mnt)
|
||||
log_must [ "$(ls $send_mnt)" == "$(ls $recv_mnt)" ]
|
||||
log_must zfs destroy -R $recvfs
|
||||
log_must zfs rollback -R $sendfs@snap
|
||||
|
||||
|
||||
@@ -71,8 +71,7 @@ log_must ismounted $recvfs
|
||||
# deleted.
|
||||
contents=$(log_must find $recv_mnt)
|
||||
contents_orig=$(log_must find $send_mnt)
|
||||
log_must diff <(echo ${contents//$recv_mnt/}) \
|
||||
<(echo ${contents_orig//$send_mnt/})
|
||||
log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ]
|
||||
log_must zfs redact $sendvol@snap book2 $clonevol@snap
|
||||
log_must eval "zfs send --redact book2 $sendvol@snap >$stream"
|
||||
log_must eval "zfs receive $recvvol <$stream"
|
||||
@@ -103,7 +102,6 @@ log_must mount_redacted -f $recvfs
|
||||
log_must ismounted $recvfs
|
||||
contents=$(log_must find $recv_mnt)
|
||||
contents_orig=$(log_must find $send_mnt)
|
||||
log_must diff <(echo ${contents//$recv_mnt/}) \
|
||||
<(echo ${contents_orig//$send_mnt/})
|
||||
log_must [ "${contents//$recv_mnt/}" == "${contents_orig//$send_mnt/}" ]
|
||||
|
||||
log_pass "Received redacted streams can be mounted."
|
||||
|
||||
Reference in New Issue
Block a user