update/rebase to zfs-0.7.10 with patches from ZOL

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
This commit is contained in:
Stoiko Ivanov 2018-09-11 11:43:41 +02:00 committed by Thomas Lamprecht
parent f0371a1b16
commit a010b40938
32 changed files with 6046 additions and 3 deletions

View File

@ -0,0 +1,124 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Boris Protopopov <bprotopopov@users.noreply.github.com>
Date: Wed, 9 Aug 2017 14:10:47 -0400
Subject: [PATCH] zv_suspend_lock in zvol_open()/zvol_release()
Acquire zv_suspend_lock on first open and last close only.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Boris Protopopov <boris.protopopov@actifio.com>
Closes #6342
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/zvol.c | 64 +++++++++++++++++++++++++++++++++++--------------------
1 file changed, 41 insertions(+), 23 deletions(-)
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 3e7059b3..ffa5fac7 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -1347,9 +1347,9 @@ zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
int error = 0;
- boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_suspend = B_TRUE;
- ASSERT(!mutex_owned(&zvol_state_lock));
+ ASSERT(!MUTEX_HELD(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
/*
@@ -1364,23 +1364,31 @@ zvol_open(struct block_device *bdev, fmode_t flag)
return (SET_ERROR(-ENXIO));
}
- /* take zv_suspend_lock before zv_state_lock */
- rw_enter(&zv->zv_suspend_lock, RW_READER);
-
mutex_enter(&zv->zv_state_lock);
-
/*
* make sure zvol is not suspended during first open
- * (hold zv_suspend_lock), otherwise, drop the lock
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
- drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
} else {
- rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
}
-
mutex_exit(&zvol_state_lock);
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
+
if (zv->zv_open_count == 0) {
error = zvol_first_open(zv);
if (error)
@@ -1417,28 +1425,38 @@ static int
zvol_release(struct gendisk *disk, fmode_t mode)
{
zvol_state_t *zv;
- boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_suspend = B_TRUE;
- ASSERT(!mutex_owned(&zvol_state_lock));
+ ASSERT(!MUTEX_HELD(&zvol_state_lock));
mutex_enter(&zvol_state_lock);
zv = disk->private_data;
- ASSERT(zv && zv->zv_open_count > 0);
-
- /* take zv_suspend_lock before zv_state_lock */
- rw_enter(&zv->zv_suspend_lock, RW_READER);
mutex_enter(&zv->zv_state_lock);
- mutex_exit(&zvol_state_lock);
-
+ ASSERT(zv->zv_open_count > 0);
/*
* make sure zvol is not suspended during last close
- * (hold zv_suspend_lock), otherwise, drop the lock
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
*/
- if (zv->zv_open_count == 1)
- drop_suspend = B_TRUE;
- else
- rw_exit(&zv->zv_suspend_lock);
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ mutex_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
zv->zv_open_count--;
if (zv->zv_open_count == 0)

View File

@ -0,0 +1,560 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Sun, 12 Aug 2018 18:22:03 -0400
Subject: [PATCH] Linux 4.18 compat: inode timespec -> timespec64
Commit torvalds/linux@95582b0 changes the inode i_atime, i_mtime,
and i_ctime members from timespec's to timespec64's to make them
2038 safe. As part of this change the current_time() function was
also updated to return the timespec64 type.
Resolve this issue by introducing a new inode_timespec_t type which
is defined to match the timespec type used by the inode. It should
be used when working with inode timestamps to ensure matching types.
The timestruc_t type under Illumos was used in a similar fashion but
was specified to always be a timespec_t. Rather than incorrectly
define this type all timespec_t types have been replaced by the new
inode_timespec_t type.
Finally, the kernel and user space 'sys/time.h' headers were aligned
with each other. They define, as appropriate for the context, several
constants as macros and include static inline implementations of
gethrestime(), gethrestime_sec(), and gethrtime().
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #7643
Backported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
config/kernel-current-time.m4 | 7 +++----
include/sys/dmu.h | 2 +-
include/sys/dmu_objset.h | 2 +-
include/sys/dsl_dir.h | 4 ++--
include/sys/spa_impl.h | 2 +-
include/sys/xvattr.h | 2 +-
include/sys/zfs_context.h | 9 +--------
include/sys/zfs_znode.h | 33 +++++++++++++++++++++++--------
include/sys/zpl.h | 9 +++++++++
lib/libspl/Makefile.am | 2 --
lib/libspl/gethrestime.c | 38 ------------------------------------
lib/libspl/gethrtime.c | 45 -------------------------------------------
lib/libspl/include/sys/time.h | 37 +++++++++++++++++++++++++++--------
lib/libzpool/kernel.c | 4 ++--
module/zfs/dmu_objset.c | 2 +-
module/zfs/dsl_dir.c | 6 +++---
module/zfs/fm.c | 2 +-
module/zfs/zfs_ctldir.c | 2 +-
module/zfs/zfs_vnops.c | 4 ++--
module/zfs/zfs_znode.c | 4 ++--
module/zfs/zpl_inode.c | 5 +++--
21 files changed, 88 insertions(+), 133 deletions(-)
delete mode 100644 lib/libspl/gethrestime.c
delete mode 100644 lib/libspl/gethrtime.c
diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4
index 2ede9ff3..c7d5c9b5 100644
--- a/config/kernel-current-time.m4
+++ b/config/kernel-current-time.m4
@@ -1,15 +1,14 @@
dnl #
dnl # 4.9, current_time() added
+dnl # 4.18, return type changed from timespec to timespec64
dnl #
AC_DEFUN([ZFS_AC_KERNEL_CURRENT_TIME],
[AC_MSG_CHECKING([whether current_time() exists])
ZFS_LINUX_TRY_COMPILE_SYMBOL([
#include <linux/fs.h>
], [
- struct inode ip;
- struct timespec now __attribute__ ((unused));
-
- now = current_time(&ip);
+ struct inode ip __attribute__ ((unused));
+ ip.i_atime = current_time(&ip);
], [current_time], [fs/inode.c], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_CURRENT_TIME, 1, [current_time() exists])
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index bcdf7d64..755a9056 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -891,7 +891,7 @@ uint64_t dmu_objset_fsid_guid(objset_t *os);
/*
* Get the [cm]time for an objset's snapshot dir
*/
-timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
int dmu_objset_is_snapshot(objset_t *os);
diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h
index a836e037..531e81d4 100644
--- a/include/sys/dmu_objset.h
+++ b/include/sys/dmu_objset.h
@@ -179,7 +179,7 @@ int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
int func(struct dsl_pool *, struct dsl_dataset *, void *),
void *arg, int flags);
void dmu_objset_evict_dbufs(objset_t *os);
-timestruc_t dmu_objset_snap_cmtime(objset_t *os);
+inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
/* called from dsl */
void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
index 69b0b6a5..80e83fdc 100644
--- a/include/sys/dsl_dir.h
+++ b/include/sys/dsl_dir.h
@@ -103,7 +103,7 @@ struct dsl_dir {
/* Protected by dd_lock */
kmutex_t dd_lock;
list_t dd_props; /* list of dsl_prop_record_t's */
- timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+ inode_timespec_t dd_snap_cmtime; /* last snapshot namespace change */
uint64_t dd_origin_txg;
/* gross estimate of space used by in-flight tx's */
@@ -159,7 +159,7 @@ boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
-timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
+inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index b1e78c1d..fa7490ac 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -153,7 +153,7 @@ struct spa {
uint64_t spa_freeze_txg; /* freeze pool at this txg */
uint64_t spa_load_max_txg; /* best initial ub_txg */
uint64_t spa_claim_max_txg; /* highest claimed birth txg */
- timespec_t spa_loaded_ts; /* 1st successful open time */
+ inode_timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
list_t spa_evicting_os_list; /* Objsets being evicted. */
diff --git a/include/sys/xvattr.h b/include/sys/xvattr.h
index 4779b632..5d38927c 100644
--- a/include/sys/xvattr.h
+++ b/include/sys/xvattr.h
@@ -47,7 +47,7 @@
* Structure of all optional attributes.
*/
typedef struct xoptattr {
- timestruc_t xoa_createtime; /* Create time of file */
+ inode_timespec_t xoa_createtime; /* Create time of file */
uint8_t xoa_archive;
uint8_t xoa_system;
uint8_t xoa_readonly;
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 4fe35342..68c58f95 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -527,7 +527,7 @@ extern char *vn_dumpdir;
#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */
typedef struct xoptattr {
- timestruc_t xoa_createtime; /* Create time of file */
+ inode_timespec_t xoa_createtime; /* Create time of file */
uint8_t xoa_archive;
uint8_t xoa_system;
uint8_t xoa_readonly;
@@ -640,13 +640,6 @@ extern void delay(clock_t ticks);
#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz))
#define NSEC_TO_TICK(usec) ((usec) / (NANOSEC / hz))
-#define gethrestime_sec() time(NULL)
-#define gethrestime(t) \
- do {\
- (t)->tv_sec = gethrestime_sec();\
- (t)->tv_nsec = 0;\
- } while (0);
-
#define max_ncpus 64
#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN))
diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index c292f037..26d1eb37 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -270,19 +270,36 @@ typedef struct znode_hold {
extern unsigned int zfs_object_mutex_size;
-/* Encode ZFS stored time values from a struct timespec */
+/*
+ * Encode ZFS stored time values from a struct timespec / struct timespec64.
+ */
#define ZFS_TIME_ENCODE(tp, stmp) \
-{ \
+do { \
(stmp)[0] = (uint64_t)(tp)->tv_sec; \
(stmp)[1] = (uint64_t)(tp)->tv_nsec; \
-}
+} while (0)
-/* Decode ZFS stored time values to a struct timespec */
+#if defined(HAVE_INODE_TIMESPEC64_TIMES)
+/*
+ * Decode ZFS stored time values to a struct timespec64
+ * 4.18 and newer kernels.
+ */
#define ZFS_TIME_DECODE(tp, stmp) \
-{ \
- (tp)->tv_sec = (time_t)(stmp)[0]; \
- (tp)->tv_nsec = (long)(stmp)[1]; \
-}
+do { \
+ (tp)->tv_sec = (time64_t)(stmp)[0]; \
+ (tp)->tv_nsec = (long)(stmp)[1]; \
+} while (0)
+#else
+/*
+ * Decode ZFS stored time values to a struct timespec
+ * 4.17 and older kernels.
+ */
+#define ZFS_TIME_DECODE(tp, stmp) \
+do { \
+ (tp)->tv_sec = (time_t)(stmp)[0]; \
+ (tp)->tv_nsec = (long)(stmp)[1]; \
+} while (0)
+#endif /* HAVE_INODE_TIMESPEC64_TIMES */
/*
* Timestamp defines
diff --git a/include/sys/zpl.h b/include/sys/zpl.h
index 65ed4313..e433fbc6 100644
--- a/include/sys/zpl.h
+++ b/include/sys/zpl.h
@@ -189,4 +189,13 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx)
}
#endif /* HAVE_VFS_ITERATE */
+/*
+ * Linux 4.18, inode times converted from timespec to timespec64.
+ */
+#if defined(HAVE_INODE_TIMESPEC64_TIMES)
+#define zpl_inode_timespec_trunc(ts, gran) timespec64_trunc(ts, gran)
+#else
+#define zpl_inode_timespec_trunc(ts, gran) timespec_trunc(ts, gran)
+#endif
+
#endif /* _SYS_ZPL_H */
diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am
index 59bc8ffb..a6e63cb8 100644
--- a/lib/libspl/Makefile.am
+++ b/lib/libspl/Makefile.am
@@ -19,8 +19,6 @@ noinst_LTLIBRARIES = libspl.la
USER_C = \
getexecname.c \
- gethrtime.c \
- gethrestime.c \
getmntany.c \
list.c \
mkdirp.c \
diff --git a/lib/libspl/gethrestime.c b/lib/libspl/gethrestime.c
deleted file mode 100644
index d37cc2d5..00000000
--- a/lib/libspl/gethrestime.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <time.h>
-#include <sys/time.h>
-
-void
-gethrestime(timestruc_t *ts)
-{
- struct timeval tv;
-
- gettimeofday(&tv, NULL);
- ts->tv_sec = tv.tv_sec;
- ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC;
-}
diff --git a/lib/libspl/gethrtime.c b/lib/libspl/gethrtime.c
deleted file mode 100644
index 95ceb18e..00000000
--- a/lib/libspl/gethrtime.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <time.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-hrtime_t
-gethrtime(void)
-{
- struct timespec ts;
- int rc;
-
- rc = clock_gettime(CLOCK_MONOTONIC, &ts);
- if (rc) {
- fprintf(stderr, "Error: clock_gettime() = %d\n", rc);
- abort();
- }
-
- return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec);
-}
diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h
index dc645fa5..04b3ba87 100644
--- a/lib/libspl/include/sys/time.h
+++ b/lib/libspl/include/sys/time.h
@@ -27,8 +27,9 @@
#ifndef _LIBSPL_SYS_TIME_H
#define _LIBSPL_SYS_TIME_H
-#include_next <sys/time.h>
+#include <time.h>
#include <sys/types.h>
+#include_next <sys/time.h>
#ifndef SEC
#define SEC 1
@@ -70,13 +71,33 @@
#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))
#endif
-
typedef long long hrtime_t;
-typedef struct timespec timestruc_t;
-typedef struct timespec timespec_t;
-
-
-extern hrtime_t gethrtime(void);
-extern void gethrestime(timestruc_t *);
+typedef struct timespec timespec_t;
+typedef struct timespec inode_timespec_t;
+
+static inline void
+gethrestime(inode_timespec_t *ts)
+{
+ struct timeval tv;
+ (void) gettimeofday(&tv, NULL);
+ ts->tv_sec = tv.tv_sec;
+ ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC;
+}
+
+static inline time_t
+gethrestime_sec(void)
+{
+ struct timeval tv;
+ (void) gettimeofday(&tv, NULL);
+ return (tv.tv_sec);
+}
+
+static inline hrtime_t
+gethrtime(void)
+{
+ struct timespec ts;
+ (void) clock_gettime(CLOCK_MONOTONIC, &ts);
+ return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec);
+}
#endif /* _LIBSPL_SYS_TIME_H */
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index e67d13c9..3ea8778b 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -498,7 +498,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
{
int error;
struct timeval tv;
- timestruc_t ts;
+ struct timespec ts;
clock_t delta;
ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
@@ -536,7 +536,7 @@ cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
{
int error;
struct timeval tv;
- timestruc_t ts;
+ struct timespec ts;
hrtime_t delta;
ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 3425d542..449ebedf 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -860,7 +860,7 @@ dmu_objset_evict_done(objset_t *os)
kmem_free(os, sizeof (objset_t));
}
-timestruc_t
+inode_timespec_t
dmu_objset_snap_cmtime(objset_t *os)
{
return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index a3ef5896..deecf6bc 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -1975,10 +1975,10 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
return (0);
}
-timestruc_t
+inode_timespec_t
dsl_dir_snap_cmtime(dsl_dir_t *dd)
{
- timestruc_t t;
+ inode_timespec_t t;
mutex_enter(&dd->dd_lock);
t = dd->dd_snap_cmtime;
@@ -1990,7 +1990,7 @@ dsl_dir_snap_cmtime(dsl_dir_t *dd)
void
dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
{
- timestruc_t t;
+ inode_timespec_t t;
gethrestime(&t);
mutex_enter(&dd->dd_lock);
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index cb148149..9d26cc99 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -508,8 +508,8 @@ zfs_zevent_insert(zevent_t *ev)
int
zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
{
+ inode_timespec_t tv;
int64_t tv_array[2];
- timestruc_t tv;
uint64_t eid;
size_t nvl_size = 0;
zevent_t *ev;
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 14af55c4..25edea78 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -449,7 +449,7 @@ static struct inode *
zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
{
- struct timespec now;
+ inode_timespec_t now;
struct inode *ip;
znode_t *zp;
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 0d2b61a1..34ea751c 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -3158,7 +3158,7 @@ top:
if (mask & (ATTR_MTIME | ATTR_SIZE)) {
ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
- ZTOI(zp)->i_mtime = timespec_trunc(vap->va_mtime,
+ ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime,
ZTOI(zp)->i_sb->s_time_gran);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
@@ -3167,7 +3167,7 @@ top:
if (mask & (ATTR_CTIME | ATTR_SIZE)) {
ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
- ZTOI(zp)->i_ctime = timespec_trunc(vap->va_ctime,
+ ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime,
ZTOI(zp)->i_sb->s_time_gran);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
ctime, sizeof (ctime));
diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c
index f508a248..e222c791 100644
--- a/module/zfs/zfs_znode.c
+++ b/module/zfs/zfs_znode.c
@@ -700,7 +700,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
uint64_t rdev = 0;
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
dmu_buf_t *db;
- timestruc_t now;
+ inode_timespec_t now;
uint64_t gen, obj;
int bonuslen;
int dnodesize;
@@ -1349,7 +1349,7 @@ void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
uint64_t ctime[2])
{
- timestruc_t now;
+ inode_timespec_t now;
gethrestime(&now);
diff --git a/module/zfs/zpl_inode.c b/module/zfs/zpl_inode.c
index 3b5643d0..41b91cab 100644
--- a/module/zfs/zpl_inode.c
+++ b/module/zfs/zpl_inode.c
@@ -384,9 +384,10 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia)
vap->va_mtime = ia->ia_mtime;
vap->va_ctime = ia->ia_ctime;
- if (vap->va_mask & ATTR_ATIME)
- ip->i_atime = timespec_trunc(ia->ia_atime,
+ if (vap->va_mask & ATTR_ATIME) {
+ ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime,
ip->i_sb->s_time_gran);
+ }
cookie = spl_fstrans_mark();
error = -zfs_setattr(ip, vap, 0, cr);

View File

@ -0,0 +1,808 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 15 Jun 2018 15:05:21 -0700
Subject: [PATCH] Linux compat 4.18: check_disk_size_change()
Added support for the bops->check_events() interface which was
added in the 2.6.38 kernel to replace bops->media_changed().
Fully implementing this functionality allows the volume resize
code to rely on revalidate_disk(), which is the preferred
mechanism, and removes the need to use check_disk_size_change().
In order for bops->check_events() to lookup the zvol_state_t
stored in the disk->private_data the zvol_state_lock needs to
be held. Since the check events interface may poll, the mutex
has been converted to a rwlock for better concurrency. The
rwlock need only be taken as a writer in the zvol_free() path
when disk->private_data is set to NULL.
The configure checks for the block_device_operations structure
were consolidated in a single kernel-block-device-operations.m4
file.
The ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS configure checks
and associated dead code were removed. This interface was added
to the 2.6.28 kernel which predates the oldest supported 2.6.32
kernel and will therefore always be available.
Updated maximum Linux version in META file. The 4.17 kernel
was released on 2018-06-03 and ZoL is compatible with the
finalized kernel.
Reviewed-by: Boris Protopopov <boris.protopopov@actifio.com>
Reviewed-by: Sara Hartse <sara.hartse@delphix.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #7611
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
config/kernel-bdev-block-device-operations.m4 | 34 ---
.../kernel-block-device-operations-release-void.m4 | 29 ---
config/kernel-block-device-operations.m4 | 57 +++++
config/kernel.m4 | 2 +-
include/linux/blkdev_compat.h | 1 +
module/zfs/zvol.c | 259 +++++++++------------
6 files changed, 174 insertions(+), 208 deletions(-)
delete mode 100644 config/kernel-bdev-block-device-operations.m4
delete mode 100644 config/kernel-block-device-operations-release-void.m4
create mode 100644 config/kernel-block-device-operations.m4
diff --git a/config/kernel-bdev-block-device-operations.m4 b/config/kernel-bdev-block-device-operations.m4
deleted file mode 100644
index faacc195..00000000
--- a/config/kernel-bdev-block-device-operations.m4
+++ /dev/null
@@ -1,34 +0,0 @@
-dnl #
-dnl # 2.6.x API change
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS], [
- AC_MSG_CHECKING([block device operation prototypes])
- tmp_flags="$EXTRA_KCFLAGS"
- EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
- ZFS_LINUX_TRY_COMPILE([
- #include <linux/blkdev.h>
-
- int blk_open(struct block_device *bdev, fmode_t mode)
- { return 0; }
- int blk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned x, unsigned long y) { return 0; }
- int blk_compat_ioctl(struct block_device * bdev, fmode_t mode,
- unsigned x, unsigned long y) { return 0; }
-
- static const struct block_device_operations
- bops __attribute__ ((unused)) = {
- .open = blk_open,
- .release = NULL,
- .ioctl = blk_ioctl,
- .compat_ioctl = blk_compat_ioctl,
- };
- ],[
- ],[
- AC_MSG_RESULT(struct block_device)
- AC_DEFINE(HAVE_BDEV_BLOCK_DEVICE_OPERATIONS, 1,
- [struct block_device_operations use bdevs])
- ],[
- AC_MSG_RESULT(struct inode)
- ])
- EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/config/kernel-block-device-operations-release-void.m4 b/config/kernel-block-device-operations-release-void.m4
deleted file mode 100644
index a73f8587..00000000
--- a/config/kernel-block-device-operations-release-void.m4
+++ /dev/null
@@ -1,29 +0,0 @@
-dnl #
-dnl # 3.10.x API change
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [
- AC_MSG_CHECKING([whether block_device_operations.release is void])
- tmp_flags="$EXTRA_KCFLAGS"
- EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
- ZFS_LINUX_TRY_COMPILE([
- #include <linux/blkdev.h>
-
- void blk_release(struct gendisk *g, fmode_t mode) { return; }
-
- static const struct block_device_operations
- bops __attribute__ ((unused)) = {
- .open = NULL,
- .release = blk_release,
- .ioctl = NULL,
- .compat_ioctl = NULL,
- };
- ],[
- ],[
- AC_MSG_RESULT(void)
- AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID, 1,
- [struct block_device_operations.release returns void])
- ],[
- AC_MSG_RESULT(int)
- ])
- EXTRA_KCFLAGS="$tmp_flags"
-])
diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4
new file mode 100644
index 00000000..5f2811c1
--- /dev/null
+++ b/config/kernel-block-device-operations.m4
@@ -0,0 +1,57 @@
+dnl #
+dnl # 2.6.38 API change
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [
+ AC_MSG_CHECKING([whether bops->check_events() exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+ ZFS_LINUX_TRY_COMPILE([
+ #include <linux/blkdev.h>
+
+ unsigned int blk_check_events(struct gendisk *disk,
+ unsigned int clearing) { return (0); }
+
+ static const struct block_device_operations
+ bops __attribute__ ((unused)) = {
+ .check_events = blk_check_events,
+ };
+ ],[
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS, 1,
+ [bops->check_events() exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 3.10.x API change
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [
+ AC_MSG_CHECKING([whether bops->release() is void])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="${NO_UNUSED_BUT_SET_VARIABLE}"
+ ZFS_LINUX_TRY_COMPILE([
+ #include <linux/blkdev.h>
+
+ void blk_release(struct gendisk *g, fmode_t mode) { return; }
+
+ static const struct block_device_operations
+ bops __attribute__ ((unused)) = {
+ .open = NULL,
+ .release = blk_release,
+ .ioctl = NULL,
+ .compat_ioctl = NULL,
+ };
+ ],[
+ ],[
+ AC_MSG_RESULT(void)
+ AC_DEFINE(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID, 1,
+ [bops->release() returns void])
+ ],[
+ AC_MSG_RESULT(int)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 375e4b79..c7ca260c 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -12,7 +12,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_CURRENT_BIO_TAIL
ZFS_AC_KERNEL_SUPER_USER_NS
ZFS_AC_KERNEL_SUBMIT_BIO
- ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS
+ ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
ZFS_AC_KERNEL_TYPE_FMODE_T
ZFS_AC_KERNEL_3ARG_BLKDEV_GET
diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h
index f99980ab..27f05662 100644
--- a/include/linux/blkdev_compat.h
+++ b/include/linux/blkdev_compat.h
@@ -32,6 +32,7 @@
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/backing-dev.h>
+#include <linux/msdos_fs.h> /* for SECTOR_* */
#ifndef HAVE_FMODE_T
typedef unsigned __bitwise__ fmode_t;
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index ffa5fac7..03f95630 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -99,7 +99,7 @@ unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
static taskq_t *zvol_taskq;
-static kmutex_t zvol_state_lock;
+static krwlock_t zvol_state_lock;
static list_t zvol_state_list;
#define ZVOL_HT_SIZE 1024
@@ -176,17 +176,17 @@ zvol_find_by_dev(dev_t dev)
{
zvol_state_t *zv;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL;
zv = list_next(&zvol_state_list, zv)) {
mutex_enter(&zv->zv_state_lock);
if (zv->zv_dev == dev) {
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (NULL);
}
@@ -204,7 +204,7 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
zvol_state_t *zv;
struct hlist_node *p = NULL;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
zv = hlist_entry(p, zvol_state_t, zv_hlink);
mutex_enter(&zv->zv_state_lock);
@@ -227,12 +227,12 @@ zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
strncmp(zv->zv_name, name, MAXNAMELEN)
== 0);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (zv);
}
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (NULL);
}
@@ -339,24 +339,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv)
return (SET_ERROR(error));
}
-static void
-zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
-{
- struct block_device *bdev;
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
-
- bdev = bdget_disk(zv->zv_disk, 0);
- if (bdev == NULL)
- return;
-
- set_capacity(zv->zv_disk, volsize >> 9);
- zv->zv_volsize = volsize;
- check_disk_size_change(zv->zv_disk, bdev);
-
- bdput(bdev);
-}
-
/*
* Sanity check volume size.
*/
@@ -409,31 +391,17 @@ zvol_update_volsize(uint64_t volsize, objset_t *os)
return (error);
}
-static int
-zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
-{
- zvol_size_changed(zv, volsize);
-
- /*
- * We should post a event here describing the expansion. However,
- * the zfs_ereport_post() interface doesn't nicely support posting
- * events for zvols, it assumes events relate to vdevs or zios.
- */
-
- return (0);
-}
-
/*
- * Set ZFS_PROP_VOLSIZE set entry point.
+ * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume
+ * size will result in a udev "change" event being generated.
*/
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
- zvol_state_t *zv = NULL;
objset_t *os = NULL;
- int error;
- dmu_object_info_t *doi;
+ struct gendisk *disk = NULL;
uint64_t readonly;
+ int error;
boolean_t owned = B_FALSE;
error = dsl_prop_get_integer(name,
@@ -443,7 +411,7 @@ zvol_set_volsize(const char *name, uint64_t volsize)
if (readonly)
return (SET_ERROR(EROFS));
- zv = zvol_find_by_name(name, RW_READER);
+ zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
RW_READ_HELD(&zv->zv_suspend_lock)));
@@ -464,16 +432,18 @@ zvol_set_volsize(const char *name, uint64_t volsize)
os = zv->zv_objset;
}
- doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
(error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
goto out;
error = zvol_update_volsize(volsize, os);
-
- if (error == 0 && zv != NULL)
- error = zvol_update_live_volsize(zv, volsize);
+ if (error == 0 && zv != NULL) {
+ zv->zv_volsize = volsize;
+ zv->zv_changed = 1;
+ disk = zv->zv_disk;
+ }
out:
kmem_free(doi, sizeof (dmu_object_info_t));
@@ -488,6 +458,9 @@ out:
if (zv != NULL)
mutex_exit(&zv->zv_state_lock);
+ if (disk != NULL)
+ revalidate_disk(disk);
+
return (SET_ERROR(error));
}
@@ -543,8 +516,8 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
if (zv == NULL)
return (SET_ERROR(ENXIO));
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_flags & ZVOL_RDONLY) {
mutex_exit(&zv->zv_state_lock);
@@ -1120,7 +1093,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
static void
zvol_insert(zvol_state_t *zv)
{
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0);
list_insert_head(&zvol_state_list, zv);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
@@ -1132,7 +1105,7 @@ zvol_insert(zvol_state_t *zv)
static void
zvol_remove(zvol_state_t *zv)
{
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
list_remove(&zvol_state_list, zv);
hlist_del(&zv->zv_hlink);
}
@@ -1148,8 +1121,8 @@ zvol_setup_zv(zvol_state_t *zv)
uint64_t ro;
objset_t *os = zv->zv_objset;
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
if (error)
@@ -1227,8 +1200,8 @@ zvol_suspend(const char *name)
return (NULL);
/* block all I/O, release in zvol_resume. */
- ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
- RW_WRITE_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
atomic_inc(&zv->zv_suspend_ref);
@@ -1349,9 +1322,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
int error = 0;
boolean_t drop_suspend = B_TRUE;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
-
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
/*
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
@@ -1360,7 +1331,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
*/
zv = bdev->bd_disk->private_data;
if (zv == NULL) {
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
}
@@ -1384,7 +1355,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
} else {
drop_suspend = B_FALSE;
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
@@ -1402,11 +1373,18 @@ zvol_open(struct block_device *bdev, fmode_t flag)
zv->zv_open_count++;
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
check_disk_change(bdev);
+ return (0);
+
out_open_count:
if (zv->zv_open_count == 0)
zvol_last_close(zv);
+
out_mutex:
mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
@@ -1427,9 +1405,7 @@ zvol_release(struct gendisk *disk, fmode_t mode)
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
-
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
zv = disk->private_data;
mutex_enter(&zv->zv_state_lock);
@@ -1453,7 +1429,7 @@ zvol_release(struct gendisk *disk, fmode_t mode)
} else {
drop_suspend = B_FALSE;
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
@@ -1479,7 +1455,7 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode,
zvol_state_t *zv = bdev->bd_disk->private_data;
int error = 0;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
switch (cmd) {
case BLKFLSBUF:
@@ -1519,23 +1495,62 @@ zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
#define zvol_compat_ioctl NULL
#endif
+/*
+ * Linux 2.6.38 preferred interface.
+ */
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ unsigned int mask = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (mask);
+}
+#else
static int zvol_media_changed(struct gendisk *disk)
{
+ int changed = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ changed = zv->zv_changed;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
- ASSERT(zv && zv->zv_open_count > 0);
+ rw_exit(&zvol_state_lock);
- return (zv->zv_changed);
+ return (changed);
}
+#endif
static int zvol_revalidate_disk(struct gendisk *disk)
{
- zvol_state_t *zv = disk->private_data;
+ rw_enter(&zvol_state_lock, RW_READER);
- ASSERT(zv && zv->zv_open_count > 0);
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ set_capacity(zv->zv_disk, zv->zv_volsize >> SECTOR_BITS);
+ mutex_exit(&zv->zv_state_lock);
+ }
- zv->zv_changed = 0;
- set_capacity(zv->zv_disk, zv->zv_volsize >> 9);
+ rw_exit(&zvol_state_lock);
return (0);
}
@@ -1552,7 +1567,7 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
zvol_state_t *zv = bdev->bd_disk->private_data;
sector_t sectors;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
sectors = get_capacity(zv->zv_disk);
@@ -1585,68 +1600,20 @@ zvol_probe(dev_t dev, int *part, void *arg)
return (kobj);
}
-#ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS
static struct block_device_operations zvol_ops = {
.open = zvol_open,
.release = zvol_release,
.ioctl = zvol_ioctl,
.compat_ioctl = zvol_compat_ioctl,
- .media_changed = zvol_media_changed,
- .revalidate_disk = zvol_revalidate_disk,
- .getgeo = zvol_getgeo,
- .owner = THIS_MODULE,
-};
-
-#else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
-
-static int
-zvol_open_by_inode(struct inode *inode, struct file *file)
-{
- return (zvol_open(inode->i_bdev, file->f_mode));
-}
-
-static int
-zvol_release_by_inode(struct inode *inode, struct file *file)
-{
- return (zvol_release(inode->i_bdev->bd_disk, file->f_mode));
-}
-
-static int
-zvol_ioctl_by_inode(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- if (file == NULL || inode == NULL)
- return (SET_ERROR(-EINVAL));
-
- return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg));
-}
-
-#ifdef CONFIG_COMPAT
-static long
-zvol_compat_ioctl_by_inode(struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- if (file == NULL)
- return (SET_ERROR(-EINVAL));
-
- return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev,
- file->f_mode, cmd, arg));
-}
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ .check_events = zvol_check_events,
#else
-#define zvol_compat_ioctl_by_inode NULL
-#endif
-
-static struct block_device_operations zvol_ops = {
- .open = zvol_open_by_inode,
- .release = zvol_release_by_inode,
- .ioctl = zvol_ioctl_by_inode,
- .compat_ioctl = zvol_compat_ioctl_by_inode,
.media_changed = zvol_media_changed,
+#endif
.revalidate_disk = zvol_revalidate_disk,
.getgeo = zvol_getgeo,
.owner = THIS_MODULE,
};
-#endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */
/*
* Allocate memory for a new zvol_state_t and setup the required
@@ -1699,6 +1666,10 @@ zvol_alloc(dev_t dev, const char *name)
rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
zv->zv_disk->major = zvol_major;
+#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
+ zv->zv_disk->events = DISK_EVENT_MEDIA_CHANGE;
+#endif
+
if (volmode == ZFS_VOLMODE_DEV) {
/*
* ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set
@@ -1743,7 +1714,6 @@ zvol_free(void *arg)
{
zvol_state_t *zv = arg;
- ASSERT(!MUTEX_HELD(&zvol_state_lock));
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
ASSERT(zv->zv_open_count == 0);
@@ -1870,9 +1840,9 @@ out_doi:
kmem_free(doi, sizeof (dmu_object_info_t));
if (error == 0) {
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
add_disk(zv->zv_disk);
} else {
ida_simple_remove(&zvol_ida, idx);
@@ -1889,7 +1859,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
int readonly = get_disk_ro(zv->zv_disk);
- ASSERT(MUTEX_HELD(&zvol_state_lock));
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
@@ -2129,7 +2099,7 @@ zvol_remove_minors_impl(const char *name)
list_create(&free_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
@@ -2154,15 +2124,15 @@ zvol_remove_minors_impl(const char *name)
zvol_remove(zv);
/*
- * clear this while holding zvol_state_lock so
- * zvol_open won't open it
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
*/
zv->zv_disk->private_data = NULL;
/* Drop zv_state_lock before zvol_free() */
mutex_exit(&zv->zv_state_lock);
- /* try parallel zv_free, if failed do it in place */
+ /* Try parallel zv_free, if failed do it in place */
t = taskq_dispatch(system_taskq, zvol_free, zv,
TQ_SLEEP);
if (t == TASKQID_INVALID)
@@ -2173,11 +2143,9 @@ zvol_remove_minors_impl(const char *name)
mutex_exit(&zv->zv_state_lock);
}
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
- /*
- * Drop zvol_state_lock before calling zvol_free()
- */
+ /* Drop zvol_state_lock before calling zvol_free() */
while ((zv = list_head(&free_list)) != NULL) {
list_remove(&free_list, zv);
zvol_free(zv);
@@ -2196,7 +2164,7 @@ zvol_remove_minor_impl(const char *name)
if (zvol_inhibit_dev)
return;
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_WRITER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
@@ -2216,7 +2184,10 @@ zvol_remove_minor_impl(const char *name)
}
zvol_remove(zv);
- /* clear this so zvol_open won't open it */
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
zv->zv_disk->private_data = NULL;
mutex_exit(&zv->zv_state_lock);
@@ -2227,7 +2198,7 @@ zvol_remove_minor_impl(const char *name)
}
/* Drop zvol_state_lock before calling zvol_free() */
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
if (zv != NULL)
zvol_free(zv);
@@ -2248,7 +2219,7 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
oldnamelen = strlen(oldname);
newnamelen = strlen(newname);
- mutex_enter(&zvol_state_lock);
+ rw_enter(&zvol_state_lock, RW_READER);
for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
zv_next = list_next(&zvol_state_list, zv);
@@ -2276,7 +2247,7 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
mutex_exit(&zv->zv_state_lock);
}
- mutex_exit(&zvol_state_lock);
+ rw_exit(&zvol_state_lock);
}
typedef struct zvol_snapdev_cb_arg {
@@ -2653,7 +2624,7 @@ zvol_init(void)
list_create(&zvol_state_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
- mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
ida_init(&zvol_ida);
zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
@@ -2690,7 +2661,7 @@ out_taskq:
taskq_destroy(zvol_taskq);
out:
ida_destroy(&zvol_ida);
- mutex_destroy(&zvol_state_lock);
+ rw_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list);
return (SET_ERROR(error));
@@ -2707,7 +2678,7 @@ zvol_fini(void)
taskq_destroy(zvol_taskq);
list_destroy(&zvol_state_list);
- mutex_destroy(&zvol_state_lock);
+ rw_destroy(&zvol_state_lock);
ida_destroy(&zvol_ida);
}

View File

@ -0,0 +1,368 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Prakash Surya <prakash.surya@delphix.com>
Date: Mon, 8 Jan 2018 13:45:53 -0800
Subject: [PATCH] OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
include/sys/dmu.h | 15 +++++++------
include/sys/dmu_tx.h | 8 +++----
module/zfs/dmu_tx.c | 57 ++++++++++++++++++++++++++------------------------
module/zfs/zfs_vnops.c | 21 ++++++++++---------
module/zfs/zil.c | 10 ++++++++-
5 files changed, 63 insertions(+), 48 deletions(-)
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 755a9056..5b355afb 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -227,11 +227,14 @@ typedef enum dmu_object_type {
DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
} dmu_object_type_t;
-typedef enum txg_how {
- TXG_WAIT = 1,
- TXG_NOWAIT,
- TXG_WAITED,
-} txg_how_t;
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define TXG_NOWAIT (0ULL)
+#define TXG_WAIT (1ULL<<0)
+#define TXG_NOTHROTTLE (1ULL<<1)
void byteswap_uint64_array(void *buf, size_t size);
void byteswap_uint32_array(void *buf, size_t size);
@@ -694,7 +697,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_mark_netfree(dmu_tx_t *tx);
diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h
index d82a7931..74b7e111 100644
--- a/include/sys/dmu_tx.h
+++ b/include/sys/dmu_tx.h
@@ -67,9 +67,6 @@ struct dmu_tx {
/* placeholder for syncing context, doesn't need specific holds */
boolean_t tx_anyobj;
- /* has this transaction already been delayed? */
- boolean_t tx_waited;
-
/* transaction is marked as being a "net free" of space */
boolean_t tx_netfree;
@@ -79,6 +76,9 @@ struct dmu_tx {
/* need to wait for sufficient dirty space */
boolean_t tx_wait_dirty;
+ /* has this transaction already been delayed? */
+ boolean_t tx_dirty_delayed;
+
int tx_err;
};
@@ -138,7 +138,7 @@ extern dmu_tx_stats_t dmu_tx_stats;
* These routines are defined in dmu.h, and are called by the user.
*/
dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_abort(dmu_tx_t *tx);
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index c3cc03a6..6ebff267 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -854,7 +854,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
* decreasing performance.
*/
static int
-dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
spa_t *spa = tx->tx_pool->dp_spa;
@@ -878,13 +878,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
* of the failuremode setting.
*/
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
- txg_how != TXG_WAIT)
+ !(txg_how & TXG_WAIT))
return (SET_ERROR(EIO));
return (SET_ERROR(ERESTART));
}
- if (!tx->tx_waited &&
+ if (!tx->tx_dirty_delayed &&
dsl_pool_need_dirty_delay(tx->tx_pool)) {
tx->tx_wait_dirty = B_TRUE;
DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
@@ -976,41 +976,44 @@ dmu_tx_unassign(dmu_tx_t *tx)
}
/*
- * Assign tx to a transaction group. txg_how can be one of:
+ * Assign tx to a transaction group; txg_how is a bitmask:
*
- * (1) TXG_WAIT. If the current open txg is full, waits until there's
- * a new one. This should be used when you're not holding locks.
- * It will only fail if we're truly out of space (or over quota).
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
*
- * (2) TXG_NOWAIT. If we can't assign into the current open txg without
- * blocking, returns immediately with ERESTART. This should be used
- * whenever you're holding locks. On an ERESTART error, the caller
- * should drop locks, do a dmu_tx_wait(tx), and try again.
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
*
- * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
- * has already been called on behalf of this operation (though
- * most likely on a different tx).
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
*/
int
-dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
int err;
ASSERT(tx->tx_txg == 0);
- ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
- txg_how == TXG_WAITED);
+ ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
- if (txg_how == TXG_WAITED)
- tx->tx_waited = B_TRUE;
-
/* If we might wait, we must not hold the config lock. */
- ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+
+ if ((txg_how & TXG_NOTHROTTLE))
+ tx->tx_dirty_delayed = B_TRUE;
while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
dmu_tx_unassign(tx);
- if (err != ERESTART || txg_how != TXG_WAIT)
+ if (err != ERESTART || !(txg_how & TXG_WAIT))
return (err);
dmu_tx_wait(tx);
@@ -1054,12 +1057,12 @@ dmu_tx_wait(dmu_tx_t *tx)
tx->tx_wait_dirty = B_FALSE;
/*
- * Note: setting tx_waited only has effect if the caller
- * used TX_WAIT. Otherwise they are going to destroy
- * this tx and try again. The common case, zfs_write(),
- * uses TX_WAIT.
+ * Note: setting tx_dirty_delayed only has effect if the
+ * caller used TX_WAIT. Otherwise they are going to
+ * destroy this tx and try again. The common case,
+ * zfs_write(), uses TX_WAIT.
*/
- tx->tx_waited = B_TRUE;
+ tx->tx_dirty_delayed = B_TRUE;
} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
/*
* If the pool is suspended we need to wait until it
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 34ea751c..4805f897 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -129,7 +129,7 @@
*
* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
- * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
* to indicate that this operation has already called dmu_tx_wait().
* This will ensure that we don't retry forever, waiting a short bit
* each time.
@@ -154,7 +154,7 @@
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
* if (error) {
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
@@ -1427,7 +1427,8 @@ top:
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, acl_ids.z_aclp->z_acl_bytes);
}
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx,
+ (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
@@ -1602,7 +1603,7 @@ top:
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, acl_ids.z_aclp->z_acl_bytes);
}
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
if (error == ERESTART) {
waited = B_TRUE;
@@ -1775,7 +1776,7 @@ top:
*/
dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
@@ -2017,7 +2018,7 @@ top:
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
@@ -2156,7 +2157,7 @@ top:
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
dmu_tx_mark_netfree(tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
rw_exit(&zp->z_parent_lock);
rw_exit(&zp->z_name_lock);
@@ -3623,7 +3624,7 @@ top:
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
if (zl != NULL)
zfs_rename_unlock(&zl);
@@ -3815,7 +3816,7 @@ top:
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
@@ -4041,7 +4042,7 @@ top:
zfs_sa_upgrade_txholds(tx, szp);
zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
zfs_dirent_unlock(dl);
if (error == ERESTART) {
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 645b1d4d..a2bbdcb9 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1009,7 +1009,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
* to clean up in the event of allocation failure or I/O failure.
*/
tx = dmu_tx_create(zilog->zl_os);
- VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);

View File

@ -0,0 +1,34 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Olaf Faaland <faaland1@llnl.gov>
Date: Fri, 6 Apr 2018 13:29:11 -0700
Subject: [PATCH] Fix divide-by-zero in mmp_delay_update()
vdev_count_leaves() in the denominator may return 0, caught by Coverity.
Introduced by
* 533ea04 Update mmp_delay on sync or skipped, failed write
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #7391
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/mmp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 1ae5f31f..3b74a6b6 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -327,7 +327,7 @@ mmp_delay_update(spa_t *spa, boolean_t write_completed)
*/
if (delay < mts->mmp_delay) {
hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
- vdev_count_leaves(spa);
+ MAX(1, vdev_count_leaves(spa));
mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
min_delay);
}

View File

@ -0,0 +1,867 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Wed, 18 Apr 2018 14:19:50 -0700
Subject: [PATCH] Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused an ENOSPC error when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC on failure.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphaned. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
configure.ac | 1 +
include/sys/zap_leaf.h | 15 ++-
module/zfs/zap.c | 10 +-
module/zfs/zap_leaf.c | 2 +-
module/zfs/zap_micro.c | 47 ++++++-
module/zfs/zfs_dir.c | 29 ++++-
module/zfs/zfs_vnops.c | 74 ++++++++---
tests/runfiles/linux.run | 6 +-
tests/zfs-tests/tests/functional/Makefile.am | 1 +
.../tests/functional/casenorm/Makefile.am | 1 +
.../functional/casenorm/mixed_create_failure.ksh | 136 +++++++++++++++++++++
.../zfs-tests/tests/functional/cp_files/.gitignore | 1 +
.../tests/functional/cp_files/Makefile.am | 13 ++
.../tests/functional/cp_files/cleanup.ksh | 34 ++++++
.../zfs-tests/tests/functional/cp_files/cp_files.c | 58 +++++++++
.../tests/functional/cp_files/cp_files_001_pos.ksh | 74 +++++++++++
.../zfs-tests/tests/functional/cp_files/setup.ksh | 35 ++++++
17 files changed, 500 insertions(+), 37 deletions(-)
create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
create mode 100644 tests/zfs-tests/tests/functional/cp_files/.gitignore
create mode 100644 tests/zfs-tests/tests/functional/cp_files/Makefile.am
create mode 100755 tests/zfs-tests/tests/functional/cp_files/cleanup.ksh
create mode 100644 tests/zfs-tests/tests/functional/cp_files/cp_files.c
create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh
create mode 100755 tests/zfs-tests/tests/functional/cp_files/setup.ksh
diff --git a/configure.ac b/configure.ac
index d9441a0f..3f4925c3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -238,6 +238,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile
tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile
tests/zfs-tests/tests/functional/compression/Makefile
+ tests/zfs-tests/tests/functional/cp_files/Makefile
tests/zfs-tests/tests/functional/ctime/Makefile
tests/zfs-tests/tests/functional/delegate/Makefile
tests/zfs-tests/tests/functional/devices/Makefile
diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h
index e784c596..a3da1036 100644
--- a/include/sys/zap_leaf.h
+++ b/include/sys/zap_leaf.h
@@ -46,10 +46,15 @@ struct zap_stats;
* block size (1<<l->l_bs) - hash entry size (2) * number of hash
* entries - header space (2*chunksize)
*/
-#define ZAP_LEAF_NUMCHUNKS(l) \
- (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
+#define ZAP_LEAF_NUMCHUNKS_BS(bs) \
+ (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
ZAP_LEAF_CHUNKSIZE - 2)
+#define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs)))
+
+#define ZAP_LEAF_NUMCHUNKS_DEF \
+ (ZAP_LEAF_NUMCHUNKS_BS(fzap_default_block_shift))
+
/*
* The amount of space within the chunk available for the array is:
* chunk size - space for type (1) - space for next pointer (2)
@@ -74,8 +79,10 @@ struct zap_stats;
* which is less than block size / CHUNKSIZE (24) / minimum number of
* chunks per entry (3).
*/
-#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
-#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
+#define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5)
+#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
+#define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs)))
+#define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs)))
/*
* The chunks start immediately after the hash table. The end of the
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index ee9962bf..47b4c1ab 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -853,8 +853,16 @@ retry:
} else if (err == EAGAIN) {
err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
- if (err == 0)
+ if (err == 0) {
goto retry;
+ } else if (err == ENOSPC) {
+ /*
+ * If we failed to expand the leaf, then bailout
+ * as there is no point trying
+ * zap_put_leaf_maybe_grow_ptrtbl().
+ */
+ return (err);
+ }
}
out:
diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c
index c342695c..526e4660 100644
--- a/module/zfs/zap_leaf.c
+++ b/module/zfs/zap_leaf.c
@@ -53,7 +53,7 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
((h) >> \
(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
-#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
+#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index 3ebf995c..60e193ef 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -363,6 +363,41 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
return (cd);
}
+/*
+ * Each mzap entry requires at max : 4 chunks
+ * 3 chunks for names + 1 chunk for value.
+ */
+#define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
+ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
+
+/*
+ * Check if the current entry keeps the colliding entries under the fatzap leaf
+ * size.
+ */
+static boolean_t
+mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
+{
+ zap_t *zap = zn->zn_zap;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t mzap_ents = 0;
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_cd = 0;
+
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ mzap_ents++;
+ }
+
+ /* Include the new entry being added */
+ mzap_ents++;
+
+ return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
+}
+
static void
mze_remove(zap_t *zap, mzap_ent_t *mze)
{
@@ -639,16 +674,15 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
dprintf("adding %s=%llu\n",
mze->mze_name, mze->mze_value);
zn = zap_name_alloc(zap, mze->mze_name, 0);
- err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
- tag, tx);
+ /* If we fail here, we would end up losing entries */
+ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+ tag, tx));
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
- if (err)
- break;
}
vmem_free(mzp, sz);
*zapp = zap;
- return (err);
+ return (0);
}
/*
@@ -1191,7 +1225,8 @@ zap_add_impl(zap_t *zap, const char *key,
err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
- strlen(key) >= MZAP_NAME_LEN) {
+ strlen(key) >= MZAP_NAME_LEN ||
+ !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
if (err == 0) {
err = fzap_add(zn, integer_size, num_integers, val,
diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
index 9a8bbccd..6398a1d1 100644
--- a/module/zfs/zfs_dir.c
+++ b/module/zfs/zfs_dir.c
@@ -742,7 +742,11 @@ zfs_dirent(znode_t *zp, uint64_t mode)
}
/*
- * Link zp into dl. Can only fail if zp has been unlinked.
+ * Link zp into dl. Can fail in the following cases :
+ * - if zp has been unlinked.
+ * - if the number of entries with the same hash (aka. colliding entries)
+ * exceed the capacity of a leaf-block of fatzap and splitting of the
+ * leaf-block does not help.
*/
int
zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
@@ -776,6 +780,24 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
NULL, &links, sizeof (links));
}
}
+
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
+ &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+ * The caller of this routine is responsible for failing the transaction
+ * which will rollback the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ drop_nlink(ZTOI(zp));
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
&dzp->z_id, sizeof (dzp->z_id));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
@@ -813,11 +835,6 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
- value = zfs_dirent(zp, zp->z_mode);
- error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
- 8, 1, &value, tx);
- ASSERT(error == 0);
-
return (0);
}
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 4805f897..5a2e55eb 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1427,6 +1427,7 @@ top:
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, acl_ids.z_aclp->z_acl_bytes);
}
+
error = dmu_tx_assign(tx,
(waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
if (error) {
@@ -1444,10 +1445,22 @@ top:
}
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ /*
+ * Since, we failed to add the directory entry for it,
+ * delete the newly created dnode.
+ */
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- (void) zfs_link_create(dl, zp, tx, ZNEW);
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
if (flag & FIGNORECASE)
txtype |= TX_CI;
@@ -2038,13 +2051,18 @@ top:
*/
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
/*
* Now put new name in parent dir.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
*ipp = ZTOI(zp);
@@ -2054,6 +2072,7 @@ top:
zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
acl_ids.z_fuidp, vap);
+out:
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
@@ -2063,10 +2082,14 @@ top:
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
+ if (error != 0) {
+ iput(ZTOI(zp));
+ } else {
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ }
ZFS_EXIT(zfsvfs);
- return (0);
+ return (error);
}
/*
@@ -3684,6 +3707,13 @@ top:
VERIFY3U(zfs_link_destroy(tdl, szp, tx,
ZRENAMING, NULL), ==, 0);
}
+ } else {
+ /*
+ * If we had removed the existing target, subsequent
+ * call to zfs_link_create() to add back the same entry
+ * but, the new dnode (szp) should not fail.
+ */
+ ASSERT(tzp == NULL);
}
}
@@ -3854,14 +3884,18 @@ top:
/*
* Insert the new object into the directory.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
-
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
- zfs_inode_update(dzp);
- zfs_inode_update(zp);
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ }
zfs_acl_ids_free(&acl_ids);
@@ -3869,10 +3903,14 @@ top:
zfs_dirent_unlock(dl);
- *ipp = ZTOI(zp);
+ if (error == 0) {
+ *ipp = ZTOI(zp);
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+ } else {
+ iput(ZTOI(zp));
+ }
ZFS_EXIT(zfsvfs);
return (error);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 272c8c77..379c9f73 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -55,7 +55,7 @@ tags = ['functional', 'cachefile']
# 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete',
# 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete']
[tests/functional/casenorm]
-tests = ['case_all_values', 'norm_all_values']
+tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure']
tags = ['functional', 'casenorm']
[tests/functional/chattr]
@@ -394,6 +394,10 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
'compress_004_pos']
tags = ['functional', 'compression']
+[tests/functional/cp_files]
+tests = ['cp_files_001_pos']
+tags = ['functional', 'cp_files']
+
[tests/functional/ctime]
tests = ['ctime_001_pos' ]
tags = ['functional', 'ctime']
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index cd60324f..ea52205a 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -11,6 +11,7 @@ SUBDIRS = \
cli_root \
cli_user \
compression \
+ cp_files \
ctime \
delegate \
devices \
diff --git a/tests/zfs-tests/tests/functional/casenorm/Makefile.am b/tests/zfs-tests/tests/functional/casenorm/Makefile.am
index 65dd156e..b284a256 100644
--- a/tests/zfs-tests/tests/functional/casenorm/Makefile.am
+++ b/tests/zfs-tests/tests/functional/casenorm/Makefile.am
@@ -7,6 +7,7 @@ dist_pkgdata_SCRIPTS = \
insensitive_formd_lookup.ksh \
insensitive_none_delete.ksh \
insensitive_none_lookup.ksh \
+ mixed_create_failure.ksh \
mixed_formd_delete.ksh \
mixed_formd_lookup_ci.ksh \
mixed_formd_lookup.ksh \
diff --git a/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
new file mode 100755
index 00000000..51b5bb3f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
@@ -0,0 +1,136 @@
+#!/bin/ksh -p
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+#
+# Copyright 2018 Nutanix Inc. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/casenorm/casenorm.kshlib
+
+# DESCRIPTION:
+# For the filesystem with casesensitivity=mixed, normalization=none,
+# when multiple files with the same name (differing only in case) are created,
+# the number of files is limited to what can fit in a fatzap leaf-block.
+# And beyond that, it fails with ENOSPC.
+#
+# Ensure that the create/rename operations fail gracefully and not trigger an
+# ASSERT.
+#
+# STRATEGY:
+# Repeat the below steps for objects: files, directories, symlinks and hardlinks
+# 1. Create objects with same name but varying in case.
+# E.g. 'abcdefghijklmnop', 'Abcdefghijklmnop', 'ABcdefghijklmnop' etc.
+# The create should fail with ENOSPC.
+# 2. Create an object with name 'tmp_obj' and try to rename it to name that we
+# failed to add in step 1 above.
+# This should fail as well.
+
+verify_runnable "global"
+
+function cleanup
+{
+ destroy_testfs
+}
+
+log_onexit cleanup
+log_assert "With mixed mode: ensure create fails with ENOSPC beyond a certain limit"
+
+create_testfs "-o casesensitivity=mixed -o normalization=none"
+
+# Different object types
+obj_type=('file' 'dir' 'symlink' 'hardlink')
+
+# Commands to create different object types
+typeset -A ops
+ops['file']='touch'
+ops['dir']='mkdir'
+ops['symlink']='ln -s'
+ops['hardlink']='ln'
+
+# This function tests the following for a given object type :
+# - Create multiple objects with the same name (varying only in case).
+# Ensure that it eventually fails once the leaf-block limit is exceeded.
+# - Create another object with a different name. And attempt rename it to the
+# name (for which the create had failed in the previous step).
+# This should fail as well.
+# Args :
+# $1 - object type (file/dir/symlink/hardlink)
+# $2 - test directory
+#
+function test_ops
+{
+ typeset obj_type=$1
+ typeset testdir=$2
+
+ target_obj='target-file'
+
+ op="${ops[$obj_type]}"
+
+ log_note "The op : $op"
+ log_note "testdir=$testdir obj_type=$obj_type"
+
+ test_path="$testdir/$obj_type"
+ mkdir $test_path
+ log_note "Created test dir $test_path"
+
+ if [[ $obj_type = "symlink" || $obj_type = "hardlink" ]]; then
+ touch $test_path/$target_obj
+ log_note "Created target: $test_path/$target_obj"
+ op="$op $test_path/$target_obj"
+ fi
+
+ log_note "op : $op"
+ names='{a,A}{b,B}{c,C}{d,D}{e,E}{f,F}{g,G}{h,H}{i,I}{j,J}{k,K}{l,L}'
+ for name in $names; do
+ cmd="$op $test_path/$name"
+ out=$($cmd 2>&1)
+ ret=$?
+ log_note "cmd: $cmd ret: $ret out=$out"
+ if (($ret != 0)); then
+ if [[ $out = *@(No space left on device)* ]]; then
+ save_name="$test_path/$name"
+ break;
+ else
+ log_err "$cmd failed with unexpected error : $out"
+ fi
+ fi
+ done
+
+ log_note 'Test rename \"sample_name\" rename'
+ TMP_OBJ="$test_path/tmp_obj"
+ cmd="$op $TMP_OBJ"
+ out=$($cmd 2>&1)
+ ret=$?
+ if (($ret != 0)); then
+ log_err "cmd:$cmd failed out:$out"
+ fi
+
+ # Now, try to rename the tmp_obj to the name which we failed to add earlier.
+ # This should fail as well.
+ out=$(mv $TMP_OBJ $save_name 2>&1)
+ ret=$?
+ if (($ret != 0)); then
+ if [[ $out = *@(No space left on device)* ]]; then
+ log_note "$cmd failed as expected : $out"
+ else
+ log_err "$cmd failed with : $out"
+ fi
+ fi
+}
+
+for obj_type in ${obj_type[*]};
+do
+ log_note "Testing create of $obj_type"
+ test_ops $obj_type $TESTDIR
+done
+
+log_pass "Mixed mode FS: Ops on large number of colliding names fail gracefully"
diff --git a/tests/zfs-tests/tests/functional/cp_files/.gitignore b/tests/zfs-tests/tests/functional/cp_files/.gitignore
new file mode 100644
index 00000000..eac05e15
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/.gitignore
@@ -0,0 +1 @@
+/cp_files
diff --git a/tests/zfs-tests/tests/functional/cp_files/Makefile.am b/tests/zfs-tests/tests/functional/cp_files/Makefile.am
new file mode 100644
index 00000000..06c31f5f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/Makefile.am
@@ -0,0 +1,13 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cp_files
+
+dist_pkgdata_SCRIPTS = \
+ cp_files_001_pos.ksh \
+ cleanup.ksh \
+ setup.ksh
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cp_files
+
+pkgexec_PROGRAMS = cp_files
+cp_files_SOURCES= cp_files.c
diff --git a/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh
new file mode 100755
index 00000000..3166bd6e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files.c b/tests/zfs-tests/tests/functional/cp_files/cp_files.c
new file mode 100644
index 00000000..9af64a11
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/cp_files.c
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <string.h>
+
+int
+main(int argc, char *argv[])
+{
+ int tfd;
+ DIR *sdir;
+ struct dirent *dirent;
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s SRC DST\n", argv[0]);
+ exit(1);
+ }
+
+ sdir = opendir(argv[1]);
+ if (sdir == NULL) {
+ fprintf(stderr, "Failed to open %s: %s\n",
+ argv[1], strerror(errno));
+ exit(2);
+ }
+
+ tfd = open(argv[2], O_DIRECTORY);
+ if (tfd < 0) {
+ fprintf(stderr, "Failed to open %s: %s\n",
+ argv[2], strerror(errno));
+ closedir(sdir);
+ exit(3);
+ }
+
+ while ((dirent = readdir(sdir)) != NULL) {
+ if (dirent->d_name[0] == '.' &&
+ (dirent->d_name[1] == '.' || dirent->d_name[1] == '\0'))
+ continue;
+
+ int fd = openat(tfd, dirent->d_name, O_CREAT|O_WRONLY, 0666);
+ if (fd < 0) {
+ fprintf(stderr, "Failed to create %s/%s: %s\n",
+ argv[2], dirent->d_name, strerror(errno));
+ closedir(sdir);
+ close(tfd);
+ exit(4);
+ }
+ close(fd);
+ }
+
+ closedir(sdir);
+ close(tfd);
+
+ return (0);
+}
diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh
new file mode 100755
index 00000000..3e138cfc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh
@@ -0,0 +1,74 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Nutanix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Copy a large number of files between 2 directories
+# within a zfs filesystem works without errors.
+# This makes sure zap upgrading and expanding works.
+#
+# STRATEGY:
+#
+# 1. Create NR_FILES files in directory src
+# 2. Check the number of files is correct
+# 3. Copy files from src to dst in readdir order
+# 4. Check the number of files is correct
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ rm -rf $TESTDIR/src $TESTDIR/dst
+}
+
+log_assert "Copy a large number of files between 2 directories" \
+ "within a zfs filesystem works without errors"
+
+log_onexit cleanup
+
+NR_FILES=60000
+BATCH=1000
+
+log_must mkdir $TESTDIR/src
+log_must mkdir $TESTDIR/dst
+
+WD=$(pwd)
+cd $TESTDIR/src
+# create NR_FILES in BATCH at a time to prevent overflowing argument buffer
+for i in $(seq $(($NR_FILES/$BATCH))); do touch $(seq $((($i-1)*$BATCH+1)) $(($i*$BATCH))); done
+cd $WD
+
+log_must test $NR_FILES -eq $(ls -U $TESTDIR/src | wc -l)
+
+# copy files from src to dst, use cp_files to make sure we copy in readdir order
+log_must $STF_SUITE/tests/functional/cp_files/cp_files $TESTDIR/src $TESTDIR/dst
+
+log_must test $NR_FILES -eq $(ls -U $TESTDIR/dst | wc -l)
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/cp_files/setup.ksh b/tests/zfs-tests/tests/functional/cp_files/setup.ksh
new file mode 100755
index 00000000..fc5cec30
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cp_files/setup.ksh
@@ -0,0 +1,35 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+DISK=${DISKS%% *}
+default_setup $DISK

View File

@ -0,0 +1,155 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 5 Sep 2017 13:41:32 -0700
Subject: [PATCH] Trim new line from zfs_vdev_scheduler
Add a helper function to trim the trailing newline. While we're
here use this new hook to immediately apply the new scheduler.
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3356
Closes #6573
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/vdev_disk.c | 71 +++++++++++++++++++++++++++++++++++++-------------
1 file changed, 53 insertions(+), 18 deletions(-)
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 5ae50a31..d6212835 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -27,13 +27,14 @@
*/
#include <sys/zfs_context.h>
-#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
+#include <linux/mod_compat.h>
char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;
@@ -113,15 +114,23 @@ vdev_disk_error(zio_t *zio)
* physical device. This yields the largest possible requests for
* the device with the lowest total overhead.
*/
-static int
+static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
vdev_disk_t *vd = v->vdev_tsd;
- struct block_device *bdev = vd->vd_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- char *device = bdev->bd_disk->disk_name;
+ struct request_queue *q;
+ char *device;
int error;
+ for (int c = 0; c < v->vdev_children; c++)
+ vdev_elevator_switch(v->vdev_child[c], elevator);
+
+ if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
+ return;
+
+ q = bdev_get_queue(vd->vd_bdev);
+ device = vd->vd_bdev->bd_disk->disk_name;
+
/*
* Skip devices which are not whole disks (partitions).
* Device-mapper devices are excepted since they may be whole
@@ -131,15 +140,15 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
* "Skip devices without schedulers" check below will fail.
*/
if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
- return (0);
+ return;
/* Skip devices without schedulers (loop, ram, dm, etc) */
if (!q->elevator || !blk_queue_stackable(q))
- return (0);
+ return;
/* Leave existing scheduler when set to "none" */
if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
- return (0);
+ return;
#ifdef HAVE_ELEVATOR_CHANGE
error = elevator_change(q, elevator);
@@ -156,20 +165,16 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
" 2>/dev/null; " \
"echo %s"
- {
- char *argv[] = { "/bin/sh", "-c", NULL, NULL };
- char *envp[] = { NULL };
+ char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+ char *envp[] = { NULL };
- argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
- error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- strfree(argv[2]);
- }
+ argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
if (error)
printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
elevator, v->vdev_path, device, error);
-
- return (error);
}
/*
@@ -798,6 +803,35 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
+static int
+param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+{
+ spa_t *spa = NULL;
+ char *p;
+
+ if (val == NULL)
+ return (SET_ERROR(-EINVAL));
+
+ if ((p = strchr(val, '\n')) != NULL)
+ *p = '\0';
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
+ continue;
+
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ return (param_set_charp(val, kp));
+}
+
vdev_ops_t vdev_disk_ops = {
vdev_disk_open,
vdev_disk_close,
@@ -812,5 +846,6 @@ vdev_ops_t vdev_disk_ops = {
B_TRUE /* leaf vdev */
};
-module_param(zfs_vdev_scheduler, charp, 0644);
+module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
+ param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

View File

@ -0,0 +1,84 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Olaf Faaland <faaland1@llnl.gov>
Date: Fri, 11 May 2018 12:46:07 -0700
Subject: [PATCH] module param callbacks check for initialized spa
Callbacks provided for module parameters are executed both
after the module is loaded, when a user alters it via sysfs, e.g.
echo bar > /sys/module/zfs/parameters/foo
as well as when the module is loaded with an argument, e.g.
modprobe zfs foo=bar
In the latter case, the init functions likely have not run yet,
including spa_init() which initializes the namespace lock so it is safe
to use.
Instead of immediately taking the namespace lock and attempting to
iterate over initialized spa structures, check whether spa_mode_global
is nonzero. This is set by spa_init() after it has initialized the
namespace lock.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #7496
Closes #7521
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/mmp.c | 3 ++-
module/zfs/vdev_disk.c | 24 +++++++++++++-----------
2 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 3b74a6b6..7523310c 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -607,7 +607,8 @@ param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
if (ret < 0)
return (ret);
- mmp_signal_all_threads();
+ if (spa_mode_global != 0)
+ mmp_signal_all_threads();
return (ret);
}
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index d6212835..6761e755 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -815,19 +815,21 @@ param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
if ((p = strchr(val, '\n')) != NULL)
*p = '\0';
- mutex_enter(&spa_namespace_lock);
- while ((spa = spa_next(spa)) != NULL) {
- if (spa_state(spa) != POOL_STATE_ACTIVE ||
- !spa_writeable(spa) || spa_suspended(spa))
- continue;
-
- spa_open_ref(spa, FTAG);
- mutex_exit(&spa_namespace_lock);
- vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
+ if (spa_mode_global != 0) {
mutex_enter(&spa_namespace_lock);
- spa_close(spa, FTAG);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
+ continue;
+
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
}
- mutex_exit(&spa_namespace_lock);
return (param_set_charp(val, kp));
}

View File

@ -0,0 +1,52 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Antonio Russo <antonio.e.russo@gmail.com>
Date: Sat, 26 May 2018 13:56:24 -0400
Subject: [PATCH] Support Debian DKMS builds
scripts/dkms.mkconf calls configure with
`--with-linux=${kernel_source_dir}`, but Debian puts its kernel source at
`/lib/modules/<version>/source`. This patch adds the same logic to the
DKMS file produced by `scripts/dkms.mkconf` that Debian has shipped in
its official ZFS packaging: at DKMS build time, it checks if the system
is a Debian system, and adjusts the path accordingly.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Antonio Russo <antonio.e.russo@gmail.com>
Closes #7358
Closes #7540
Closes #7554
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
scripts/dkms.mkconf | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/scripts/dkms.mkconf b/scripts/dkms.mkconf
index 880510ab..88c28938 100755
--- a/scripts/dkms.mkconf
+++ b/scripts/dkms.mkconf
@@ -25,7 +25,22 @@ PACKAGE_CONFIG="${pkgcfg}"
PRE_BUILD="configure
--prefix=/usr
--with-config=kernel
- --with-linux=\${kernel_source_dir}
+ --with-linux=\$(
+ case \`lsb_release -is\` in
+ (Debian|Devuan)
+ if [[ -e \${kernel_source_dir/%build/source} ]]
+ then
+ echo \${kernel_source_dir/%build/source}
+ else
+ # A kpkg exception for Proxmox 2.0
+ echo \${kernel_source_dir}
+ fi
+ ;;
+ (*)
+ echo \${kernel_source_dir}
+ ;;
+ esac
+ )
--with-linux-obj=\${kernel_source_dir}
--with-spl=\${source_tree}/spl-\${PACKAGE_VERSION}
--with-spl-obj=\${dkms_tree}/spl/\${PACKAGE_VERSION}/\${kernelver}/\${arch}

View File

@ -0,0 +1,376 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Sara Hartse <sara.hartse@gmail.com>
Date: Thu, 31 May 2018 10:36:37 -0700
Subject: [PATCH] zpool reopen should detect expanded devices
Update bdev_capacity to have wholedisk vdevs query the
size of the underlying block device (correcting for the size
of the efi parition and partition alignment) and therefore detect
expanded space.
Correct vdev_get_stats_ex so that the expandsize is aligned
to metaslab size and new space is only reported if it is large
enough for a new metaslab.
Reviewed by: Don Brady <don.brady@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: John Wren Kennedy <jwk404@gmail.com>
Signed-off-by: sara hartse <sara.hartse@delphix.com>
External-issue: LX-165
Closes #7546
Issue #7582
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
include/sys/vdev_disk.h | 12 +++++
lib/libefi/rdwr_efi.c | 20 +++++++-
lib/libzfs/libzfs_pool.c | 14 +-----
module/zfs/vdev.c | 3 +-
module/zfs/vdev_disk.c | 46 +++++++++++++-----
.../cli_root/zpool_expand/zpool_expand_002_pos.ksh | 54 +++++++++++++++-------
6 files changed, 107 insertions(+), 42 deletions(-)
diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
index 15570b10..b8a32b31 100644
--- a/include/sys/vdev_disk.h
+++ b/include/sys/vdev_disk.h
@@ -23,11 +23,23 @@
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
+/*
+ * Don't start the slice at the default block of 34; many storage
+ * devices will use a stripe width of 128k, other vendors prefer a 1m
+ * alignment. It is best to play it safe and ensure a 1m alignment
+ * given 512B blocks. When the block size is larger by a power of 2
+ * we will still be 1m aligned. Some devices are sensitive to the
+ * partition ending alignment as well.
+ */
+#define NEW_START_BLOCK 2048
+#define PARTITION_END_ALIGNMENT 2048
+
#ifdef _KERNEL
#include <sys/vdev.h>
diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c
index 7935047e..19cb17e5 100644
--- a/lib/libefi/rdwr_efi.c
+++ b/lib/libefi/rdwr_efi.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
*/
#include <stdio.h>
@@ -1153,7 +1154,7 @@ efi_use_whole_disk(int fd)
/*
* Find the last physically non-zero partition.
- * This is the reserved partition.
+ * This should be the reserved partition.
*/
for (i = 0; i < efi_label->efi_nparts; i ++) {
if (resv_start < efi_label->efi_parts[i].p_start) {
@@ -1163,6 +1164,23 @@ efi_use_whole_disk(int fd)
}
/*
+ * Verify that we've found the reserved partition by checking
+ * that it looks the way it did when we created it in zpool_label_disk.
+ * If we've found the incorrect partition, then we know that this
+ * device was reformatted and is no longer solely used by ZFS.
+ */
+ if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) ||
+ (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) ||
+ (resv_index != 8)) {
+ if (efi_debug) {
+ (void) fprintf(stderr,
+ "efi_use_whole_disk: wholedisk not available\n");
+ }
+ efi_free(efi_label);
+ return (VT_ENOSPC);
+ }
+
+ /*
* Find the last physically non-zero partition before that.
* This is the data partition.
*/
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index e00d5f51..53bc5034 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -22,7 +22,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2017 Datto Inc.
*/
@@ -42,6 +42,7 @@
#include <sys/efi_partition.h>
#include <sys/vtoc.h>
#include <sys/zfs_ioctl.h>
+#include <sys/vdev_disk.h>
#include <dlfcn.h>
#include "zfs_namecheck.h"
@@ -913,17 +914,6 @@ zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
}
/*
- * Don't start the slice at the default block of 34; many storage
- * devices will use a stripe width of 128k, other vendors prefer a 1m
- * alignment. It is best to play it safe and ensure a 1m alignment
- * given 512B blocks. When the block size is larger by a power of 2
- * we will still be 1m aligned. Some devices are sensitive to the
- * partition ending alignment as well.
- */
-#define NEW_START_BLOCK 2048
-#define PARTITION_END_ALIGNMENT 2048
-
-/*
* Validate the given pool name, optionally putting an extended error message in
* 'buf'.
*/
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index acac2a97..b643bd35 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
@@ -3039,7 +3039,6 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vd->vdev_max_asize - vd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
- vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
!vd->vdev_ishole) {
vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 6761e755..6dc0544f 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -23,7 +23,7 @@
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -35,10 +35,14 @@
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <linux/mod_compat.h>
+#include <linux/msdos_fs.h>
char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;
+/* size of the "reserved" partition, in blocks */
+#define EFI_MIN_RESV_SIZE (16 * 1024)
+
/*
* Virtual device vector for disks.
*/
@@ -82,17 +86,39 @@ vdev_bdev_mode(int smode)
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
+/* The capacity (in bytes) of a bdev that is available to be used by a vdev */
static uint64_t
-bdev_capacity(struct block_device *bdev)
+bdev_capacity(struct block_device *bdev, boolean_t wholedisk)
{
struct hd_struct *part = bdev->bd_part;
+ uint64_t sectors = get_capacity(bdev->bd_disk);
+ /* If there are no partitions, return the entire device capacity */
+ if (part == NULL)
+ return (sectors << SECTOR_BITS);
- /* The partition capacity referenced by the block device */
- if (part)
- return (part->nr_sects << 9);
-
- /* Otherwise assume the full device capacity */
- return (get_capacity(bdev->bd_disk) << 9);
+ /*
+ * If there are partitions, decide if we are using a `wholedisk`
+ * layout (composed of part1 and part9) or just a single partition.
+ */
+ if (wholedisk) {
+ /* Verify the expected device layout */
+ ASSERT3P(bdev, !=, bdev->bd_contains);
+ /*
+ * Sectors used by the EFI partition (part9) as well as
+ * partition alignment.
+ */
+ uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+ PARTITION_END_ALIGNMENT;
+
+ /* Space available to the vdev, i.e. the size of part1 */
+ if (sectors <= used)
+ return (0);
+ uint64_t available = sectors - used;
+ return (available << SECTOR_BITS);
+ } else {
+ /* The partition capacity referenced by the block device */
+ return (part->nr_sects << SECTOR_BITS);
+ }
}
static void
@@ -328,9 +354,7 @@ skip_open:
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));
/* Physical volume size in bytes */
- *psize = bdev_capacity(vd->vd_bdev);
-
- /* TODO: report possible expansion size */
+ *psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk);
*max_psize = *psize;
/* Based on the minimum sector size set the block size */
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
index d578ae60..66b6969d 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh
@@ -26,7 +26,7 @@
#
#
-# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
#
@@ -43,8 +43,9 @@
# 1) Create 3 files
# 2) Create a pool backed by the files
# 3) Expand the files' size with truncate
-# 4) Use zpool online -e to online the vdevs
-# 5) Check that the pool size was expanded
+# 4) Use zpool reopen to check the expandsize
+# 5) Use zpool online -e to online the vdevs
+# 6) Check that the pool size was expanded
#
verify_runnable "global"
@@ -64,8 +65,8 @@ log_onexit cleanup
log_assert "zpool can expand after zpool online -e zvol vdevs on LUN expansion"
-
for type in " " mirror raidz raidz2; do
+ # Initialize the file devices and the pool
for i in 1 2 3; do
log_must truncate -s $org_size ${TEMPFILE}.$i
done
@@ -80,13 +81,35 @@ for type in " " mirror raidz raidz2; do
"$autoexp"
fi
typeset prev_size=$(get_pool_prop size $TESTPOOL1)
- typeset zfs_prev_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
- awk '{print $3}')
+ typeset zfs_prev_size=$(get_prop avail $TESTPOOL1)
+ # Increase the size of the file devices
for i in 1 2 3; do
log_must truncate -s $exp_size ${TEMPFILE}.$i
done
+ # Reopen the pool and check that the `expandsize` property is set
+ log_must zpool reopen $TESTPOOL1
+ typeset zpool_expandsize=$(get_pool_prop expandsize $TESTPOOL1)
+
+ if [[ $type == "mirror" ]]; then
+ typeset expected_zpool_expandsize=$(($exp_size-$org_size))
+ else
+ typeset expected_zpool_expandsize=$((3*($exp_size-$org_size)))
+ fi
+
+ if [[ "$zpool_expandsize" = "-" ]]; then
+ log_fail "pool $TESTPOOL1 did not detect any " \
+ "expandsize after reopen"
+ fi
+
+ if [[ $zpool_expandsize -ne $expected_zpool_expandsize ]]; then
+ log_fail "pool $TESTPOOL1 did not detect correct " \
+ "expandsize after reopen: found $zpool_expandsize," \
+ "expected $expected_zpool_expandsize"
+ fi
+
+ # Online the devices to add the new space to the pool
for i in 1 2 3; do
log_must zpool online -e $TESTPOOL1 ${TEMPFILE}.$i
done
@@ -96,8 +119,7 @@ for type in " " mirror raidz raidz2; do
sync
typeset expand_size=$(get_pool_prop size $TESTPOOL1)
- typeset zfs_expand_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \
- awk '{print $3}')
+ typeset zfs_expand_size=$(get_prop avail $TESTPOOL1)
log_note "$TESTPOOL1 $type has previous size: $prev_size and " \
"expanded size: $expand_size"
@@ -112,8 +134,8 @@ for type in " " mirror raidz raidz2; do
grep "(+${expansion_size}" | wc -l)
if [[ $size_addition -ne $i ]]; then
- log_fail "pool $TESTPOOL1 is not autoexpand " \
- "after LUN expansion"
+ log_fail "pool $TESTPOOL1 did not expand " \
+ "after LUN expansion and zpool online -e"
fi
elif [[ $type == "mirror" ]]; then
typeset expansion_size=$(($exp_size-$org_size))
@@ -123,8 +145,8 @@ for type in " " mirror raidz raidz2; do
grep "(+${expansion_size})" >/dev/null 2>&1
if [[ $? -ne 0 ]]; then
- log_fail "pool $TESTPOOL1 is not autoexpand " \
- "after LUN expansion"
+ log_fail "pool $TESTPOOL1 did not expand " \
+ "after LUN expansion and zpool online -e"
fi
else
typeset expansion_size=$((3*($exp_size-$org_size)))
@@ -134,13 +156,13 @@ for type in " " mirror raidz raidz2; do
grep "(+${expansion_size})" >/dev/null 2>&1
if [[ $? -ne 0 ]] ; then
- log_fail "pool $TESTPOOL1 is not autoexpand " \
- "after LUN expansion"
+ log_fail "pool $TESTPOOL1 did not expand " \
+ "after LUN expansion and zpool online -e"
fi
fi
else
- log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \
- "expansion"
+ log_fail "pool $TESTPOOL1 did not expand after LUN expansion " \
+ "and zpool online -e"
fi
log_must zpool destroy $TESTPOOL1
done

View File

@ -0,0 +1,686 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Wed, 6 Jun 2018 09:33:54 -0700
Subject: [PATCH] Add pool state /proc entry, "SUSPENDED" pools
1. Add a proc entry to display the pool's state:
$ cat /proc/spl/kstat/zfs/tank/state
ONLINE
This is done without using the spa config locks, so it will
never hang.
2. Fix 'zpool status' and 'zpool list -o health' output to print
"SUSPENDED" instead of "ONLINE" for suspended pools.
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #7331
Closes #7563
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
cmd/zpool/zpool_main.c | 3 +-
configure.ac | 1 +
include/libzfs.h | 2 +
include/sys/spa.h | 3 +
lib/libspl/include/sys/kstat.h | 2 +
lib/libzfs/libzfs_pool.c | 46 +++++--
lib/libzfs/libzfs_status.c | 12 +-
module/zfs/spa_misc.c | 40 ++++++
module/zfs/spa_stats.c | 62 +++++++++
tests/runfiles/linux.run | 4 +
tests/zfs-tests/include/libtest.shlib | 38 ++++++
tests/zfs-tests/tests/functional/Makefile.am | 1 +
tests/zfs-tests/tests/functional/kstat/Makefile.am | 5 +
tests/zfs-tests/tests/functional/kstat/cleanup.ksh | 28 ++++
tests/zfs-tests/tests/functional/kstat/setup.ksh | 34 +++++
tests/zfs-tests/tests/functional/kstat/state.ksh | 144 +++++++++++++++++++++
16 files changed, 406 insertions(+), 19 deletions(-)
create mode 100644 tests/zfs-tests/tests/functional/kstat/Makefile.am
create mode 100755 tests/zfs-tests/tests/functional/kstat/cleanup.ksh
create mode 100755 tests/zfs-tests/tests/functional/kstat/setup.ksh
create mode 100755 tests/zfs-tests/tests/functional/kstat/state.ksh
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index b0756938..97697011 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -6226,7 +6226,8 @@ status_callback(zpool_handle_t *zhp, void *data)
&nvroot) == 0);
verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &c) == 0);
- health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+
+ health = zpool_get_state_str(zhp);
(void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp));
(void) printf(gettext(" state: %s\n"), health);
diff --git a/configure.ac b/configure.ac
index 3f4925c3..42cfc1a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -253,6 +253,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/history/Makefile
tests/zfs-tests/tests/functional/inheritance/Makefile
tests/zfs-tests/tests/functional/inuse/Makefile
+ tests/zfs-tests/tests/functional/kstat/Makefile
tests/zfs-tests/tests/functional/large_files/Makefile
tests/zfs-tests/tests/functional/largest_pool/Makefile
tests/zfs-tests/tests/functional/link_count/Makefile
diff --git a/include/libzfs.h b/include/libzfs.h
index 945bd5b8..fea2fee4 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -296,6 +296,8 @@ int zfs_dev_is_whole_disk(char *dev_name);
char *zfs_get_underlying_path(char *dev_name);
char *zfs_get_enclosure_sysfs_path(char *dev_name);
+const char *zpool_get_state_str(zpool_handle_t *);
+
/*
* Functions to manage pool properties
*/
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 3b268419..810999c9 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -730,6 +730,7 @@ typedef struct spa_stats {
spa_stats_history_t tx_assign_histogram;
spa_stats_history_t io_history;
spa_stats_history_t mmp_history;
+ spa_stats_history_t state; /* pool state */
} spa_stats_t;
typedef enum txg_state {
@@ -889,6 +890,8 @@ extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
dmu_tx_t *tx, const char *fmt, ...);
+extern const char *spa_state_to_name(spa_t *spa);
+
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, zio_t *zio);
diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
index fcd3ed98..84c3d7ca 100644
--- a/lib/libspl/include/sys/kstat.h
+++ b/lib/libspl/include/sys/kstat.h
@@ -304,6 +304,8 @@ typedef struct kstat32 {
#define KSTAT_FLAG_PERSISTENT 0x08
#define KSTAT_FLAG_DORMANT 0x10
#define KSTAT_FLAG_INVALID 0x20
+#define KSTAT_FLAG_LONGSTRINGS 0x40
+#define KSTAT_FLAG_NO_HEADERS 0x80
/*
* Dynamic update support
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 53bc5034..315ba954 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -240,6 +240,38 @@ zpool_pool_state_to_name(pool_state_t state)
}
/*
+ * Given a pool handle, return the pool health string ("ONLINE", "DEGRADED",
+ * "SUSPENDED", etc).
+ */
+const char *
+zpool_get_state_str(zpool_handle_t *zhp)
+{
+ zpool_errata_t errata;
+ zpool_status_t status;
+ nvlist_t *nvroot;
+ vdev_stat_t *vs;
+ uint_t vsc;
+ const char *str;
+
+ status = zpool_get_status(zhp, NULL, &errata);
+
+ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+ str = gettext("FAULTED");
+ } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT ||
+ status == ZPOOL_STATUS_IO_FAILURE_MMP) {
+ str = gettext("SUSPENDED");
+ } else {
+ verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ verify(nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
+ str = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+ }
+ return (str);
+}
+
+/*
* Get a zpool property value for 'prop' and return the value in
* a pre-allocated buffer.
*/
@@ -250,9 +282,6 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
uint64_t intval;
const char *strval;
zprop_source_t src = ZPROP_SRC_NONE;
- nvlist_t *nvroot;
- vdev_stat_t *vs;
- uint_t vsc;
if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
switch (prop) {
@@ -261,7 +290,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
break;
case ZPOOL_PROP_HEALTH:
- (void) strlcpy(buf, "FAULTED", len);
+ (void) strlcpy(buf, zpool_get_state_str(zhp), len);
break;
case ZPOOL_PROP_GUID:
@@ -362,14 +391,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
break;
case ZPOOL_PROP_HEALTH:
- verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
- ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- verify(nvlist_lookup_uint64_array(nvroot,
- ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
- == 0);
-
- (void) strlcpy(buf, zpool_state_to_name(intval,
- vs->vs_aux), len);
+ (void) strlcpy(buf, zpool_get_state_str(zhp), len);
break;
case ZPOOL_PROP_VERSION:
if (intval >= SPA_VERSION_FEATURES) {
diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
index 6cdcd382..5e423f3a 100644
--- a/lib/libzfs/libzfs_status.c
+++ b/lib/libzfs/libzfs_status.c
@@ -403,12 +403,12 @@ zpool_status_t
zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata)
{
zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata);
-
- if (ret >= NMSGID)
- *msgid = NULL;
- else
- *msgid = zfs_msgid_table[ret];
-
+ if (msgid != NULL) {
+ if (ret >= NMSGID)
+ *msgid = NULL;
+ else
+ *msgid = zfs_msgid_table[ret];
+ }
return (ret);
}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index e92c3948..cc1c641d 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2100,6 +2100,45 @@ spa_get_hostid(void)
return (myhostid);
}
+/*
+ * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc).
+ */
+const char *
+spa_state_to_name(spa_t *spa)
+{
+ vdev_state_t state = spa->spa_root_vdev->vdev_state;
+ vdev_aux_t aux = spa->spa_root_vdev->vdev_stat.vs_aux;
+
+ if (spa_suspended(spa) &&
+ (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
+ return ("SUSPENDED");
+
+ switch (state) {
+ case VDEV_STATE_CLOSED:
+ case VDEV_STATE_OFFLINE:
+ return ("OFFLINE");
+ case VDEV_STATE_REMOVED:
+ return ("REMOVED");
+ case VDEV_STATE_CANT_OPEN:
+ if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
+ return ("FAULTED");
+ else if (aux == VDEV_AUX_SPLIT_POOL)
+ return ("SPLIT");
+ else
+ return ("UNAVAIL");
+ case VDEV_STATE_FAULTED:
+ return ("FAULTED");
+ case VDEV_STATE_DEGRADED:
+ return ("DEGRADED");
+ case VDEV_STATE_HEALTHY:
+ return ("ONLINE");
+ default:
+ break;
+ }
+
+ return ("UNKNOWN");
+}
+
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Namespace manipulation */
EXPORT_SYMBOL(spa_lookup);
@@ -2178,6 +2217,7 @@ EXPORT_SYMBOL(spa_is_root);
EXPORT_SYMBOL(spa_writeable);
EXPORT_SYMBOL(spa_mode);
EXPORT_SYMBOL(spa_namespace_lock);
+EXPORT_SYMBOL(spa_state_to_name);
/* BEGIN CSTYLED */
module_param(zfs_flags, uint, 0644);
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index 8950d9c5..ca3d0be7 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -22,6 +22,8 @@
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
/*
* Keeps stats on last N reads per spa_t, disabled by default.
@@ -992,6 +994,64 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
return ((void *)smh);
}
+static void *
+spa_state_addr(kstat_t *ksp, loff_t n)
+{
+ return (ksp->ks_private); /* return the spa_t */
+}
+
+static int
+spa_state_data(char *buf, size_t size, void *data)
+{
+ spa_t *spa = (spa_t *)data;
+ (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
+ return (0);
+}
+
+/*
+ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
+ *
+ * This is a lock-less read of the pool's state (unlike using 'zpool', which
+ * can potentially block for seconds). Because it doesn't block, it can be
+ * useful as a pool heartbeat value.
+ */
+static void
+spa_state_init(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.state;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+ ksp = kstat_create(name, 0, "state", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ ssh->kstat = ksp;
+ if (ksp) {
+ ksp->ks_lock = &ssh->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
+ kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
+ kstat_install(ksp);
+ }
+
+ strfree(name);
+}
+
+static void
+spa_health_destroy(spa_t *spa)
+{
+ spa_stats_history_t *ssh = &spa->spa_stats.state;
+ kstat_t *ksp = ssh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&ssh->lock);
+}
+
void
spa_stats_init(spa_t *spa)
{
@@ -1000,11 +1060,13 @@ spa_stats_init(spa_t *spa)
spa_tx_assign_init(spa);
spa_io_history_init(spa);
spa_mmp_history_init(spa);
+ spa_state_init(spa);
}
void
spa_stats_destroy(spa_t *spa)
{
+ spa_health_destroy(spa);
spa_tx_assign_destroy(spa);
spa_txg_history_destroy(spa);
spa_read_history_destroy(spa);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 379c9f73..69e9eb26 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -467,6 +467,10 @@ tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_004_pos',
post =
tags = ['functional', 'inuse']
+[tests/functional/kstat]
+tests = ['state']
+tags = ['functional', 'kstat']
+
[tests/functional/large_files]
tests = ['large_files_001_pos', 'large_files_002_pos']
tags = ['functional', 'large_files']
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 13c85912..86dae6ea 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -26,6 +26,7 @@
# Copyright 2016 Nexenta Systems, Inc.
# Copyright (c) 2017 Lawrence Livermore National Security, LLC.
# Copyright (c) 2017 Datto Inc.
+# Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
#
. ${STF_TOOLS}/include/logapi.shlib
@@ -3718,3 +3719,40 @@ function get_pool_devices #testpool #devdir
fi
echo $out
}
+
+#
+# Get scsi_debug device name.
+# Returns basename of scsi_debug device (for example "sdb").
+#
+function get_debug_device
+{
+ for i in {1..10} ; do
+ val=$(lsscsi | nawk '/scsi_debug/ {print $6; exit}' | cut -d / -f3)
+
+ # lsscsi can take time to settle
+ if [ "$val" != "-" ] ; then
+ break
+ fi
+ sleep 1
+ done
+ echo "$val"
+}
+
+#
+# Returns SCSI host number for the given disk
+#
+function get_scsi_host #disk
+{
+ typeset disk=$1
+ ls /sys/block/${disk}/device/scsi_device | cut -d : -f 1
+}
+
+#
+# Simulate disk removal
+#
+function remove_disk #disk
+{
+ typeset disk=$1
+ on_off_disk $disk "offline"
+ block_device_wait
+}
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index ea52205a..bbbf3ba0 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -24,6 +24,7 @@ SUBDIRS = \
history \
inheritance \
inuse \
+ kstat \
large_files \
largest_pool \
libzfs \
diff --git a/tests/zfs-tests/tests/functional/kstat/Makefile.am b/tests/zfs-tests/tests/functional/kstat/Makefile.am
new file mode 100644
index 00000000..8ad83ec3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/kstat/Makefile.am
@@ -0,0 +1,5 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/kstat
+dist_pkgdata_SCRIPTS = \
+ setup.ksh \
+ cleanup.ksh \
+ state.ksh
diff --git a/tests/zfs-tests/tests/functional/kstat/cleanup.ksh b/tests/zfs-tests/tests/functional/kstat/cleanup.ksh
new file mode 100755
index 00000000..8a212ce3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/kstat/cleanup.ksh
@@ -0,0 +1,28 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/kstat/setup.ksh b/tests/zfs-tests/tests/functional/kstat/setup.ksh
new file mode 100755
index 00000000..57717a09
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/kstat/setup.ksh
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+if ! is_linux ; then
+ log_unsupported "/proc/spl/kstat/<pool>/health only supported on Linux"
+fi
+
+default_mirror_setup $DISKS
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/kstat/state.ksh b/tests/zfs-tests/tests/functional/kstat/state.ksh
new file mode 100755
index 00000000..bf0b6e31
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/kstat/state.ksh
@@ -0,0 +1,144 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+#
+# DESCRIPTION:
+# Test /proc/spl/kstat/zfs/<pool>/state kstat
+#
+# STRATEGY:
+# 1. Create a mirrored pool
+# 2. Check that pool is ONLINE
+# 3. Fault one disk
+# 4. Check that pool is DEGRADED
+# 5. Create a new pool with a single scsi_debug disk
+# 6. Remove the disk
+# 7. Check that pool is SUSPENDED
+# 8. Add the disk back in
+# 9. Clear errors and destroy the pools
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+function cleanup
+{
+ # Destroy the scsi_debug pool
+ if [ -n "$TESTPOOL2" ] ; then
+ if [ -n "$host" ] ; then
+ # Re-enable the disk
+ scan_scsi_hosts $host
+
+ # Device may have changed names after being inserted
+ SDISK=$(get_debug_device)
+ log_must ln $DEV_RDSKDIR/$SDISK $REALDISK
+ fi
+
+ # Restore our working pool image
+ if [ -n "$BACKUP" ] ; then
+ gunzip -c $BACKUP > $REALDISK
+ log_must rm -f $BACKUP
+ fi
+
+ # Our disk is back. Now we can clear errors and destroy the
+ # pool cleanly.
+ log_must zpool clear $TESTPOOL2
+
+ # Now that the disk is back and errors cleared, wait for our
+ # hung 'zpool scrub' to finish.
+ wait
+
+ destroy_pool $TESTPOOL2
+ log_must rm $REALDISK
+ unload_scsi_debug
+ fi
+}
+
+# Check that our pool state values match what's expected
+#
+# $1: pool name
+# $2: expected state ("ONLINE", "DEGRADED", "SUSPENDED", etc)
+function check_all
+{
+ pool=$1
+ expected=$2
+
+ state1=$(zpool status $pool | awk '/state: /{print $2}');
+ state2=$(zpool list -H -o health $pool)
+ state3=$(cat /proc/spl/kstat/zfs/$pool/state)
+ log_note "Checking $expected = $state1 = $state2 = $state3"
+ if [[ "$expected" == "$state1" && "$expected" == "$state2" && \
+ "$expected" == "$state3" ]] ; then
+ true
+ else
+ false
+ fi
+}
+
+log_onexit cleanup
+
+log_assert "Testing /proc/spl/kstat/zfs/<pool>/state kstat"
+
+# Test that the initial pool is healthy
+check_all $TESTPOOL "ONLINE"
+
+# Fault one of the disks, and check that pool is degraded
+DISK1=$(echo "$DISKS" | awk '{print $2}')
+zpool offline -tf $TESTPOOL $DISK1
+check_all $TESTPOOL "DEGRADED"
+
+# Create a new pool out of a scsi_debug disk
+TESTPOOL2=testpool2
+MINVDEVSIZE_MB=$((MINVDEVSIZE / 1048576))
+load_scsi_debug $MINVDEVSIZE_MB 1 1 1 '512b'
+
+SDISK=$(get_debug_device)
+host=$(get_scsi_host $SDISK)
+
+# Use $REALDISK instead of $SDISK in our pool because $SDISK can change names
+# as we remove/add the disk (i.e. /dev/sdf -> /dev/sdg).
+REALDISK=/dev/kstat-state-realdisk
+log_must [ ! -e $REALDISK ]
+ln $DEV_RDSKDIR/$SDISK $REALDISK
+
+log_must zpool create $TESTPOOL2 $REALDISK
+
+# Backup the contents of the disk image
+BACKUP=/tmp/kstat-state-realdisk.gz
+log_must [ ! -e $BACKUP ]
+gzip -c $REALDISK > $BACKUP
+
+# Yank out the disk from under the pool
+log_must rm $REALDISK
+remove_disk $SDISK
+
+# Run a 'zpool scrub' in the background to suspend the pool. We run it in the
+# background since the command will hang when the pool gets suspended. The
+# command will resume and exit after we restore the missing disk later on.
+zpool scrub $TESTPOOL2 &
+sleep 1 # Give the scrub some time to run before we check if it fails
+
+log_must check_all $TESTPOOL2 "SUSPENDED"
+
+log_pass "/proc/spl/kstat/zfs/<pool>/state test successful"

View File

@ -0,0 +1,115 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 19 Jun 2018 21:52:45 -0700
Subject: [PATCH] Linux 4.14 compat: blk_queue_stackable()
The blk_queue_stackable() function was replaced in the 4.14 kernel
by queue_is_rq_based(), commit torvalds/linux@5fdee212. This change
resulted in the default elevator being used which can negatively
impact performance.
Rather than adding additional compatibility code to detect the
new interface, unconditionally attempt to set the elevator. Since
we expect this to fail for block devices without an elevator the
error message has been moved into zfs_dbgmsg().
Finally, it was observed that the elevator_change() was removed
from the 4.12 kernel, commit torvalds/linux@c033269. Update the
comment to clearly specify which kernels are expected to export
the elevator_change() symbol.
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #7645
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
config/kernel-elevator-change.m4 | 4 ++--
include/linux/blkdev_compat.h | 11 -----------
module/zfs/vdev_disk.c | 22 ++++++++++------------
3 files changed, 12 insertions(+), 25 deletions(-)
diff --git a/config/kernel-elevator-change.m4 b/config/kernel-elevator-change.m4
index ace5aa82..eba25257 100644
--- a/config/kernel-elevator-change.m4
+++ b/config/kernel-elevator-change.m4
@@ -1,6 +1,6 @@
dnl #
-dnl # 2.6.36 API change
-dnl # Verify the elevator_change() symbol is available.
+dnl # 2.6.36 API, exported elevator_change() symbol
+dnl # 4.12 API, removed elevator_change() symbol
dnl #
AC_DEFUN([ZFS_AC_KERNEL_ELEVATOR_CHANGE], [
AC_MSG_CHECKING([whether elevator_change() is available])
diff --git a/include/linux/blkdev_compat.h b/include/linux/blkdev_compat.h
index 27f05662..c8cdf38e 100644
--- a/include/linux/blkdev_compat.h
+++ b/include/linux/blkdev_compat.h
@@ -106,17 +106,6 @@ blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
#endif
/*
- * 2.6.27 API change,
- * The blk_queue_stackable() queue flag was added in 2.6.27 to handle dm
- * stacking drivers. Prior to this request stacking drivers were detected
- * by checking (q->request_fn == NULL), for earlier kernels we revert to
- * this legacy behavior.
- */
-#ifndef blk_queue_stackable
-#define blk_queue_stackable(q) ((q)->request_fn == NULL)
-#endif
-
-/*
* 2.6.34 API change,
* The blk_queue_max_hw_sectors() function replaces blk_queue_max_sectors().
*/
diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c
index 6dc0544f..c5708cb2 100644
--- a/module/zfs/vdev_disk.c
+++ b/module/zfs/vdev_disk.c
@@ -168,23 +168,20 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
return;
- /* Skip devices without schedulers (loop, ram, dm, etc) */
- if (!q->elevator || !blk_queue_stackable(q))
- return;
-
/* Leave existing scheduler when set to "none" */
if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
return;
+ /*
+ * The elevator_change() function was available in kernels from
+ * 2.6.36 to 4.11. When not available fall back to using the user
+ * mode helper functionality to set the elevator via sysfs. This
+ * requires /bin/echo and sysfs to be mounted which may not be true
+ * early in the boot process.
+ */
#ifdef HAVE_ELEVATOR_CHANGE
error = elevator_change(q, elevator);
#else
- /*
- * For pre-2.6.36 kernels elevator_change() is not available.
- * Therefore we fall back to using a usermodehelper to echo the
- * elevator into sysfs; This requires /bin/echo and sysfs to be
- * mounted which may not be true early in the boot process.
- */
#define SET_SCHEDULER_CMD \
"exec 0</dev/null " \
" 1>/sys/block/%s/queue/scheduler " \
@@ -198,9 +195,10 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
- if (error)
- printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+ if (error) {
+ zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
elevator, v->vdev_path, device, error);
+ }
}
/*

View File

@ -0,0 +1,54 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Troels=20N=C3=B8rgaard?= <tnn@tradeshift.com>
Date: Sat, 7 Jul 2018 01:15:19 +0200
Subject: [PATCH] Default ashift for Amazon EC2 NVMe devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add a default 4 KiB ashift for Amazon EC2 NVMe devices on instances with
NVMe ephemeral devices, such as the types c5d, f1, i3 and m5d.
As per the official documentation [1] a 4096 byte blocksize should be
used to match the underlying hardware.
The string was identified via:
$ sudo sginfo -M /dev/nvme0n1
INQUIRY response (cmd: 0x12)
----------------------------
Device Type 0
Vendor: NVMe
Product: Amazon EC2 NVMe
Revision level:
$ lsblk -io KNAME,TYPE,SIZE,MODEL
KNAME TYPE SIZE MODEL
nvme0n1 disk 442.4G Amazon EC2 NVMe Instance Storage
[1] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/
storage-optimized-instances.html
Retrieved 2018-07-03
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Troels Nørgaard <tnn@tradeshift.com>
Closes #7676
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
cmd/zpool/zpool_vdev.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
index fd6bd9e7..69ff7ff6 100644
--- a/cmd/zpool/zpool_vdev.c
+++ b/cmd/zpool/zpool_vdev.c
@@ -191,6 +191,7 @@ static vdev_disk_db_entry_t vdev_disk_database[] = {
{"ATA INTEL SSDSC2BP24", 4096},
{"ATA INTEL SSDSC2BP48", 4096},
{"NA SmrtStorSDLKAE9W", 4096},
+ {"NVMe Amazon EC2 NVMe ", 4096},
/* Imported from Open Solaris */
{"ATA MARVELL SD88SA02", 4096},
/* Advanced format Hard drives */

View File

@ -0,0 +1,123 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 11 Jul 2018 13:10:40 -0700
Subject: [PATCH] Fix kernel unaligned access on sparc64
Update the SA_COPY_DATA macro to check if the architecture supports
efficient unaligned memory accesses at compile time. Otherwise
fall back to using the sa_copy_data() function.
The kernel provided CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
used to determine availability in kernel space. In user space
the x86_64, x86, powerpc, and sometimes arm architectures will
define the HAVE_EFFICIENT_UNALIGNED_ACCESS macro.
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #7642
Closes #7684
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
lib/libspl/include/sys/isa_defs.h | 7 +++++++
module/icp/algs/modes/ccm.c | 2 +-
module/zfs/sa.c | 35 ++++++++++++++++++++---------------
3 files changed, 28 insertions(+), 16 deletions(-)
diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h
index a5bea039..7a90e077 100644
--- a/lib/libspl/include/sys/isa_defs.h
+++ b/lib/libspl/include/sys/isa_defs.h
@@ -55,6 +55,7 @@ extern "C" {
#endif
#define _SUNOS_VTOC_16
+#define HAVE_EFFICIENT_UNALIGNED_ACCESS
/* i386 arch specific defines */
#elif defined(__i386) || defined(__i386__)
@@ -76,6 +77,7 @@ extern "C" {
#endif
#define _SUNOS_VTOC_16
+#define HAVE_EFFICIENT_UNALIGNED_ACCESS
/* powerpc arch specific defines */
#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__)
@@ -99,6 +101,7 @@ extern "C" {
#endif
#define _SUNOS_VTOC_16
+#define HAVE_EFFICIENT_UNALIGNED_ACCESS
/* arm arch specific defines */
#elif defined(__arm) || defined(__arm__) || defined(__aarch64__)
@@ -129,6 +132,10 @@ extern "C" {
#define _SUNOS_VTOC_16
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define HAVE_EFFICIENT_UNALIGNED_ACCESS
+#endif
+
/* sparc arch specific defines */
#elif defined(__sparc) || defined(__sparc__)
diff --git a/module/icp/algs/modes/ccm.c b/module/icp/algs/modes/ccm.c
index 22aeb0a6..fb41194f 100644
--- a/module/icp/algs/modes/ccm.c
+++ b/module/icp/algs/modes/ccm.c
@@ -28,7 +28,7 @@
#include <sys/crypto/common.h>
#include <sys/crypto/impl.h>
-#if defined(__i386) || defined(__amd64)
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
#include <sys/byteorder.h>
#define UNALIGNED_POINTERS_PERMITTED
#endif
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 8046dbde..1fb1a8b5 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -147,21 +147,26 @@ arc_byteswap_func_t sa_bswap_table[] = {
zfs_acl_byteswap,
};
-#define SA_COPY_DATA(f, s, t, l) \
- { \
- if (f == NULL) { \
- if (l == 8) { \
- *(uint64_t *)t = *(uint64_t *)s; \
- } else if (l == 16) { \
- *(uint64_t *)t = *(uint64_t *)s; \
- *(uint64_t *)((uintptr_t)t + 8) = \
- *(uint64_t *)((uintptr_t)s + 8); \
- } else { \
- bcopy(s, t, l); \
- } \
- } else \
- sa_copy_data(f, s, t, l); \
- }
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+#define SA_COPY_DATA(f, s, t, l) \
+do { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else { \
+ sa_copy_data(f, s, t, l); \
+ } \
+} while (0)
+#else
+#define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l)
+#endif
/*
* This table is fixed and cannot be changed. Its purpose is to

View File

@ -48,7 +48,6 @@ Closes #7659
Closes #7691 Closes #7691
Closes #7693 Closes #7693
(Cherry-picked from ac09630d8b0bf6c92084a30fdaefd03fd0adbdc1)
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
--- ---
include/sys/zfs_vfsops.h | 1 + include/sys/zfs_vfsops.h | 1 +

View File

@ -0,0 +1,133 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Toomas Soome <tsoome@me.com>
Date: Wed, 1 Jun 2016 19:18:10 +0300
Subject: [PATCH] OpenZFS 8906 - uts: illumos rootfs should support salted
cksum
Porting notes:
* As of grub-2.02 these checksums are not supported. However, as
pointed out in #6501 there are alternatives such as EFISTUB which
work and have no such restriction. A warning was added to the
checksum property section of the zfs.8 man page.
Authored by: Toomas Soome <tsoome@me.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Yuri Pankov <yuripv@yuripv.net>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://illumos.org/issues/8906
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/7dec52f
Closes #6501
Closes #7714
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
man/man5/zpool-features.5 | 18 +++++++-----------
man/man8/zfs.8 | 5 ++++-
module/zfs/zfs_ioctl.c | 11 +----------
3 files changed, 12 insertions(+), 22 deletions(-)
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
index 78ea559f..140ce269 100644
--- a/man/man5/zpool-features.5
+++ b/man/man5/zpool-features.5
@@ -14,7 +14,7 @@
.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
-.TH ZPOOL-FEATURES 5 "Aug 27, 2013"
+.TH ZPOOL-FEATURES 5 "Jun 8, 2018"
.SH NAME
zpool\-features \- ZFS pool feature descriptions
.SH DESCRIPTION
@@ -248,8 +248,9 @@ immediately activate the \fBlz4_compress\fR feature on the underlying
pool using the \fBzfs\fR(1M) command. Also, all newly written metadata
will be compressed with \fBlz4\fR algorithm. Since this feature is not
read-only compatible, this operation will render the pool unimportable
-on systems without support for the \fBlz4_compress\fR feature. Booting
-off of \fBlz4\fR-compressed root pools is supported.
+on systems without support for the \fBlz4_compress\fR feature.
+
+Booting off of \fBlz4\fR-compressed root pools is supported.
This feature becomes \fBactive\fR as soon as it is enabled and will
never return to being \fBenabled\fB.
@@ -510,8 +511,7 @@ can turn on the \fBsha512\fR checksum on any dataset using the
and will return to being \fBenabled\fR once all filesystems that have
ever had their checksum set to \fBsha512\fR are destroyed.
-Booting off of pools utilizing SHA-512/256 is supported (provided that
-the updated GRUB stage2 module is installed).
+Booting off of pools utilizing SHA-512/256 is supported.
.RE
@@ -545,9 +545,7 @@ can turn on the \fBskein\fR checksum on any dataset using the
and will return to being \fBenabled\fR once all filesystems that have
ever had their checksum set to \fBskein\fR are destroyed.
-Booting off of pools using \fBskein\fR is \fBNOT\fR supported
--- any attempt to enable \fBskein\fR on a root pool will fail with an
-error.
+Booting off of pools using \fBskein\fR is supported.
.RE
@@ -587,9 +585,7 @@ can turn on the \fBedonr\fR checksum on any dataset using the
and will return to being \fBenabled\fR once all filesystems that have
ever had their checksum set to \fBedonr\fR are destroyed.
-Booting off of pools using \fBedonr\fR is \fBNOT\fR supported
--- any attempt to enable \fBedonr\fR on a root pool will fail with an
-error.
+Booting off of pools using \fBedonr\fR is supported.
.RE
diff --git a/man/man8/zfs.8 b/man/man8/zfs.8
index 48a5e6ea..bb3b46e3 100644
--- a/man/man8/zfs.8
+++ b/man/man8/zfs.8
@@ -29,7 +29,7 @@
.\" Copyright 2016 Nexenta Systems, Inc.
.\" Copyright 2016 Richard Laager. All rights reserved.
.\"
-.Dd June 28, 2017
+.Dd July 13, 2018
.Dt ZFS 8 SMM
.Os Linux
.Sh NAME
@@ -1049,6 +1049,9 @@ The
and
.Sy edonr
checksum algorithms require enabling the appropriate features on the pool.
+These algorithms are not supported by GRUB and should not be set on the
+.Sy bootfs
+filesystem when using GRUB to boot the system.
Please see
.Xr zpool-features 5
for more information on these algorithms.
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index f4f509a7..6516f646 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3985,16 +3985,7 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
- /*
- * Salted checksums are not supported on root pools.
- */
- if (spa_bootfs(spa) != 0 &&
- intval < ZIO_CHECKSUM_FUNCTIONS &&
- (zio_checksum_table[intval].ci_flags &
- ZCHECKSUM_FLAG_SALTED)) {
- spa_close(spa, FTAG);
- return (SET_ERROR(ERANGE));
- }
+
if (!spa_feature_is_enabled(spa, feature)) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));

View File

@ -0,0 +1,108 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: LOLi <loli10K@users.noreply.github.com>
Date: Tue, 1 May 2018 05:58:29 +0200
Subject: [PATCH] Fix zfs incremental send remove '-o' properties
When receiving an incremental send stream with intermediary snapshots
zfs_receive_one() does not correctly identify the top-level dataset:
consequently we restore said snapshots as if they were child
datasets in the hierarchy, forcing inheritance of any property received
with 'zfs send -o' and effectively removing any locally set value.
The test case did not correctly verify this situation because it uses
adjacent snapshots, basically testing 'zfs send -i' instead of
'zfs send -I': this commit adds an additional intermediary snapshot to
the test script.
Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #7478
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
lib/libzfs/libzfs_sendrecv.c | 2 +-
.../zfs_receive/receive-o-x_props_override.ksh | 22 +++++++++++++---------
2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index 5490581a..c5acd21a 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -3592,7 +3592,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
goto out;
}
- if (top_zfs && *top_zfs == NULL)
+ if (top_zfs && (*top_zfs == NULL || strcmp(*top_zfs, name) == 0))
toplevel = B_TRUE;
if (drrb->drr_type == DMU_OST_ZVOL) {
type = ZFS_TYPE_VOLUME;
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
index e4e69851..4e3a5393 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
@@ -212,16 +212,17 @@ log_must eval "zfs send -R $orig@snap1 > $streamfile_repl"
log_must eval "zfs recv $dest < $streamfile_repl"
# Fill the datasets with properties and create an incremental replication stream
log_must zfs snapshot -r $orig@snap2
+log_must zfs snapshot -r $orig@snap3
log_must eval "zfs set copies=2 $orig"
log_must eval "zfs set '$userprop:orig'='$userval' $orig"
log_must eval "zfs set '$userprop:orig'='$userval' $origsub"
log_must eval "zfs set '$userprop:snap'='$userval' $orig@snap1"
-log_must eval "zfs set '$userprop:snap'='$userval' $origsub@snap2"
-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
+log_must eval "zfs set '$userprop:snap'='$userval' $origsub@snap3"
+log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
# Sets various combination of override and exclude options
log_must eval "zfs recv -F -o atime=off -o '$userprop:dest2'='$userval' "\
"-o quota=123456789 -x compression -x '$userprop:orig' " \
- "-x '$userprop:snap2' $dest < $streamfile_incr"
+ "-x '$userprop:snap3' $dest < $streamfile_incr"
# Verify we can correctly override and exclude properties
log_must eval "check_prop_source $dest copies 2 received"
log_must eval "check_prop_source $dest atime off local"
@@ -237,9 +238,9 @@ log_must eval "check_prop_missing $destsub '$userprop:orig'"
log_must eval "check_prop_source " \
"$dest@snap1 '$userprop:snap' '$userval' received"
log_must eval "check_prop_source " \
- "$destsub@snap2 '$userprop:snap' '$userval' received"
-log_must eval "check_prop_missing $dest@snap2 '$userprop:snap2'"
-log_must eval "check_prop_missing $destsub@snap2 '$userprop:snap2'"
+ "$destsub@snap3 '$userprop:snap' '$userval' received"
+log_must eval "check_prop_missing $dest@snap3 '$userprop:snap3'"
+log_must eval "check_prop_missing $destsub@snap3 '$userprop:snap3'"
# Cleanup
log_must zfs destroy -r -f $orig
log_must zfs destroy -r -f $dest
@@ -270,7 +271,8 @@ log_must eval "zfs set compression=gzip $dest"
log_must eval "zfs set '$userprop:dest'='localval' $dest"
# Receive the new stream, verify we preserve locally set properties
log_must zfs snapshot -r $orig@snap2
-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
+log_must zfs snapshot -r $orig@snap3
+log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
log_must eval "zfs recv -F -x copies -x compression -x '$userprop:orig' " \
"-x '$userprop:dest' $dest < $streamfile_incr"
log_must eval "check_prop_source $dest '$userprop:dest' 'localval' local"
@@ -305,7 +307,8 @@ log_must eval "check_prop_source $destsub quota 0 default"
log_must eval "zfs set quota=123456789 $dest"
log_must eval "zfs set canmount=off $destsub"
log_must zfs snapshot -r $orig@snap2
-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
+log_must zfs snapshot -r $orig@snap3
+log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
log_must eval "zfs recv -F -x quota -x canmount $dest < $streamfile_incr"
log_must eval "check_prop_source $dest quota 123456789 local"
log_must eval "check_prop_source $destsub quota 0 default"
@@ -332,7 +335,8 @@ log_must eval "zfs set '$userprop:origsub'='$userval' $destsub"
mntpnt=$(get_prop mountpoint $orig)
log_must eval "dd if=/dev/urandom of=$mntpnt/file bs=1024k count=10"
log_must zfs snapshot -r $orig@snap2
-log_must eval "zfs send -R -I $orig@snap1 $orig@snap2 > $streamfile_incr"
+log_must zfs snapshot -r $orig@snap3
+log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
log_must eval "dd if=$streamfile_incr of=$streamfile_trun bs=1024k count=9"
# Receive the truncated stream, verify original properties are kept
log_mustnot eval "zfs recv -F -o copies=3 -o quota=987654321 "\

View File

@ -0,0 +1,95 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: LOLi <loli10K@users.noreply.github.com>
Date: Fri, 3 Aug 2018 23:56:25 +0200
Subject: [PATCH] Allow inherited properties in zfs_check_settable()
This change modifies how 'checksum' and 'dedup' properties are verified
in zfs_check_settable() handling the case where they are explicitly
inherited in the dataset hierarchy when receiving a recursive send
stream.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #7755
Closes #7576
Closes #7757
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/zfs_ioctl.c | 26 +++++++++++-----------
.../zfs_receive/receive-o-x_props_override.ksh | 6 +++--
2 files changed, 17 insertions(+), 15 deletions(-)
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 6516f646..b8783e54 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -3967,7 +3967,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
{
spa_feature_t feature;
spa_t *spa;
- uint64_t intval;
int err;
/* dedup feature version checks */
@@ -3975,22 +3974,23 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
return (SET_ERROR(ENOTSUP));
- if (nvpair_value_uint64(pair, &intval) != 0)
- return (SET_ERROR(EINVAL));
-
- /* check prop value is enabled in features */
- feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
- if (feature == SPA_FEATURE_NONE)
- break;
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ /* check prop value is enabled in features */
+ feature = zio_checksum_to_feature(
+ intval & ZIO_CHECKSUM_MASK);
+ if (feature == SPA_FEATURE_NONE)
+ break;
- if ((err = spa_open(dsname, &spa, FTAG)) != 0)
- return (err);
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
- if (!spa_feature_is_enabled(spa, feature)) {
+ if (!spa_feature_is_enabled(spa, feature)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
spa_close(spa, FTAG);
- return (SET_ERROR(ENOTSUP));
}
- spa_close(spa, FTAG);
break;
}
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
index 4e3a5393..583d8eb1 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
@@ -221,15 +221,17 @@ log_must eval "zfs set '$userprop:snap'='$userval' $origsub@snap3"
log_must eval "zfs send -R -I $orig@snap1 $orig@snap3 > $streamfile_incr"
# Sets various combination of override and exclude options
log_must eval "zfs recv -F -o atime=off -o '$userprop:dest2'='$userval' "\
- "-o quota=123456789 -x compression -x '$userprop:orig' " \
- "-x '$userprop:snap3' $dest < $streamfile_incr"
+ "-o quota=123456789 -o checksum=sha512 -x compression "\
+ "-x '$userprop:orig' -x '$userprop:snap3' $dest < $streamfile_incr"
# Verify we can correctly override and exclude properties
log_must eval "check_prop_source $dest copies 2 received"
log_must eval "check_prop_source $dest atime off local"
log_must eval "check_prop_source $dest '$userprop:dest2' '$userval' local"
log_must eval "check_prop_source $dest quota 123456789 local"
+log_must eval "check_prop_source $dest checksum sha512 local"
log_must eval "check_prop_inherit $destsub copies $dest"
log_must eval "check_prop_inherit $destsub atime $dest"
+log_must eval "check_prop_inherit $destsub checksum $dest"
log_must eval "check_prop_inherit $destsub '$userprop:dest2' $dest"
log_must eval "check_prop_source $destsub quota 0 default"
log_must eval "check_prop_source $destsub compression off default"

View File

@ -0,0 +1,33 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: LOLi <loli10K@users.noreply.github.com>
Date: Sat, 18 Aug 2018 22:10:36 +0200
Subject: [PATCH] Fix arcstat.py handling of unsupported options
This change allows the arcstat.py script to handle unsupported options
gracefully and print both error and usage messages when one such option
is provided.
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #7799
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
cmd/arcstat/arcstat.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py
index 85c83ccc..b52a8c29 100755
--- a/cmd/arcstat/arcstat.py
+++ b/cmd/arcstat/arcstat.py
@@ -285,7 +285,7 @@ def init():
]
)
except getopt.error as msg:
- sys.stderr.write(msg)
+ sys.stderr.write("Error: %s\n" % str(msg))
usage()
opts = None

View File

@ -0,0 +1,123 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: DeHackEd <DeHackEd@users.noreply.github.com>
Date: Mon, 20 Aug 2018 12:55:18 -0400
Subject: [PATCH] Don't modify argv[] in user tools
argv[] gets modified during string parsing for input arguments. This
is reflected in the live process listing. Don't do that.
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: loli10K <ezomori.nozomu@gmail.com>
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: DHE <git@dehacked.net>
Closes #7760
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
cmd/zfs/zfs_main.c | 18 ++++++++++++++++--
cmd/zpool/zpool_main.c | 18 ++++++++++++++++--
2 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index f57df858..275d9c89 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -7041,6 +7041,7 @@ main(int argc, char **argv)
int ret = 0;
int i = 0;
char *cmdname;
+ char **newargv;
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
@@ -7096,16 +7097,25 @@ main(int argc, char **argv)
libzfs_print_on_error(g_zfs, B_TRUE);
/*
+ * Many commands modify input strings for string parsing reasons.
+ * We create a copy to protect the original argv.
+ */
+ newargv = malloc((argc + 1) * sizeof (newargv[0]));
+ for (i = 0; i < argc; i++)
+ newargv[i] = strdup(argv[i]);
+ newargv[argc] = NULL;
+
+ /*
* Run the appropriate command.
*/
libzfs_mnttab_cache(g_zfs, B_TRUE);
if (find_command_idx(cmdname, &i) == 0) {
current_command = &command_table[i];
- ret = command_table[i].func(argc - 1, argv + 1);
+ ret = command_table[i].func(argc - 1, newargv + 1);
} else if (strchr(cmdname, '=') != NULL) {
verify(find_command_idx("set", &i) == 0);
current_command = &command_table[i];
- ret = command_table[i].func(argc, argv);
+ ret = command_table[i].func(argc, newargv);
} else {
(void) fprintf(stderr, gettext("unrecognized "
"command '%s'\n"), cmdname);
@@ -7113,6 +7123,10 @@ main(int argc, char **argv)
ret = 1;
}
+ for (i = 0; i < argc; i++)
+ free(newargv[i]);
+ free(newargv);
+
if (ret == 0 && log_history)
(void) zpool_log_history(g_zfs, history_str);
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 97697011..a4fd0321 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7971,6 +7971,7 @@ main(int argc, char **argv)
int ret = 0;
int i = 0;
char *cmdname;
+ char **newargv;
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
@@ -8006,15 +8007,24 @@ main(int argc, char **argv)
zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
/*
+ * Many commands modify input strings for string parsing reasons.
+ * We create a copy to protect the original argv.
+ */
+ newargv = malloc((argc + 1) * sizeof (newargv[0]));
+ for (i = 0; i < argc; i++)
+ newargv[i] = strdup(argv[i]);
+ newargv[argc] = NULL;
+
+ /*
* Run the appropriate command.
*/
if (find_command_idx(cmdname, &i) == 0) {
current_command = &command_table[i];
- ret = command_table[i].func(argc - 1, argv + 1);
+ ret = command_table[i].func(argc - 1, newargv + 1);
} else if (strchr(cmdname, '=')) {
verify(find_command_idx("set", &i) == 0);
current_command = &command_table[i];
- ret = command_table[i].func(argc, argv);
+ ret = command_table[i].func(argc, newargv);
} else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
/*
* 'freeze' is a vile debugging abomination, so we treat
@@ -8031,6 +8041,10 @@ main(int argc, char **argv)
ret = 1;
}
+ for (i = 0; i < argc; i++)
+ free(newargv[i]);
+ free(newargv);
+
if (ret == 0 && log_history)
(void) zpool_log_history(g_zfs, history_str);

View File

@ -0,0 +1,42 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Tue, 31 Jul 2018 10:17:44 -0700
Subject: [PATCH] Add missing zfs-dracut RPM dependencies
The zfs-dracut package requires the hostid, basename, head, awk,
and grep utilities be installed. The first three are provided by
coreutils but additional dependencies are required for awk and grep.
Reviewed-by: Manuel Amador (Rudd-O) <rudd-o@rudd-o.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #7729
Closes #7747
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
rpm/generic/zfs.spec.in | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 5b89db02..398221c6 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -196,7 +196,7 @@ Requires: acl
Requires: sudo
Requires: sysstat
Requires: rng-tools
-Requires: libaio
+Requires: libaio
AutoReqProv: no
%description test
@@ -208,6 +208,8 @@ Summary: Dracut module
Group: System Environment/Kernel
Requires: %{name}%{?_isa} = %{version}-%{release}
Requires: dracut
+Requires: /usr/bin/awk
+Requires: grep
%description dracut
This package contains a dracut module used to construct an initramfs

View File

@ -0,0 +1,31 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Thu, 23 Aug 2018 09:34:34 -0700
Subject: [PATCH] Add libaio-devel BuildRequires
The zfs-test package needs a build requirement on the libaio-devel
package. Without it ./configure will correctly determine that
mmap_libaio cannot be built and it will be skipped.
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #7821
Closes #7824
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
rpm/generic/zfs.spec.in | 1 +
1 file changed, 1 insertion(+)
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 398221c6..16c5780b 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -197,6 +197,7 @@ Requires: sudo
Requires: sysstat
Requires: rng-tools
Requires: libaio
+BuildRequires: libaio-devel
AutoReqProv: no
%description test

View File

@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: LOLi <loli10K@users.noreply.github.com>
Date: Sun, 26 Aug 2018 21:43:27 +0200
Subject: [PATCH] Fix libaio-devel requirement for Debian-based distributions
BuildRequires tags for "-devel" packages in the RPM spec file do not
work when building on Debian-based distributions.
Fix this issue by making this requirement conditional on RPM-based
distributions.
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: loli10K <ezomori.nozomu@gmail.com>
Closes #7829
Closes #7831
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
rpm/generic/zfs.spec.in | 2 ++
1 file changed, 2 insertions(+)
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 16c5780b..22565725 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -197,7 +197,9 @@ Requires: sudo
Requires: sysstat
Requires: rng-tools
Requires: libaio
+%if 0%{?rhel}%{?fedora}%{?suse_version}
BuildRequires: libaio-devel
+%endif
AutoReqProv: no
%description test

View File

@ -0,0 +1,61 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Joao Carlos Mendes Luis <dioni21@users.noreply.github.com>
Date: Sun, 26 Aug 2018 16:55:44 -0300
Subject: [PATCH] Fedora 28: Fix misc bounds check compiler warnings
Fix a bunch of truncation compiler warnings that show up
on Fedora 28 (GCC 8.0.1).
Reviewed-by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #7368
Closes #7826
Closes #7830
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
lib/libshare/smb.c | 2 +-
module/icp/core/kcf_mech_tabs.c | 2 +-
tests/zfs-tests/tests/functional/ctime/ctime.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/libshare/smb.c b/lib/libshare/smb.c
index 76145bd9..91d4decb 100644
--- a/lib/libshare/smb.c
+++ b/lib/libshare/smb.c
@@ -218,7 +218,7 @@ smb_enable_share_one(const char *sharename, const char *sharepath)
int rc;
/* Support ZFS share name regexp '[[:alnum:]_-.: ]' */
- strncpy(name, sharename, sizeof (name));
+ strlcpy(name, sharename, sizeof (name));
name [sizeof (name)-1] = '\0';
pos = name;
diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c
index 723bfdb6..741dae7a 100644
--- a/module/icp/core/kcf_mech_tabs.c
+++ b/module/icp/core/kcf_mech_tabs.c
@@ -321,7 +321,7 @@ kcf_create_mech_entry(kcf_ops_class_t class, char *mechname)
mutex_enter(&(me_tab[i].me_mutex));
if (me_tab[i].me_name[0] == 0) {
/* Found an empty spot */
- (void) strncpy(me_tab[i].me_name, mechname,
+ (void) strlcpy(me_tab[i].me_name, mechname,
CRYPTO_MAX_MECH_NAME);
me_tab[i].me_name[CRYPTO_MAX_MECH_NAME-1] = '\0';
me_tab[i].me_mechid = KCF_MECHID(class, i);
diff --git a/tests/zfs-tests/tests/functional/ctime/ctime.c b/tests/zfs-tests/tests/functional/ctime/ctime.c
index ba8af15f..1cd18323 100644
--- a/tests/zfs-tests/tests/functional/ctime/ctime.c
+++ b/tests/zfs-tests/tests/functional/ctime/ctime.c
@@ -155,7 +155,7 @@ do_link(const char *pfile)
return (-1);
}
- strncpy(pfile_copy, pfile, sizeof (pfile_copy));
+ strncpy(pfile_copy, pfile, sizeof (pfile_copy)-1);
pfile_copy[sizeof (pfile_copy) - 1] = '\0';
/*
* Figure out source file directory name, and create

View File

@ -0,0 +1,556 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tim Chase <tim@chase2k.com>
Date: Mon, 27 Aug 2018 10:28:32 -0400
Subject: [PATCH] Fix problems receiving reallocated dnodes
This is a port of 047116ac - Raw sends must be able to decrease nlevels,
to the zfs-0.7-stable branch. It includes the various fixes to the
problem of receiving incremental streams which include reallocated dnodes
in which the number of dnode slots has changed but excludes the parts
which are related to raw streams.
From 047116ac:
Currently, when a raw zfs send file includes a
DRR_OBJECT record that would decrease the number of
levels of an existing object, the object is reallocated
with dmu_object_reclaim() which creates the new dnode
using the old object's nlevels. For non-raw sends this
doesn't really matter, but raw sends require that
nlevels on the receive side match that of the send
side so that the checksum-of-MAC tree can be properly
maintained. This patch corrects the issue by freeing
the object completely before allocating it again in
this case.
This patch also corrects several issues with
dnode_hold_impl() and related functions that prevented
dnodes (particularly multi-slot dnodes) from being
reallocated properly due to the fact that existing
dnodes were not being fully cleaned up when they
were freed.
This patch adds a test to make sure that zfs recv
functions properly with incremental streams containing
dnodes of different sizes.
This also includes a one-liner fix from loli10K to fix a test failure:
https://github.com/zfsonlinux/zfs/pull/7792#discussion_r212769264
Authored-by: Tom Caputi <tcaputi@datto.com>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
Closes #6821
Closes #6864
NOTE: This is the first of the port of 3 related patches to the
zfs-0.7-release branch of ZoL. The other two patches should immediately
follow this one.
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
cmd/ztest/ztest.c | 25 +++++-
include/sys/dnode.h | 6 ++
lib/libzfs/libzfs_sendrecv.c | 1 +
module/zfs/dmu_object.c | 1 -
module/zfs/dmu_send.c | 51 +++++++++--
module/zfs/dnode.c | 84 +++++++++++++++++--
module/zfs/dnode_sync.c | 2 +
tests/runfiles/linux.run | 2 +-
tests/zfs-tests/tests/functional/rsend/Makefile.am | 3 +-
.../functional/rsend/send_realloc_dnode_size.ksh | 98 ++++++++++++++++++++++
10 files changed, 258 insertions(+), 15 deletions(-)
create mode 100644 tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 1a320b03..a410eeef 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -197,7 +197,8 @@ extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
-extern int zfs_abd_scatter_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@@ -310,6 +311,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
@@ -357,6 +359,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+ ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
ZTI_INIT(ztest_zap, 30, &zopt_always),
ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
@@ -3927,6 +3930,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
umem_free(od, size);
}
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ uint64_t object;
+
+ /*
+ * Rewind the global allocator randomly back to a lower object number
+ * to force backfilling and reclamation of recently freed dnodes.
+ */
+ mutex_enter(&os->os_obj_lock);
+ object = ztest_random(os->os_obj_next_chunk);
+ os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+ mutex_exit(&os->os_obj_lock);
+}
+
#undef OD_ARRAY_SIZE
#define OD_ARRAY_SIZE 2
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index c7efe559..ea7defe1 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -360,6 +360,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
#define DNODE_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
@@ -454,6 +455,11 @@ typedef struct dnode_stats {
*/
kstat_named_t dnode_hold_free_txg;
/*
+ * Number of times dnode_free_interior_slots() needed to retry
+ * acquiring a slot zrl lock due to contention.
+ */
+ kstat_named_t dnode_free_interior_lock_retry;
+ /*
* Number of new dnodes allocated by dnode_allocate().
*/
kstat_named_t dnode_allocate;
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index c5acd21a..cadf16cc 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -3577,6 +3577,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
}
newfs = B_TRUE;
+ *cp = '/';
}
if (flags->verbose) {
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index e7412b75..f53da407 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
return (err);
}
-
int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index cdbc1cd1..148b5ff8 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2156,10 +2156,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
- if (err != 0 && err != ENOENT)
+ if (err != 0 && err != ENOENT && err != EEXIST)
return (SET_ERROR(EINVAL));
- object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
@@ -2175,13 +2173,56 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
+ object = drro->drr_object;
+
if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr) {
+ nblkptr < doi.doi_nblkptr ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = drro->drr_object;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (drro->drr_dn_slots > 1) {
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + drro->drr_dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, slot);
+
+ if (err != 0)
+ return (err);
+ }
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
}
tx = dmu_tx_create(rwa->os);
@@ -2732,7 +2773,7 @@ receive_read_record(struct receive_arg *ra)
* See receive_read_prefetch for an explanation why we're
* storing this object in the ignore_obj_list.
*/
- if (err == ENOENT ||
+ if (err == ENOENT || err == EEXIST ||
(err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
objlist_insert(&ra->ignore_objlist, drro->drr_object);
err = 0;
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index e05a4d0a..df6a4872 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
{ "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
{ "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
{ "dnode_allocate", KSTAT_DATA_UINT64 },
{ "dnode_reallocate", KSTAT_DATA_UINT64 },
{ "dnode_buf_evict", KSTAT_DATA_UINT64 },
@@ -516,7 +517,8 @@ dnode_destroy(dnode_t *dn)
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
- zrl_remove(&dn->dn_handle->dnh_zrlock);
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
@@ -662,6 +664,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+ dnode_free_interior_slots(dn);
DNODE_STAT_BUMP(dnode_reallocate);
/* clean up any unreferenced dbufs */
@@ -1062,19 +1066,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
}
static boolean_t
-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
{
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
- if (dnh->dnh_dnode != ptr)
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ dmu_object_type_t type = dn->dn_type;
+ mutex_exit(&dn->dn_mtx);
+
+ if (type != DMU_OT_NONE)
+ return (B_FALSE);
+
+ continue;
+ } else {
return (B_FALSE);
+ }
+
+ return (B_FALSE);
}
return (B_TRUE);
}
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots))
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
void
dnode_special_close(dnode_handle_t *dnh)
{
@@ -1355,7 +1413,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
while (dn == DN_SLOT_UNINIT) {
dnode_slots_hold(dnc, idx, slots);
- if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
@@ -1368,15 +1426,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
continue;
}
- if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
}
+ /*
+ * Allocated but otherwise free dnodes which would
+ * be in the interior of a multi-slot dnodes need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
dnh = &dnc->dnc_children[idx];
- dn = dnode_create(os, dn_block + idx, db, object, dnh);
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
}
mutex_enter(&dn->dn_mtx);
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 742d962b..8d65e385 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -533,6 +533,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
@@ -540,6 +541,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 69e9eb26..d8fe6f3a 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -605,7 +605,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
- 'send-c_recv_dedup', 'send_freeobjects']
+ 'send-c_recv_dedup', 'send_freeobjects', 'send_realloc_dnode_size']
tags = ['functional', 'rsend']
[tests/functional/scrub_mirror]
diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am
index 6b1aa8b3..a2837d1a 100644
--- a/tests/zfs-tests/tests/functional/rsend/Makefile.am
+++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am
@@ -36,7 +36,8 @@ dist_pkgdata_SCRIPTS = \
send-c_volume.ksh \
send-c_zstreamdump.ksh \
send-cpL_varied_recsize.ksh \
- send_freeobjects.ksh
+ send_freeobjects.ksh \
+ send_realloc_dnode_size.ksh
dist_pkgdata_DATA = \
rsend.cfg \
diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
new file mode 100644
index 00000000..20676394
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
@@ -0,0 +1,98 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify incremental receive properly handles objects with changed
+# dnode slot count.
+#
+# Strategy:
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+# get recycled numbers and formerly "interior" dnode slots get assigned
+# to new objects
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+# overlap with recently recycled and formerly "normal" dnode slots get
+# assigned to new objects
+# 4. Generate initial and incremental streams
+# 5. Verify initial and incremental streams can be received
+#
+
+verify_runnable "both"
+
+log_assert "Verify incremental receive handles objects with changed dnode size"
+
+function cleanup
+{
+ rm -f $BACKDIR/fs-dn-legacy
+ rm -f $BACKDIR/fs-dn-1k
+ rm -f $BACKDIR/fs-dn-2k
+
+ if datasetexists $POOL/fs ; then
+ log_must zfs destroy -rR $POOL/fs
+ fi
+
+ if datasetexists $POOL/newfs ; then
+ log_must zfs destroy -rR $POOL/newfs
+ fi
+}
+
+log_onexit cleanup
+
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+log_must zfs create -o dnodesize=1k $POOL/fs
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@a
+
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+# get recycled numbers and formerly "interior" dnode slots get assigned
+# to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=legacy $POOL/fs
+log_must zfs mount $POOL/fs
+
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@b
+
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+# overlap with recently recycled and formerly "normal" dnode slots get
+# assigned to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=2k $POOL/fs
+log_must zfs mount $POOL/fs
+
+mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@c
+
+# 4. Generate initial and incremental streams
+log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
+log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
+log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
+
+# 5. Verify initial and incremental streams can be received
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
+
+log_pass "Verify incremental receive handles objects with changed dnode size"

View File

@ -0,0 +1,134 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Tue, 17 Apr 2018 14:13:57 -0400
Subject: [PATCH] Fix object reclaim when using large dnodes
Currently, when the receive_object() code wants to reclaim an
object, it always assumes that the dnode is the legacy 512 bytes,
even when the incoming bonus buffer exceeds this length. This
causes a buffer overflow if --enable-debug is not provided and
triggers an ASSERT if it is. This patch resolves this issue and
adds an ASSERT to ensure this can't happen again.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #7097
Closes #7433
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/dmu_object.c | 2 +-
module/zfs/dmu_send.c | 5 +++--
module/zfs/dnode.c | 3 +--
.../functional/rsend/send_realloc_dnode_size.ksh | 21 +++++++++++++++++----
4 files changed, 22 insertions(+), 9 deletions(-)
mode change 100644 => 100755 tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index f53da407..1fc71d10 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -249,7 +249,7 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
- bonuslen, 0, tx));
+ bonuslen, DNODE_MIN_SIZE, tx));
}
int
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 148b5ff8..1de0f316 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2244,9 +2244,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_bonustype != doi.doi_bonus_type ||
drro->drr_bonuslen != doi.doi_bonus_size) {
/* currently allocated, but with different properties */
- err = dmu_object_reclaim(rwa->os, drro->drr_object,
+ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
+ drro->drr_bonustype, drro->drr_bonuslen,
+ drro->drr_dn_slots << DNODE_SHIFT, tx);
}
if (err != 0) {
dmu_tx_commit(tx);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index df6a4872..d465b545 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -662,8 +662,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(DMU_OT_IS_VALID(bonustype));
ASSERT3U(bonuslen, <=,
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
-
- dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+ ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
dnode_free_interior_slots(dn);
DNODE_STAT_BUMP(dnode_reallocate);
diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
old mode 100644
new mode 100755
index 20676394..12a72fa0
--- a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
@@ -13,6 +13,7 @@
#
# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Copyright (c) 2018 Datto Inc.
#
. $STF_SUITE/include/libtest.shlib
@@ -31,8 +32,10 @@
# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
# overlap with recently recycled and formerly "normal" dnode slots get
# assigned to new objects
-# 4. Generate initial and incremental streams
-# 5. Verify initial and incremental streams can be received
+# 4. Create an empty file and add xattrs to it to exercise reclaiming a
+# dnode that requires more than 1 slot for its bonus buffer (Zol #7433)
+# 5. Generate initial and incremental streams
+# 6. Verify initial and incremental streams can be received
#
verify_runnable "both"
@@ -44,6 +47,7 @@ function cleanup
rm -f $BACKDIR/fs-dn-legacy
rm -f $BACKDIR/fs-dn-1k
rm -f $BACKDIR/fs-dn-2k
+ rm -f $BACKDIR/fs-attr
if datasetexists $POOL/fs ; then
log_must zfs destroy -rR $POOL/fs
@@ -82,17 +86,26 @@ log_must zfs unmount $POOL/fs
log_must zfs set dnodesize=2k $POOL/fs
log_must zfs mount $POOL/fs
+log_must touch /$POOL/fs/attrs
mk_files 200 262144 0 $POOL/fs
log_must zfs snapshot $POOL/fs@c
-# 4. Generate initial and incremental streams
+# 4. Create an empty file and add xattrs to it to exercise reclaiming a
+# dnode that requires more than 1 slot for its bonus buffer (Zol #7433)
+log_must zfs set compression=on xattr=sa $POOL/fs
+log_must eval "python -c 'print \"a\" * 512' | attr -s bigval /$POOL/fs/attrs"
+log_must zfs snapshot $POOL/fs@d
+
+# 5. Generate initial and incremental streams
log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
+log_must eval "zfs send -i $POOL/fs@c $POOL/fs@d > $BACKDIR/fs-attr"
-# 5. Verify initial and incremental streams can be received
+# 6. Verify initial and incremental streams can be received
log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-attr"
log_pass "Verify incremental receive handles objects with changed dnode size"

View File

@ -0,0 +1,124 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Thu, 28 Jun 2018 17:55:11 -0400
Subject: [PATCH] Fix 'zfs recv' of non large_dnode send streams
Currently, there is a bug where older send streams without the
DMU_BACKUP_FEATURE_LARGE_DNODE flag are not handled correctly.
The code in receive_object() fails to handle cases where
drro->drr_dn_slots is set to 0, which is always the case when the
sending code does not support this feature flag. This patch fixes
the issue by ensuring that a value of 0 is treated as
DNODE_MIN_SLOTS.
Tested-by: DHE <git@dehacked.net>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #7617
Closes #7662
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/dmu_object.c | 3 +++
module/zfs/dmu_send.c | 33 +++++++++++++++++++++++++++------
2 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index 1fc71d10..40c25362 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -261,6 +261,9 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+
if (object == DMU_META_DNODE_OBJECT)
return (SET_ERROR(EBADF));
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 1de0f316..13aae960 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2139,6 +2139,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_tx_t *tx;
uint64_t object;
int err;
+ uint8_t dn_slots = drro->drr_dn_slots != 0 ?
+ drro->drr_dn_slots : DNODE_MIN_SLOTS;
if (drro->drr_type == DMU_OT_NONE ||
!DMU_OT_IS_VALID(drro->drr_type) ||
@@ -2150,7 +2152,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
drro->drr_bonuslen >
DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
- drro->drr_dn_slots >
+ dn_slots >
(spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
return (SET_ERROR(EINVAL));
}
@@ -2177,12 +2179,31 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (drro->drr_blksz != doi.doi_data_block_size ||
nblkptr < doi.doi_nblkptr ||
- drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+ dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
+
+ /*
+ * The dmu does not currently support decreasing nlevels
+ * on an object. For non-raw sends, this does not matter
+ * and the new object can just use the previous one's nlevels.
+ * For raw sends, however, the structure of the received dnode
+ * (including nlevels) must match that of the send side.
+ * Therefore, instead of using dmu_object_reclaim(), we must
+ * free the object completely and call dmu_object_claim_dnsize()
+ * instead.
+ */
+ if (dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+ err = dmu_free_long_object(rwa->os, drro->drr_object);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = DMU_NEW_OBJECT;
+ }
} else if (err == EEXIST) {
/*
* The object requested is currently an interior slot of a
@@ -2204,9 +2225,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
* another object from the previous snapshot. We must free
* these objects before we attempt to allocate the new dnode.
*/
- if (drro->drr_dn_slots > 1) {
+ if (dn_slots > 1) {
for (uint64_t slot = drro->drr_object + 1;
- slot < drro->drr_object + drro->drr_dn_slots;
+ slot < drro->drr_object + dn_slots;
slot++) {
dmu_object_info_t slot_doi;
@@ -2238,7 +2259,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
- drro->drr_dn_slots << DNODE_SHIFT, tx);
+ dn_slots << DNODE_SHIFT, tx);
} else if (drro->drr_type != doi.doi_type ||
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
@@ -2247,7 +2268,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
- drro->drr_dn_slots << DNODE_SHIFT, tx);
+ dn_slots << DNODE_SHIFT, tx);
}
if (err != 0) {
dmu_tx_commit(tx);

View File

@ -0,0 +1,42 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Georgy Yakovlev <ya@sysdump.net>
Date: Thu, 10 May 2018 23:00:18 -0700
Subject: [PATCH] Fix build with CONFIG_GCC_PLUGIN_RANDSTRUCT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
fs/zfs/zfs/metaslab.c:1055:2: error: positional initialization of field
in struct declared with designated_init attribute
[-Werror=designated-init]
metaslab_rt_remove,
Signed-off-by: Georgy Yakovlev <ya@sysdump.net>
Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Closes: #7069
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/metaslab.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 5e413c06..ee24850d 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1049,11 +1049,11 @@ metaslab_rt_vacate(range_tree_t *rt, void *arg)
}
static range_tree_ops_t metaslab_rt_ops = {
- metaslab_rt_create,
- metaslab_rt_destroy,
- metaslab_rt_add,
- metaslab_rt_remove,
- metaslab_rt_vacate
+ .rtop_create = metaslab_rt_create,
+ .rtop_destroy = metaslab_rt_destroy,
+ .rtop_add = metaslab_rt_add,
+ .rtop_remove = metaslab_rt_remove,
+ .rtop_vacate = metaslab_rt_vacate
};
/*

View File

@ -0,0 +1,35 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Chris Siebenmann <cks.github@cs.toronto.edu>
Date: Wed, 5 Sep 2018 01:26:56 -0400
Subject: [PATCH] Correctly handle errors from kern_path
As a regular kernel function, kern_path() returns errors as negative
errnos, such as -ELOOP. zfsctl_snapdir_vget() must convert these into
the positive errnos used throughout the ZFS code when it returns them
to other ZFS functions so that the ZFS code properly sees them as
errors.
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Chris Siebenmann <cks.git01@cs.toronto.edu>
Closes #7764
Closes #7864
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/zfs/zfs_ctldir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c
index 25edea78..0ab5b4f0 100644
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -1180,7 +1180,7 @@ zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
goto out;
/* Trigger automount */
- error = kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
if (error)
goto out;

View File

@ -0,0 +1,56 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Wed, 5 Sep 2018 10:37:32 -0700
Subject: [PATCH] Tag zfs-0.7.10
META file and changelog updated.
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
META | 2 +-
rpm/generic/zfs-kmod.spec.in | 3 +++
rpm/generic/zfs.spec.in | 3 +++
3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/META b/META
index fbada64e..89525ac8 100644
--- a/META
+++ b/META
@@ -1,7 +1,7 @@
Meta: 1
Name: zfs
Branch: 1.0
-Version: 0.7.9
+Version: 0.7.10
Release: 1
Release-Tags: relext
License: CDDL
diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in
index a3678681..8306ea76 100644
--- a/rpm/generic/zfs-kmod.spec.in
+++ b/rpm/generic/zfs-kmod.spec.in
@@ -191,6 +191,9 @@ chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/*/*/*
rm -rf $RPM_BUILD_ROOT
%changelog
+* Wed Sep 05 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.10-1
+- Released 0.7.10-1, detailed release notes are available at:
+- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.10
* Tue May 08 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.9-1
- Released 0.7.9-1, detailed release notes are available at:
- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.9
diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in
index 22565725..76953aec 100644
--- a/rpm/generic/zfs.spec.in
+++ b/rpm/generic/zfs.spec.in
@@ -371,6 +371,9 @@ systemctl --system daemon-reload >/dev/null || true
%endif
%changelog
+* Wed Sep 05 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.10-1
+- Released 0.7.10-1, detailed release notes are available at:
+- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.10
* Tue May 08 2018 Tony Hutter <hutter2@llnl.gov> - 0.7.9-1
- Released 0.7.9-1, detailed release notes are available at:
- https://github.com/zfsonlinux/zfs/releases/tag/zfs-0.7.9

View File

@ -1,5 +1,34 @@
0001-remove-DKMS-modules-and-dracut-build.patch 0001-remove-DKMS-modules-and-dracut-build.patch
0002-import-with-d-dev-disk-by-id-in-scan-service.patch 0002-import-with-d-dev-disk-by-id-in-scan-service.patch
0003-always-load-ZFS-module-on-boot.patch 0003-always-load-ZFS-module-on-boot.patch
0004-Fix-zpl_mount-deadlock.patch 0004-Fix-deadlock-between-zfs-umount-snapentry_expire.patch
0005-Fix-deadlock-between-zfs-umount-snapentry_expire.patch 0005-zv_suspend_lock-in-zvol_open-zvol_release.patch
0006-Linux-4.18-compat-inode-timespec-timespec64.patch
0007-Linux-compat-4.18-check_disk_size_change.patch
0008-OpenZFS-8997-ztest-assertion-failure-in-zil_lwb_writ.patch
0009-Fix-divide-by-zero-in-mmp_delay_update.patch
0010-Fix-ENOSPC-in-Handle-zap_add-failures-in.patch
0011-Trim-new-line-from-zfs_vdev_scheduler.patch
0012-module-param-callbacks-check-for-initialized-spa.patch
0013-Support-Debian-DKMS-builds.patch
0014-zpool-reopen-should-detect-expanded-devices.patch
0015-Add-pool-state-proc-entry-SUSPENDED-pools.patch
0016-Linux-4.14-compat-blk_queue_stackable.patch
0017-Default-ashift-for-Amazon-EC2-NVMe-devices.patch
0018-Fix-kernel-unaligned-access-on-sparc64.patch
0019-Fix-zpl_mount-deadlock.patch
0020-OpenZFS-8906-uts-illumos-rootfs-should-support-salte.patch
0021-Fix-zfs-incremental-send-remove-o-properties.patch
0022-Allow-inherited-properties-in-zfs_check_settable.patch
0023-Fix-arcstat.py-handling-of-unsupported-options.patch
0024-Don-t-modify-argv-in-user-tools.patch
0025-Add-missing-zfs-dracut-RPM-dependencies.patch
0026-Add-libaio-devel-BuildRequires.patch
0027-Fix-libaio-devel-requirement-for-Debian-based-distri.patch
0028-Fedora-28-Fix-misc-bounds-check-compiler-warnings.patch
0029-Fix-problems-receiving-reallocated-dnodes.patch
0030-Fix-object-reclaim-when-using-large-dnodes.patch
0031-Fix-zfs-recv-of-non-large_dnode-send-streams.patch
0032-Fix-build-with-CONFIG_GCC_PLUGIN_RANDSTRUCT.patch
0033-Correctly-handle-errors-from-kern_path.patch
0034-Tag-zfs-0.7.10.patch