mirror of
https://git.proxmox.com/git/mirror_zfs.git
synced 2025-02-10 09:21:09 +03:00
Fix 'zpool import' detection issues
This patch addresses multiple 'zpool import' block device indentification problems which are most likely to occur on a system configured to use blkid, by_vdev paths, multipath and failover. The symptom most commonly observed is the import uses different path names to import the pool than would normally be expected. * When using blkid to identify vdevs the listed devices may be added to the cache in any order. In order to apply the preferred search order heuristic a zfs_path_order() function was added to calculate the order given full path names. * Since it's possible to have multiple block devices with different vdev guids which refer to the same ZPOOL_CONFIG_PATH the slice cache must be indexed by guid and name. By avoiding collisions the preferred ordering can be maintaining even when multiple block devices claim the same ZPOOL_CONFIG_PATH. The preferred sorting by partition was never benefitial for a Linux system and was removed as part of this change. * When adding entries to the blkid cache avl_find/avl_insert are used instead of avl_add because collisions are possible and must be handled gracefully. * For pools using multipath devices there are, at a minimum, three devices where a vdev label may be read. They are the dm-* device and each underlying /dev/sd* device. Due to the way the block cache is implemented each of these devices may have a different cached copy of the vdev label. This can result in "ghost pools" which appear to persist even after a 'zpool labelclear' has been done to the dm-* device. In order to prevent this the vdev label is read with O_DIRECT in order to bypass any caching to get the on-disk version. * When opening a block device verify that vdev guid read from the disk matches the expected vdev guid. This allows for bad labels to be filtered out. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #5359
This commit is contained in:
parent
34328f3cf8
commit
83bf769d50
@ -64,6 +64,10 @@ extern "C" {
|
|||||||
*/
|
*/
|
||||||
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */
|
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */
|
||||||
|
|
||||||
|
#define IMPORT_ORDER_PREFERRED_1 1
|
||||||
|
#define IMPORT_ORDER_PREFERRED_2 2
|
||||||
|
#define IMPORT_ORDER_SCAN_OFFSET 10
|
||||||
|
#define IMPORT_ORDER_DEFAULT 100
|
||||||
#define DEFAULT_IMPORT_PATH_SIZE 9
|
#define DEFAULT_IMPORT_PATH_SIZE 9
|
||||||
extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE];
|
extern char *zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE];
|
||||||
|
|
||||||
@ -728,6 +732,7 @@ extern boolean_t zfs_bookmark_exists(const char *path);
|
|||||||
extern int zfs_append_partition(char *path, size_t max_len);
|
extern int zfs_append_partition(char *path, size_t max_len);
|
||||||
extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen);
|
extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen);
|
||||||
extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk);
|
extern int zfs_strcmp_pathname(char *name, char *cmp_name, int wholedisk);
|
||||||
|
extern int zfs_path_order(char *path, int *order);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Mount support functions.
|
* Mount support functions.
|
||||||
|
@ -1313,6 +1313,7 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
|
|||||||
vdev_label_t *label;
|
vdev_label_t *label;
|
||||||
nvlist_t *expected_config = NULL;
|
nvlist_t *expected_config = NULL;
|
||||||
uint64_t expected_guid = 0, size;
|
uint64_t expected_guid = 0, size;
|
||||||
|
int error;
|
||||||
|
|
||||||
*config = NULL;
|
*config = NULL;
|
||||||
|
|
||||||
@ -1320,7 +1321,8 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels)
|
|||||||
return (0);
|
return (0);
|
||||||
size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
|
size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);
|
||||||
|
|
||||||
if ((label = malloc(sizeof (vdev_label_t))) == NULL)
|
error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label));
|
||||||
|
if (error)
|
||||||
return (-1);
|
return (-1);
|
||||||
|
|
||||||
for (l = 0; l < VDEV_LABELS; l++) {
|
for (l = 0; l < VDEV_LABELS; l++) {
|
||||||
@ -1378,6 +1380,7 @@ typedef struct rdsk_node {
|
|||||||
char *rn_name; /* Full path to device */
|
char *rn_name; /* Full path to device */
|
||||||
int rn_order; /* Preferred order (low to high) */
|
int rn_order; /* Preferred order (low to high) */
|
||||||
int rn_num_labels; /* Number of valid labels */
|
int rn_num_labels; /* Number of valid labels */
|
||||||
|
uint64_t rn_vdev_guid; /* Expected vdev guid when set */
|
||||||
libzfs_handle_t *rn_hdl;
|
libzfs_handle_t *rn_hdl;
|
||||||
nvlist_t *rn_config; /* Label config */
|
nvlist_t *rn_config; /* Label config */
|
||||||
avl_tree_t *rn_avl;
|
avl_tree_t *rn_avl;
|
||||||
@ -1386,39 +1389,29 @@ typedef struct rdsk_node {
|
|||||||
boolean_t rn_labelpaths;
|
boolean_t rn_labelpaths;
|
||||||
} rdsk_node_t;
|
} rdsk_node_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sorted by vdev guid and full path to allow for multiple entries with
|
||||||
|
* the same full path name. This is required because it's possible to
|
||||||
|
* have multiple block devices with labels that refer to the same
|
||||||
|
* ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
|
||||||
|
* entries need to be added to the cache. Scenarios where this can occur
|
||||||
|
* include overwritten pool labels, devices which are visible from multiple
|
||||||
|
* hosts and multipath devices.
|
||||||
|
*/
|
||||||
static int
|
static int
|
||||||
slice_cache_compare(const void *arg1, const void *arg2)
|
slice_cache_compare(const void *arg1, const void *arg2)
|
||||||
{
|
{
|
||||||
const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
|
const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
|
||||||
const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
|
const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
|
||||||
char *nm1slice, *nm2slice;
|
uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
|
||||||
|
uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
|
||||||
int rv;
|
int rv;
|
||||||
|
|
||||||
/*
|
rv = AVL_CMP(guid1, guid2);
|
||||||
* partitions one and three (slices zero and two) are the most
|
if (rv)
|
||||||
* likely to provide results, so put those first
|
return (rv);
|
||||||
*/
|
|
||||||
nm1slice = strstr(nm1, "part1");
|
|
||||||
nm2slice = strstr(nm2, "part1");
|
|
||||||
if (nm1slice && !nm2slice) {
|
|
||||||
return (-1);
|
|
||||||
}
|
|
||||||
if (!nm1slice && nm2slice) {
|
|
||||||
return (1);
|
|
||||||
}
|
|
||||||
nm1slice = strstr(nm1, "part3");
|
|
||||||
nm2slice = strstr(nm2, "part3");
|
|
||||||
if (nm1slice && !nm2slice) {
|
|
||||||
return (-1);
|
|
||||||
}
|
|
||||||
if (!nm1slice && nm2slice) {
|
|
||||||
return (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
rv = strcmp(nm1, nm2);
|
return (AVL_ISIGN(strcmp(nm1, nm2)));
|
||||||
if (rv == 0)
|
|
||||||
return (0);
|
|
||||||
return (rv > 0 ? 1 : -1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static boolean_t
|
static boolean_t
|
||||||
@ -1506,6 +1499,7 @@ zpool_open_func(void *arg)
|
|||||||
struct stat64 statbuf;
|
struct stat64 statbuf;
|
||||||
nvlist_t *config;
|
nvlist_t *config;
|
||||||
char *bname, *dupname;
|
char *bname, *dupname;
|
||||||
|
uint64_t vdev_guid = 0;
|
||||||
int error;
|
int error;
|
||||||
int num_labels;
|
int num_labels;
|
||||||
int fd;
|
int fd;
|
||||||
@ -1531,19 +1525,28 @@ zpool_open_func(void *arg)
|
|||||||
(!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
|
(!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode)))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if ((fd = open(rn->rn_name, O_RDONLY)) < 0)
|
/*
|
||||||
|
* Preferentially open using O_DIRECT to bypass the block device
|
||||||
|
* cache which may be stale for multipath devices. An EINVAL errno
|
||||||
|
* indicates O_DIRECT is unsupported so fallback to just O_RDONLY.
|
||||||
|
*/
|
||||||
|
fd = open(rn->rn_name, O_RDONLY | O_DIRECT);
|
||||||
|
if ((fd < 0) && (errno == EINVAL))
|
||||||
|
fd = open(rn->rn_name, O_RDONLY);
|
||||||
|
|
||||||
|
if (fd < 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This file is too small to hold a zpool
|
* This file is too small to hold a zpool
|
||||||
*/
|
*/
|
||||||
if (S_ISREG(statbuf.st_mode) &&
|
if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) {
|
||||||
statbuf.st_size < SPA_MINDEVSIZE) {
|
|
||||||
(void) close(fd);
|
(void) close(fd);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((zpool_read_label(fd, &config, &num_labels)) != 0) {
|
error = zpool_read_label(fd, &config, &num_labels);
|
||||||
|
if (error != 0) {
|
||||||
(void) close(fd);
|
(void) close(fd);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -1554,6 +1557,18 @@ zpool_open_func(void *arg)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check that the vdev is for the expected guid. Additional entries
|
||||||
|
* are speculatively added based on the paths stored in the labels.
|
||||||
|
* Entries with valid paths but incorrect guids must be removed.
|
||||||
|
*/
|
||||||
|
error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
|
||||||
|
if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) {
|
||||||
|
(void) close(fd);
|
||||||
|
nvlist_free(config);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
(void) close(fd);
|
(void) close(fd);
|
||||||
|
|
||||||
rn->rn_config = config;
|
rn->rn_config = config;
|
||||||
@ -1580,9 +1595,10 @@ zpool_open_func(void *arg)
|
|||||||
if (path != NULL) {
|
if (path != NULL) {
|
||||||
slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
|
slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
|
||||||
slice->rn_name = zfs_strdup(hdl, path);
|
slice->rn_name = zfs_strdup(hdl, path);
|
||||||
|
slice->rn_vdev_guid = vdev_guid;
|
||||||
slice->rn_avl = rn->rn_avl;
|
slice->rn_avl = rn->rn_avl;
|
||||||
slice->rn_hdl = hdl;
|
slice->rn_hdl = hdl;
|
||||||
slice->rn_order = 1;
|
slice->rn_order = IMPORT_ORDER_PREFERRED_1;
|
||||||
slice->rn_labelpaths = B_FALSE;
|
slice->rn_labelpaths = B_FALSE;
|
||||||
mutex_enter(rn->rn_lock);
|
mutex_enter(rn->rn_lock);
|
||||||
if (avl_find(rn->rn_avl, slice, &where)) {
|
if (avl_find(rn->rn_avl, slice, &where)) {
|
||||||
@ -1605,9 +1621,10 @@ zpool_open_func(void *arg)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slice->rn_vdev_guid = vdev_guid;
|
||||||
slice->rn_avl = rn->rn_avl;
|
slice->rn_avl = rn->rn_avl;
|
||||||
slice->rn_hdl = hdl;
|
slice->rn_hdl = hdl;
|
||||||
slice->rn_order = 2;
|
slice->rn_order = IMPORT_ORDER_PREFERRED_2;
|
||||||
slice->rn_labelpaths = B_FALSE;
|
slice->rn_labelpaths = B_FALSE;
|
||||||
mutex_enter(rn->rn_lock);
|
mutex_enter(rn->rn_lock);
|
||||||
if (avl_find(rn->rn_avl, slice, &where)) {
|
if (avl_find(rn->rn_avl, slice, &where)) {
|
||||||
@ -1709,10 +1726,11 @@ zpool_find_import_scan(libzfs_handle_t *hdl, kmutex_t *lock,
|
|||||||
free(slice);
|
free(slice);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
slice->rn_vdev_guid = 0;
|
||||||
slice->rn_lock = lock;
|
slice->rn_lock = lock;
|
||||||
slice->rn_avl = cache;
|
slice->rn_avl = cache;
|
||||||
slice->rn_hdl = hdl;
|
slice->rn_hdl = hdl;
|
||||||
slice->rn_order = i+1;
|
slice->rn_order = i + IMPORT_ORDER_SCAN_OFFSET;
|
||||||
slice->rn_labelpaths = B_FALSE;
|
slice->rn_labelpaths = B_FALSE;
|
||||||
mutex_enter(lock);
|
mutex_enter(lock);
|
||||||
avl_add(cache, slice);
|
avl_add(cache, slice);
|
||||||
@ -1747,6 +1765,7 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock,
|
|||||||
blkid_cache cache;
|
blkid_cache cache;
|
||||||
blkid_dev_iterate iter;
|
blkid_dev_iterate iter;
|
||||||
blkid_dev dev;
|
blkid_dev dev;
|
||||||
|
avl_index_t where;
|
||||||
int error;
|
int error;
|
||||||
|
|
||||||
*slice_cache = NULL;
|
*slice_cache = NULL;
|
||||||
@ -1781,13 +1800,25 @@ zpool_find_import_blkid(libzfs_handle_t *hdl, kmutex_t *lock,
|
|||||||
while (blkid_dev_next(iter, &dev) == 0) {
|
while (blkid_dev_next(iter, &dev) == 0) {
|
||||||
slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
|
slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
|
||||||
slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
|
slice->rn_name = zfs_strdup(hdl, blkid_dev_devname(dev));
|
||||||
|
slice->rn_vdev_guid = 0;
|
||||||
slice->rn_lock = lock;
|
slice->rn_lock = lock;
|
||||||
slice->rn_avl = *slice_cache;
|
slice->rn_avl = *slice_cache;
|
||||||
slice->rn_hdl = hdl;
|
slice->rn_hdl = hdl;
|
||||||
slice->rn_order = 100;
|
|
||||||
slice->rn_labelpaths = B_TRUE;
|
slice->rn_labelpaths = B_TRUE;
|
||||||
|
|
||||||
|
error = zfs_path_order(slice->rn_name, &slice->rn_order);
|
||||||
|
if (error == 0)
|
||||||
|
slice->rn_order += IMPORT_ORDER_SCAN_OFFSET;
|
||||||
|
else
|
||||||
|
slice->rn_order = IMPORT_ORDER_DEFAULT;
|
||||||
|
|
||||||
mutex_enter(lock);
|
mutex_enter(lock);
|
||||||
avl_add(*slice_cache, slice);
|
if (avl_find(*slice_cache, slice, &where)) {
|
||||||
|
free(slice->rn_name);
|
||||||
|
free(slice);
|
||||||
|
} else {
|
||||||
|
avl_insert(*slice_cache, slice, where);
|
||||||
|
}
|
||||||
mutex_exit(lock);
|
mutex_exit(lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1136,6 +1136,45 @@ zfs_strcmp_pathname(char *name, char *cmp, int wholedisk)
|
|||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Given a full path to a device determine if that device appears in the
|
||||||
|
* import search path. If it does return the first match and store the
|
||||||
|
* index in the passed 'order' variable, otherwise return an error.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
zfs_path_order(char *name, int *order)
|
||||||
|
{
|
||||||
|
int i = 0, error = ENOENT;
|
||||||
|
char *dir, *env, *envdup;
|
||||||
|
|
||||||
|
env = getenv("ZPOOL_IMPORT_PATH");
|
||||||
|
if (env) {
|
||||||
|
envdup = strdup(env);
|
||||||
|
dir = strtok(envdup, ":");
|
||||||
|
while (dir) {
|
||||||
|
if (strncmp(name, dir, strlen(dir)) == 0) {
|
||||||
|
*order = i;
|
||||||
|
error = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dir = strtok(NULL, ":");
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
free(envdup);
|
||||||
|
} else {
|
||||||
|
for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) {
|
||||||
|
if (strncmp(name, zpool_default_import_path[i],
|
||||||
|
strlen(zpool_default_import_path[i])) == 0) {
|
||||||
|
*order = i;
|
||||||
|
error = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (error);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
|
* Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from
|
||||||
* an ioctl().
|
* an ioctl().
|
||||||
|
Loading…
Reference in New Issue
Block a user