Use 'noop' IO Scheduler

Initial testing has shown that the right IO scheduler to use under Linux
is noop.  This strikes the ideal balance: the zfs elevator does all
request ordering and prioritization, while the Linux elevator does the
maximum front/back merging allowed by the physical device.  This yields
the largest possible requests for the device with the lowest total
overhead.

While 'noop' should be right for your system, you can choose a different
IO scheduler with the 'zfs_vdev_scheduler' module option.  You may set
this value to any of the standard Linux schedulers: noop, cfq, deadline,
or anticipatory.  In addition, if you set it to 'none', zfs will not
attempt to change the IO scheduler for the block device.
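
For example, a sketch of how the option might be used; the 'deadline'
value is illustrative, and a value written at runtime should only take
effect for vdevs opened afterwards:

    # Set the scheduler when the module is loaded.
    $ sudo modprobe zfs zfs_vdev_scheduler=deadline

    # The parameter is registered with mode 0644, so it can also be
    # read back (or changed) through sysfs after the module is loaded.
    $ cat /sys/module/zfs/parameters/zfs_vdev_scheduler
    deadline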
Brian Behlendorf  2011-02-07 13:54:59 -08:00
commit 6839eed23e (parent 4db77a74a6)
2 changed files with 49 additions and 0 deletions

include/sys/vdev_disk.h

@@ -81,6 +81,9 @@ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 # define vdev_bdev_block_size(bdev)	bdev_hardsect_size(bdev)
 #endif
 
+/* Default Linux IO Scheduler */
+#define	VDEV_SCHEDULER			"noop"
+
 #endif /* _KERNEL */
 
 #ifdef __cplusplus

module/zfs/vdev_disk.c

@@ -33,6 +33,8 @@
 #include <sys/zio.h>
 #include <sys/sunldi.h>
 
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+
 /*
  * Virtual device vector for disks.
  */
@@ -102,6 +104,43 @@ vdev_disk_error(zio_t *zio)
 #endif
 }
 
+/*
+ * Use the Linux 'noop' elevator for zfs managed block devices.  This
+ * strikes the ideal balance: the zfs elevator does all request
+ * ordering and prioritization, while the Linux elevator does the
+ * maximum front/back merging allowed by the physical device.  This
+ * yields the largest possible requests for the device with the
+ * lowest total overhead.
+ *
+ * Unfortunately we cannot directly call the elevator_switch() function
+ * because it is not exported from the block layer.  This means we have
+ * to use the sysfs interface and a user space upcall.  Pools will be
+ * automatically imported on module load so we must do this at device
+ * open time from the kernel.
+ */
+static int
+vdev_elevator_switch(vdev_t *v, char *elevator, char *device)
+{
+	char sh_path[] = "/bin/sh";
+	char sh_cmd[128];
+	/* call_usermodehelper() requires a NULL terminated argv */
+	char *argv[] = { sh_path, "-c", sh_cmd, NULL };
+	char *envp[] = { NULL };
+	int error;
+
+	/* Leave the current scheduler untouched when 'none' is requested */
+	if (!strncmp(elevator, "none", 4) && (strlen(elevator) == 4))
+		return (0);
+
+	sprintf(sh_cmd, "%s \"%s\" >/sys/block/%s/queue/scheduler",
+	    "/bin/echo", elevator, device);
+
+	error = call_usermodehelper(sh_path, argv, envp, 1);
+	if (error)
+		printk("ZFS: Unable to set \"%s\" scheduler for %s (%s): %d\n",
+		    elevator, v->vdev_path, device, error);
+
+	return (error);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 {
@@ -167,6 +206,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
 	/* Based on the minimum sector size set the block size */
 	*ashift = highbit(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
 
+	/* Try to set the io scheduler elevator algorithm */
+	(void) vdev_elevator_switch(v, zfs_vdev_scheduler,
+	    bdev->bd_disk->disk_name);
+
 	return 0;
 }
@@ -702,3 +745,6 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 
 	return 0;
 }
+
+module_param(zfs_vdev_scheduler, charp, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "IO Scheduler (noop)");
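
For reference, the user space upcall above amounts to the following
shell command; a minimal sketch, with sda standing in for whatever
block device backs the vdev:

    # What vdev_elevator_switch() asks /bin/sh to run for /dev/sda:
    /bin/echo "noop" >/sys/block/sda/queue/scheduler

    # Reading the file back lists the available schedulers, with the
    # active one shown in brackets, e.g.:
    $ cat /sys/block/sda/queue/scheduler
    [noop] anticipatory deadline cfq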