Introduce zfs rewrite subcommand (#17246)

This allows rewriting the content of the specified file(s) as-is, without
modifications, but at a different location and with different compression,
checksum, dedup, copies and other parameter values.  It is faster than
read plus write, since it does not require copying data to user-space.
It is also faster for sync=always datasets, since without data
modification it does not require ZIL writing.  Also, since it is
protected by normal range locks, it can be done under any
other load.  Also, it does not affect the file's modification time or
other properties.

Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
This commit is contained in:
Alexander Motin
2025-05-12 13:22:17 -04:00
committed by GitHub
parent 9aae14a14a
commit 49fbdd4533
16 changed files with 636 additions and 5 deletions
+12
View File
@@ -305,6 +305,18 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
*(offset_t *)data = off;
return (0);
}
case ZFS_IOC_REWRITE: {
zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data;
if ((flag & FWRITE) == 0)
return (SET_ERROR(EBADF));
error = vn_lock(vp, LK_SHARED);
if (error)
return (error);
error = zfs_rewrite(VTOZ(vp), args->off, args->len,
args->flags, args->arg);
VOP_UNLOCK(vp);
return (error);
}
}
return (SET_ERROR(ENOTTY));
}
+23
View File
@@ -985,6 +985,27 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
return (err);
}
/*
 * ZFS_IOC_REWRITE ioctl handler: copy the rewrite request in from user
 * space and pass it to zfs_rewrite() for the znode backing this file.
 *
 * Returns 0 on success or a negative errno: -EFAULT if the request
 * cannot be copied in, -EBADF if the file is not open for writing,
 * otherwise the negated result of zfs_rewrite().
 */
static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
	zfs_rewrite_args_t req;
	fstrans_cookie_t mark;
	int rc;

	if (copy_from_user(&req, arg, sizeof (req)) != 0)
		return (-EFAULT);

	/* Only descriptors opened for writing may request a rewrite. */
	if (unlikely((filp->f_mode & FMODE_WRITE) == 0))
		return (-EBADF);

	mark = spl_fstrans_mark();
	rc = -zfs_rewrite(ITOZ(file_inode(filp)), req.off, req.len,
	    req.flags, req.arg);
	spl_fstrans_unmark(mark);

	return (rc);
}
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
@@ -1003,6 +1024,8 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return (zpl_ioctl_getdosflags(filp, (void *)arg));
case ZFS_IOC_SETDOSFLAGS:
return (zpl_ioctl_setdosflags(filp, (void *)arg));
case ZFS_IOC_REWRITE:
return (zpl_ioctl_rewrite(filp, (void *)arg));
default:
return (-ENOTTY);
}
+137
View File
@@ -1050,6 +1050,143 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
return (0);
}
/*
 * Rewrite a range of file as-is without modification.
 *
 * Every data block in the range is held and marked dirty in a
 * transaction, without changing its contents.  Holes are skipped.
 *
 *	IN:	zp	- znode of file to be rewritten.
 *		off	- Offset of the range to rewrite.
 *		len	- Length of the range to rewrite.  Zero, or a
 *			  length running past the end of file, means
 *			  "up to the end of file".
 *		flags	- Rewrite flags.  None are defined yet; must be 0.
 *		arg	- flags-specific argument; must be 0.
 *
 *	RETURN:	0 if success (including when off is at or past EOF)
 *		EINVAL if flags or arg is non-zero
 *		EROFS if the filesystem is read-only
 *		EDQUOT if the owner, group or project is over block quota
 *		EINTR if a signal is pending after a chunk was processed
 *		other error code if failure
 */
int
zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
    uint64_t arg)
{
	int error;

	if (flags != 0 || arg != 0)
		return (SET_ERROR(EINVAL));

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/* Nothing to do for a range that starts at or beyond EOF. */
	if (off >= zp->z_size) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}
	/* Clamp the range to the end of file. */
	if (len == 0 || len > zp->z_size - off)
		len = zp->z_size - off;

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp, off, off + len - 1))
		zn_flush_cached_data(zp, B_TRUE);

	zfs_locked_range_t *lr;
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);

	/*
	 * off/len track the remaining range, noff is the end of the current
	 * data extent (noff == off means a new extent must be looked up),
	 * nr/nw accumulate the bytes held and the bytes newly dirtied for
	 * the dataset kstats.
	 */
	uint64_t n, noff = off, nr = 0, nw = 0;
	while (len > 0) {
		/*
		 * Rewrite only actual data, skipping any holes.  This might
		 * be inaccurate for dirty files, but we don't really care.
		 */
		if (noff == off) {
			/* Find next data in the file. */
			error = dnode_next_offset(dn, 0, &noff, 1, 1, 0);
			if (error || noff >= off + len) {
				if (error == ESRCH)	/* No more data. */
					error = 0;
				break;
			}
			ASSERT3U(noff, >=, off);
			len -= noff - off;
			off = noff;

			/* Find where the data end. */
			error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff,
			    1, 1, 0);
			/* No hole found: treat the rest of the range as data. */
			if (error != 0)
				noff = off + len;
		}
		ASSERT3U(noff, >, off);

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		/*
		 * Process at most the rest of the current data extent, and
		 * no more than DMU_MAX_ACCESS / 2 less the offset within the
		 * block, per transaction.
		 */
		n = MIN(MIN(len, noff - off),
		    DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz));

		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_write_by_dnode(tx, dn, off, n);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			/* Assignment failed, so the tx is aborted. */
			dmu_tx_abort(tx);
			break;
		}

		/* Mark all dbufs within range as dirty to trigger rewrite. */
		dmu_buf_t **dbp;
		int numbufs;
		error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG,
		    &numbufs, &dbp, DMU_READ_PREFETCH);
		if (error) {
			/*
			 * The tx is already assigned here, so it has to be
			 * committed; dmu_tx_abort() is only valid for a tx
			 * that has not been assigned.
			 */
			dmu_tx_commit(tx);
			break;
		}
		for (int i = 0; i < numbufs; i++) {
			nr += dbp[i]->db_size;
			/* Already dirty in this tx: not counted as written. */
			if (dmu_buf_is_dirty(dbp[i], tx))
				continue;
			nw += dbp[i]->db_size;
			dmu_buf_will_dirty(dbp[i], tx);
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);

		dmu_tx_commit(tx);

		len -= n;
		off += n;

		/* Stop between chunks if a signal is pending. */
		if (issig()) {
			error = SET_ERROR(EINTR);
			break;
		}
	}

	DB_DNODE_EXIT(db);

	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw);

	zfs_rangelock_exit(lr);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}
int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{