2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
2017-02-03 01:13:41 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
|
2018-08-20 19:52:37 +03:00
|
|
|
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
|
2015-07-30 17:24:36 +03:00
|
|
|
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
|
2017-02-03 01:13:41 +03:00
|
|
|
* Copyright 2017 Nexenta Systems, Inc.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
/* Portions Copyright 2007 Jeremy Teo */
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Portions Copyright 2010 Robert Milkowski */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-26 22:45:02 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/sysmacros.h>
|
|
|
|
#include <sys/vfs.h>
|
|
|
|
#include <sys/file.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/kmem.h>
|
|
|
|
#include <sys/taskq.h>
|
|
|
|
#include <sys/uio.h>
|
|
|
|
#include <sys/vmsystm.h>
|
|
|
|
#include <sys/atomic.h>
|
|
|
|
#include <sys/pathname.h>
|
|
|
|
#include <sys/cmn_err.h>
|
|
|
|
#include <sys/errno.h>
|
|
|
|
#include <sys/zfs_dir.h>
|
|
|
|
#include <sys/zfs_acl.h>
|
|
|
|
#include <sys/zfs_ioctl.h>
|
|
|
|
#include <sys/fs/zfs.h>
|
|
|
|
#include <sys/dmu.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/dmu_objset.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/txg.h>
|
|
|
|
#include <sys/dbuf.h>
|
|
|
|
#include <sys/zap.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/sa.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/policy.h>
|
|
|
|
#include <sys/sunddi.h>
|
2008-12-03 23:09:06 +03:00
|
|
|
#include <sys/sid.h>
|
2011-11-11 11:15:53 +04:00
|
|
|
#include <sys/zfs_ctldir.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zfs_fuid.h>
|
2019-12-11 23:12:08 +03:00
|
|
|
#include <sys/zfs_quota.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/zfs_sa.h>
|
2010-12-17 22:18:08 +03:00
|
|
|
#include <sys/zfs_vnops.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
#include <sys/zfs_rlock.h>
|
2010-05-29 00:45:14 +04:00
|
|
|
#include <sys/cred.h>
|
2011-06-25 18:06:37 +04:00
|
|
|
#include <sys/zpl.h>
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently outstanding ZIL
blocks to complete. The blocks written for each "writer thread" are
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transactions in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularities.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new algorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
outstanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs from that in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
#include <sys/zil.h>
|
2018-02-14 01:54:54 +03:00
|
|
|
#include <sys/sa_impl.h>
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Programming rules.
|
|
|
|
*
|
|
|
|
* Each vnode op performs some logical unit of work. To do this, the ZPL must
|
|
|
|
* properly lock its in-core state, create a DMU transaction, do the work,
|
|
|
|
* record this work in the intent log (ZIL), commit the DMU transaction,
|
|
|
|
* and wait for the intent log to commit if it is a synchronous operation.
|
|
|
|
* Moreover, the vnode ops must work in both normal and log replay context.
|
|
|
|
* The ordering of events is important to avoid deadlocks and references
|
|
|
|
* to freed memory. The example below illustrates the following Big Rules:
|
|
|
|
*
|
|
|
|
* (1) A check must be made in each zfs thread for a mounted file system.
|
2017-03-08 03:21:37 +03:00
|
|
|
* This is done avoiding races using ZFS_ENTER(zfsvfs).
|
|
|
|
* A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
|
2008-11-20 23:01:55 +03:00
|
|
|
* must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
|
|
|
|
* can return EIO from the calling function.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* (2) zrele() should always be the last thing except for zil_commit()
|
2008-11-20 23:01:55 +03:00
|
|
|
* (if necessary) and ZFS_EXIT(). This is for 3 reasons:
|
|
|
|
* First, if it's the last reference, the vnode/znode
|
|
|
|
* can be freed, so the zp may point to freed memory. Second, the last
|
|
|
|
* reference will call zfs_zinactive(), which may induce a lot of work --
|
|
|
|
* pushing cached pages (which acquires range locks) and syncing out
|
|
|
|
* cached atime changes. Third, zfs_zinactive() may require a new tx,
|
|
|
|
* which could deadlock the system if you were already holding one.
|
2019-12-11 22:53:57 +03:00
|
|
|
* If you must call zrele() within a tx then use zfs_zrele_async().
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* (3) All range locks must be grabbed before calling dmu_tx_assign(),
|
|
|
|
* as they can span dmu_tx_assign() calls.
|
|
|
|
*
|
2013-11-23 03:13:18 +04:00
|
|
|
* (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
|
|
|
|
* dmu_tx_assign(). This is critical because we don't want to block
|
|
|
|
* while holding locks.
|
|
|
|
*
|
|
|
|
* If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
|
|
|
|
* reduces lock contention and CPU usage when we must wait (note that if
|
|
|
|
* throughput is constrained by the storage, nearly every transaction
|
|
|
|
* must wait).
|
|
|
|
*
|
|
|
|
* Note, in particular, that if a lock is sometimes acquired before
|
|
|
|
* the tx assigns, and sometimes after (e.g. z_lock), then failing
|
|
|
|
* to use a non-blocking assign can deadlock the system. The scenario:
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Thread A has grabbed a lock before calling dmu_tx_assign().
|
|
|
|
* Thread B is in an already-assigned tx, and blocks for this lock.
|
|
|
|
* Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
|
|
|
|
* forever, because the previous txg can't quiesce until B's tx commits.
|
|
|
|
*
|
2017-03-08 03:21:37 +03:00
|
|
|
* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
* then drop all locks, call dmu_tx_wait(), and try again. On subsequent
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
* calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
* to indicate that this operation has already called dmu_tx_wait().
|
|
|
|
* This will ensure that we don't retry forever, waiting a short bit
|
|
|
|
* each time.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* (5) If the operation succeeded, generate the intent log entry for it
|
|
|
|
* before dropping locks. This ensures that the ordering of events
|
|
|
|
* in the intent log matches the order in which they actually occurred.
|
2013-06-11 21:12:34 +04:00
|
|
|
* During ZIL replay the zfs_log_* functions will update the sequence
|
2009-01-16 00:59:39 +03:00
|
|
|
* number to indicate the zil transaction has replayed.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* (6) At the end of each vnode op, the DMU tx must always commit,
|
|
|
|
* regardless of whether there were any errors.
|
|
|
|
*
|
2010-08-27 01:24:34 +04:00
|
|
|
* (7) After dropping all locks, invoke zil_commit(zilog, foid)
|
2008-11-20 23:01:55 +03:00
|
|
|
* to ensure that synchronous semantics are provided when necessary.
|
|
|
|
*
|
|
|
|
* In general, this is how things should be ordered in each vnode op:
|
|
|
|
*
|
2017-03-08 03:21:37 +03:00
|
|
|
* ZFS_ENTER(zfsvfs); // exit if unmounted
|
2008-11-20 23:01:55 +03:00
|
|
|
* top:
|
2011-02-08 22:16:06 +03:00
|
|
|
* zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
|
2008-11-20 23:01:55 +03:00
|
|
|
* rw_enter(...); // grab any other locks you need
|
|
|
|
* tx = dmu_tx_create(...); // get DMU tx
|
|
|
|
* dmu_tx_hold_*(); // hold each object you might modify
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
* error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
* if (error) {
|
|
|
|
* rw_exit(...); // drop locks
|
|
|
|
* zfs_dirent_unlock(dl); // unlock directory entry
|
2019-12-11 22:53:57 +03:00
|
|
|
* zrele(...); // release held znodes
|
2009-01-16 00:59:39 +03:00
|
|
|
* if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
* waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
* dmu_tx_wait(tx);
|
|
|
|
* dmu_tx_abort(tx);
|
|
|
|
* goto top;
|
|
|
|
* }
|
|
|
|
* dmu_tx_abort(tx); // abort DMU tx
|
2017-03-08 03:21:37 +03:00
|
|
|
* ZFS_EXIT(zfsvfs); // finished in zfs
|
2008-11-20 23:01:55 +03:00
|
|
|
* return (error); // really out of space
|
|
|
|
* }
|
|
|
|
* error = do_real_work(); // do whatever this VOP does
|
|
|
|
* if (error == 0)
|
|
|
|
* zfs_log_*(...); // on success, make ZIL entry
|
|
|
|
* dmu_tx_commit(tx); // commit DMU tx -- error or not
|
|
|
|
* rw_exit(...); // drop locks
|
|
|
|
* zfs_dirent_unlock(dl); // unlock directory entry
|
2019-12-11 22:53:57 +03:00
|
|
|
* zrele(...); // release held znodes
|
2010-08-27 01:24:34 +04:00
|
|
|
* zil_commit(zilog, foid); // synchronous when necessary
|
2017-03-08 03:21:37 +03:00
|
|
|
* ZFS_EXIT(zfsvfs); // finished in zfs
|
2008-11-20 23:01:55 +03:00
|
|
|
* return (error); // done, report error
|
|
|
|
*/
|
|
|
|
|
2011-03-08 22:04:51 +03:00
|
|
|
/*
|
|
|
|
* Virus scanning is unsupported. It would be possible to add a hook
|
|
|
|
* here to performance the required virus scan. This could be done
|
|
|
|
* entirely in the kernel or potentially as an update to invoke a
|
|
|
|
* scanning utility.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zfs_vscan(struct inode *ip, cred_t *cr, int async)
|
|
|
|
{
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
|
|
|
zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
|
|
|
|
{
|
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2011-03-08 22:04:51 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2011-03-08 22:04:51 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
|
|
|
/* Honor ZFS_APPENDONLY file attribute */
|
|
|
|
if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
|
|
|
|
((flag & O_APPEND) == 0)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EPERM));
|
2011-03-08 22:04:51 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Virus scan eligible files on open */
|
2017-03-08 03:21:37 +03:00
|
|
|
if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
|
2011-03-08 22:04:51 +03:00
|
|
|
!(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
|
|
|
|
if (zfs_vscan(ip, cr, 0) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EACCES));
|
2011-03-08 22:04:51 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Keep a count of the synchronous opens in the znode */
|
|
|
|
if (flag & O_SYNC)
|
|
|
|
atomic_inc_32(&zp->z_sync_cnt);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2011-03-08 22:04:51 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
|
|
|
zfs_close(struct inode *ip, int flag, cred_t *cr)
|
|
|
|
{
|
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2011-03-08 22:04:51 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2011-03-08 22:04:51 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2013-12-17 22:18:25 +04:00
|
|
|
/* Decrement the synchronous opens in the znode */
|
2011-03-08 22:04:51 +03:00
|
|
|
if (flag & O_SYNC)
|
2013-12-17 22:18:25 +04:00
|
|
|
atomic_dec_32(&zp->z_sync_cnt);
|
2011-03-08 22:04:51 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
|
2011-03-08 22:04:51 +03:00
|
|
|
!(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
|
|
|
|
VERIFY(zfs_vscan(ip, cr, 1) == 0);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2012-12-12 04:58:44 +04:00
|
|
|
return (0);
|
2011-03-08 22:04:51 +03:00
|
|
|
}
|
|
|
|
|
2013-06-13 21:51:09 +04:00
|
|
|
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
|
2013-07-01 20:24:43 +04:00
|
|
|
/*
|
2013-06-13 21:51:09 +04:00
|
|
|
* Lseek support for finding holes (cmd == SEEK_HOLE) and
|
|
|
|
* data (cmd == SEEK_DATA). "off" is an in/out parameter.
|
2013-07-01 20:24:43 +04:00
|
|
|
*/
|
|
|
|
static int
|
2013-06-13 21:51:09 +04:00
|
|
|
zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
|
2013-07-01 20:24:43 +04:00
|
|
|
{
|
2013-06-13 21:51:09 +04:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2013-07-01 20:24:43 +04:00
|
|
|
uint64_t noff = (uint64_t)*off; /* new offset */
|
|
|
|
uint64_t file_sz;
|
|
|
|
int error;
|
|
|
|
boolean_t hole;
|
|
|
|
|
|
|
|
file_sz = zp->z_size;
|
|
|
|
if (noff >= file_sz) {
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENXIO));
|
2013-07-01 20:24:43 +04:00
|
|
|
}
|
|
|
|
|
2013-06-13 21:51:09 +04:00
|
|
|
if (cmd == SEEK_HOLE)
|
2013-07-01 20:24:43 +04:00
|
|
|
hole = B_TRUE;
|
|
|
|
else
|
|
|
|
hole = B_FALSE;
|
|
|
|
|
2013-06-13 21:51:09 +04:00
|
|
|
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
|
2013-07-01 20:24:43 +04:00
|
|
|
|
2014-09-17 19:25:10 +04:00
|
|
|
if (error == ESRCH)
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENXIO));
|
2014-09-17 19:25:10 +04:00
|
|
|
|
2017-04-24 19:38:31 +03:00
|
|
|
/* file was dirty, so fall back to using generic logic */
|
|
|
|
if (error == EBUSY) {
|
|
|
|
if (hole)
|
|
|
|
*off = file_sz;
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
2017-03-25 00:28:38 +03:00
|
|
|
|
2014-09-17 19:25:10 +04:00
|
|
|
/*
|
|
|
|
* We could find a hole that begins after the logical end-of-file,
|
|
|
|
* because dmu_offset_next() only works on whole blocks. If the
|
|
|
|
* EOF falls mid-block, then indicate that the "virtual hole"
|
|
|
|
* at the end of the file begins at the logical EOF, rather than
|
|
|
|
* at the end of the last block.
|
|
|
|
*/
|
|
|
|
if (noff > file_sz) {
|
|
|
|
ASSERT(hole);
|
|
|
|
noff = file_sz;
|
2013-07-01 20:24:43 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (noff < *off)
|
|
|
|
return (error);
|
|
|
|
*off = noff;
|
|
|
|
return (error);
|
|
|
|
}
|
2013-06-13 21:51:09 +04:00
|
|
|
|
|
|
|
int
|
|
|
|
zfs_holey(struct inode *ip, int cmd, loff_t *off)
|
|
|
|
{
|
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2013-06-13 21:51:09 +04:00
|
|
|
int error;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2013-06-13 21:51:09 +04:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
|
|
|
error = zfs_holey_common(ip, cmd, off);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-06-13 21:51:09 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif /* SEEK_HOLE && SEEK_DATA */
|
2013-07-01 20:24:43 +04:00
|
|
|
|
2011-02-03 21:34:05 +03:00
|
|
|
#if defined(_KERNEL)
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* When a file is memory mapped, we must keep the IO data synchronized
|
|
|
|
* between the DMU cache and the memory mapped pages. What this means:
|
|
|
|
*
|
|
|
|
* On Write: If we find a memory mapped page, we write to *both*
|
|
|
|
* the page and the dmu buffer.
|
|
|
|
*/
|
2009-02-18 23:51:31 +03:00
|
|
|
static void
|
2011-02-03 21:34:05 +03:00
|
|
|
update_pages(struct inode *ip, int64_t start, int len,
|
|
|
|
objset_t *os, uint64_t oid)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-03 21:34:05 +03:00
|
|
|
struct address_space *mp = ip->i_mapping;
|
|
|
|
struct page *pp;
|
|
|
|
uint64_t nbytes;
|
2009-02-18 23:51:31 +03:00
|
|
|
int64_t off;
|
2011-02-03 21:34:05 +03:00
|
|
|
void *pb;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-04-05 22:39:37 +03:00
|
|
|
off = start & (PAGE_SIZE-1);
|
|
|
|
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
|
|
|
|
nbytes = MIN(PAGE_SIZE - off, len);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-04-05 22:39:37 +03:00
|
|
|
pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
2011-02-03 21:34:05 +03:00
|
|
|
if (pp) {
|
|
|
|
if (mapping_writably_mapped(mp))
|
|
|
|
flush_dcache_page(pp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-02-03 21:34:05 +03:00
|
|
|
pb = kmap(pp);
|
|
|
|
(void) dmu_read(os, oid, start+off, nbytes, pb+off,
|
2009-07-03 02:44:48 +04:00
|
|
|
DMU_READ_PREFETCH);
|
2011-02-03 21:34:05 +03:00
|
|
|
kunmap(pp);
|
|
|
|
|
|
|
|
if (mapping_writably_mapped(mp))
|
|
|
|
flush_dcache_page(pp);
|
|
|
|
|
|
|
|
mark_page_accessed(pp);
|
|
|
|
SetPageUptodate(pp);
|
|
|
|
ClearPageError(pp);
|
|
|
|
unlock_page(pp);
|
2016-04-05 22:39:37 +03:00
|
|
|
put_page(pp);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2011-02-03 21:34:05 +03:00
|
|
|
|
2009-02-18 23:51:31 +03:00
|
|
|
len -= nbytes;
|
2008-11-20 23:01:55 +03:00
|
|
|
off = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When a file is memory mapped, we must keep the IO data synchronized
|
|
|
|
* between the DMU cache and the memory mapped pages. What this means:
|
|
|
|
*
|
|
|
|
* On Read: We "read" preferentially from memory mapped pages,
|
|
|
|
* else we default from the dmu buffer.
|
|
|
|
*
|
|
|
|
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
|
2013-06-11 21:12:34 +04:00
|
|
|
* the file is memory mapped.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
static int
|
2011-02-08 22:16:06 +03:00
|
|
|
mappedread(struct inode *ip, int nbytes, uio_t *uio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-03 21:34:05 +03:00
|
|
|
struct address_space *mp = ip->i_mapping;
|
|
|
|
struct page *pp;
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
int64_t start, off;
|
2011-02-03 21:34:05 +03:00
|
|
|
uint64_t bytes;
|
2008-11-20 23:01:55 +03:00
|
|
|
int len = nbytes;
|
|
|
|
int error = 0;
|
2011-02-03 21:34:05 +03:00
|
|
|
void *pb;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
start = uio->uio_loffset;
|
2016-04-05 22:39:37 +03:00
|
|
|
off = start & (PAGE_SIZE-1);
|
|
|
|
for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
|
|
|
|
bytes = MIN(PAGE_SIZE - off, len);
|
2011-02-03 21:34:05 +03:00
|
|
|
|
2016-04-05 22:39:37 +03:00
|
|
|
pp = find_lock_page(mp, start >> PAGE_SHIFT);
|
2011-02-03 21:34:05 +03:00
|
|
|
if (pp) {
|
|
|
|
ASSERT(PageUptodate(pp));
|
2018-03-28 20:19:22 +03:00
|
|
|
unlock_page(pp);
|
2011-02-03 21:34:05 +03:00
|
|
|
|
|
|
|
pb = kmap(pp);
|
|
|
|
error = uiomove(pb + off, bytes, UIO_READ, uio);
|
|
|
|
kunmap(pp);
|
|
|
|
|
|
|
|
if (mapping_writably_mapped(mp))
|
|
|
|
flush_dcache_page(pp);
|
|
|
|
|
|
|
|
mark_page_accessed(pp);
|
2016-04-05 22:39:37 +03:00
|
|
|
put_page(pp);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2015-06-17 00:06:27 +03:00
|
|
|
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
uio, bytes);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2011-02-03 21:34:05 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
len -= bytes;
|
|
|
|
off = 0;
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
2011-02-03 21:34:05 +03:00
|
|
|
#endif /* _KERNEL */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-05-04 02:09:28 +04:00
|
|
|
/*
 * Chunk size, in bytes, used by zfs_read(): each pass of its read loop
 * transfers at most the distance from the current offset to the next
 * multiple of this value.
 */
unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
|
2015-08-21 04:43:10 +03:00
|
|
|
/*
 * Defaults to DMU_MAX_DELETEBLKCNT.
 * NOTE(review): not referenced in this part of the file; presumably a
 * block-count threshold used by the delete path -- confirm at its users.
 */
unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Read bytes from specified file into supplied buffer.
|
|
|
|
*
|
2011-02-08 22:16:06 +03:00
|
|
|
* IN: ip - inode of file to be read from.
|
2008-11-20 23:01:55 +03:00
|
|
|
* uio - structure supplying read location, range info,
|
|
|
|
* and return buffer.
|
2019-11-21 20:32:57 +03:00
|
|
|
* ioflag - O_SYNC flags; used to provide FRSYNC semantics.
|
2011-02-03 21:34:05 +03:00
|
|
|
* O_DIRECT flag; used to bypass page cache.
|
2008-11-20 23:01:55 +03:00
|
|
|
* cr - credentials of caller.
|
|
|
|
*
|
|
|
|
* OUT: uio - updated offset and range, buffer filled.
|
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Side Effects:
|
2011-02-08 22:16:06 +03:00
|
|
|
* inode - atime updated if byte count > 0
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2018-08-20 19:52:37 +03:00
|
|
|
int error = 0;
|
2019-03-22 23:09:11 +03:00
|
|
|
boolean_t frsync = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->z_pflags & ZFS_AV_QUARANTINED) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EACCES));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Validate file offset
|
|
|
|
*/
|
|
|
|
if (uio->uio_loffset < (offset_t)0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fasttrack empty reads
|
|
|
|
*/
|
|
|
|
if (uio->uio_resid == 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2019-03-22 23:09:11 +03:00
|
|
|
#ifdef FRSYNC
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* If we're in FRSYNC mode, sync out this znode before reading it.
|
2017-08-11 18:57:54 +03:00
|
|
|
* Only do this for non-snapshots.
|
2019-03-22 23:09:11 +03:00
|
|
|
*
|
|
|
|
* Some platforms do not support FRSYNC and instead map it
|
2019-11-21 20:32:57 +03:00
|
|
|
* to O_SYNC, which results in unnecessary calls to zil_commit. We
|
2019-03-22 23:09:11 +03:00
|
|
|
* only honor FRSYNC requests on platforms which support it.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2019-03-22 23:09:11 +03:00
|
|
|
frsync = !!(ioflag & FRSYNC);
|
|
|
|
#endif
|
2017-08-11 18:57:54 +03:00
|
|
|
if (zfsvfs->z_log &&
|
2019-03-22 23:09:11 +03:00
|
|
|
(frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
|
2017-03-08 03:21:37 +03:00
|
|
|
zil_commit(zfsvfs->z_log, zp->z_id);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Lock the range against changes.
|
|
|
|
*/
|
2019-11-01 20:37:33 +03:00
|
|
|
zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
|
2018-08-20 19:52:37 +03:00
|
|
|
uio->uio_loffset, uio->uio_resid, RL_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are reading past end-of-file we can skip
|
|
|
|
* to the end; but we might still need to set atime.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if (uio->uio_loffset >= zp->z_size) {
|
2008-11-20 23:01:55 +03:00
|
|
|
error = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(uio->uio_loffset < zp->z_size);
|
2018-08-20 19:52:37 +03:00
|
|
|
ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
|
|
|
|
ssize_t start_resid = n;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
#ifdef HAVE_UIO_ZEROCOPY
|
2018-08-20 19:52:37 +03:00
|
|
|
xuio_t *xuio = NULL;
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((uio->uio_extflg == UIO_XUIO) &&
|
|
|
|
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
|
|
|
|
int nblk;
|
|
|
|
int blksz = zp->z_blksz;
|
|
|
|
uint64_t offset = uio->uio_loffset;
|
|
|
|
|
|
|
|
xuio = (xuio_t *)uio;
|
|
|
|
if ((ISP2(blksz))) {
|
|
|
|
nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
|
|
|
|
blksz)) / blksz;
|
|
|
|
} else {
|
|
|
|
ASSERT(offset + n <= blksz);
|
|
|
|
nblk = 1;
|
|
|
|
}
|
|
|
|
(void) dmu_xuio_init(xuio, nblk);
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (vn_has_cached_data(ip)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* For simplicity, we always allocate a full buffer
|
|
|
|
* even if we only expect to read a portion of a block.
|
|
|
|
*/
|
|
|
|
while (--nblk >= 0) {
|
|
|
|
(void) dmu_xuio_add(xuio,
|
|
|
|
dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
blksz), 0, blksz);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2011-02-08 22:16:06 +03:00
|
|
|
#endif /* HAVE_UIO_ZEROCOPY */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
while (n > 0) {
|
2018-08-20 19:52:37 +03:00
|
|
|
ssize_t nbytes = MIN(n, zfs_read_chunk_size -
|
2008-11-20 23:01:55 +03:00
|
|
|
P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
|
|
|
|
|
2015-06-17 00:06:27 +03:00
|
|
|
if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
|
2011-02-08 22:16:06 +03:00
|
|
|
error = mappedread(ip, nbytes, uio);
|
2015-06-17 00:06:27 +03:00
|
|
|
} else {
|
|
|
|
error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
uio, nbytes);
|
|
|
|
}
|
2011-02-03 21:34:05 +03:00
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
if (error) {
|
|
|
|
/* convert checksum errors into IO errors */
|
|
|
|
if (error == ECKSUM)
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EIO);
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
n -= nbytes;
|
|
|
|
}
|
2018-08-20 19:52:37 +03:00
|
|
|
|
|
|
|
int64_t nread = start_resid - n;
|
|
|
|
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
|
|
|
|
task_io_account_read(nread);
|
2008-11-20 23:01:55 +03:00
|
|
|
out:
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the bytes to a file.
|
|
|
|
*
|
2011-02-08 22:16:06 +03:00
|
|
|
* IN: ip - inode of file to be written to.
|
2008-11-20 23:01:55 +03:00
|
|
|
* uio - structure supplying write location, range info,
|
|
|
|
* and data buffer.
|
2019-11-21 20:32:57 +03:00
|
|
|
* ioflag - O_APPEND flag set if in append mode.
|
2011-02-03 21:34:05 +03:00
|
|
|
* O_DIRECT flag; used to bypass page cache.
|
2008-11-20 23:01:55 +03:00
|
|
|
* cr - credentials of caller.
|
|
|
|
*
|
|
|
|
* OUT: uio - updated offset and range.
|
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2011-02-08 22:16:06 +03:00
|
|
|
* ip - ctime|mtime updated if byte count > 0
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2018-08-20 19:52:37 +03:00
|
|
|
int error = 0;
|
|
|
|
ssize_t start_resid = uio->uio_resid;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Fasttrack empty write
|
|
|
|
*/
|
2018-08-20 19:52:37 +03:00
|
|
|
ssize_t n = start_resid;
|
2008-11-20 23:01:55 +03:00
|
|
|
if (n == 0)
|
|
|
|
return (0);
|
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
rlim64_t limit = uio->uio_limit;
|
2008-11-20 23:01:55 +03:00
|
|
|
if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
|
|
|
|
limit = MAXOFFSET_T;
|
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
sa_bulk_attr_t bulk[4];
|
|
|
|
int count = 0;
|
|
|
|
uint64_t mtime[2], ctime[2];
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
|
|
|
|
&zp->z_size, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&zp->z_pflags, 8);
|
|
|
|
|
2016-01-11 01:31:24 +03:00
|
|
|
/*
|
|
|
|
* Callers might not be able to detect properly that we are read-only,
|
|
|
|
* so check it explicitly here.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfs_is_readonly(zfsvfs)) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
2016-01-11 01:31:24 +03:00
|
|
|
return (SET_ERROR(EROFS));
|
|
|
|
}
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
/*
|
|
|
|
* If immutable or not appending then return EPERM
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
|
2019-11-21 20:32:57 +03:00
|
|
|
((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
|
2010-05-29 00:45:14 +04:00
|
|
|
(uio->uio_loffset < zp->z_size))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EPERM));
|
2008-12-03 23:09:06 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Validate file offset
|
|
|
|
*/
|
2019-11-21 20:32:57 +03:00
|
|
|
offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (woff < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
int max_blksz = zfsvfs->z_max_blksz;
|
|
|
|
xuio_t *xuio = NULL;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Pre-fault the pages to ensure slow (eg NFS) pages
|
|
|
|
* don't hold up txg.
|
2010-05-29 00:45:14 +04:00
|
|
|
* Skip this if uio contains loaned arc_buf.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2013-09-02 08:22:30 +04:00
|
|
|
#ifdef HAVE_UIO_ZEROCOPY
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((uio->uio_extflg == UIO_XUIO) &&
|
|
|
|
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
|
|
|
|
xuio = (xuio_t *)uio;
|
|
|
|
else
|
2013-09-02 08:22:30 +04:00
|
|
|
#endif
|
deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
`mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
thread #2, so it stuck and can't complete, then txg "n" will
not complete.
So thread #1 and thread #2 are deadlocked.
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939
2018-10-16 21:11:24 +03:00
|
|
|
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
|
|
|
return (SET_ERROR(EFAULT));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If in append mode, set the io offset pointer to eof.
|
|
|
|
*/
|
2019-11-01 20:37:33 +03:00
|
|
|
zfs_locked_range_t *lr;
|
2019-11-21 20:32:57 +03:00
|
|
|
if (ioflag & O_APPEND) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Obtain an appending range lock to guarantee file append
|
|
|
|
* semantics. We reset the write offset once we have the lock.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2019-10-04 01:54:29 +03:00
|
|
|
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
|
2018-10-02 01:13:12 +03:00
|
|
|
woff = lr->lr_offset;
|
|
|
|
if (lr->lr_length == UINT64_MAX) {
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* We overlocked the file because this write will cause
|
|
|
|
* the file block size to increase.
|
|
|
|
* Note that zp_size cannot change with this lock held.
|
|
|
|
*/
|
|
|
|
woff = zp->z_size;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
uio->uio_loffset = woff;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
/*
|
2010-05-29 00:45:14 +04:00
|
|
|
* Note that if the file block size will change as a result of
|
|
|
|
* this write, then this range lock will lock the entire file
|
|
|
|
* so that we can re-write the block safely.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2019-10-04 01:54:29 +03:00
|
|
|
lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (woff >= limit) {
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EFBIG));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((woff + n) > limit || woff > (limit - n))
|
|
|
|
n = limit - woff;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/* Will this write extend the file length? */
|
2018-08-20 19:52:37 +03:00
|
|
|
int write_eof = (woff + n > zp->z_size);
|
|
|
|
|
|
|
|
uint64_t end_size = MAX(zp->z_size, woff + n);
|
|
|
|
zilog_t *zilog = zfsvfs->z_log;
|
|
|
|
#ifdef HAVE_UIO_ZEROCOPY
|
2019-12-05 23:37:00 +03:00
|
|
|
int i_iov = 0;
|
|
|
|
const iovec_t *iovp = uio->uio_iov;
|
|
|
|
int iovcnt __maybe_unused = uio->uio_iovcnt;
|
2018-08-20 19:52:37 +03:00
|
|
|
#endif
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the file in reasonable size chunks. Each chunk is written
|
|
|
|
* in a separate transaction; this keeps the intent log records small
|
|
|
|
* and allows us to do more fine-grained space accounting.
|
|
|
|
*/
|
|
|
|
while (n > 0) {
|
2009-07-03 02:44:48 +04:00
|
|
|
woff = uio->uio_loffset;
|
2018-08-20 19:52:37 +03:00
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
|
|
|
|
KUID_TO_SUID(ip->i_uid)) ||
|
|
|
|
zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
|
|
|
|
KGID_TO_SGID(ip->i_gid)) ||
|
|
|
|
(zp->z_projid != ZFS_DEFAULT_PROJID &&
|
|
|
|
zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
|
|
|
|
zp->z_projid))) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EDQUOT);
|
2009-07-03 02:44:48 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
arc_buf_t *abuf = NULL;
|
|
|
|
const iovec_t *aiov = NULL;
|
|
|
|
if (xuio) {
|
2016-10-20 21:24:01 +03:00
|
|
|
#ifdef HAVE_UIO_ZEROCOPY
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(i_iov < iovcnt);
|
2015-07-30 17:24:36 +03:00
|
|
|
ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
|
2010-05-29 00:45:14 +04:00
|
|
|
aiov = &iovp[i_iov];
|
|
|
|
abuf = dmu_xuio_arcbuf(xuio, i_iov);
|
|
|
|
dmu_xuio_clear(xuio, i_iov);
|
|
|
|
ASSERT((aiov->iov_base == abuf->b_data) ||
|
|
|
|
((char *)aiov->iov_base - (char *)abuf->b_data +
|
|
|
|
aiov->iov_len == arc_buf_size(abuf)));
|
|
|
|
i_iov++;
|
2016-10-20 21:24:01 +03:00
|
|
|
#endif
|
2018-08-20 19:52:37 +03:00
|
|
|
} else if (n >= max_blksz && woff >= zp->z_size &&
|
2009-07-03 02:44:48 +04:00
|
|
|
P2PHASE(woff, max_blksz) == 0 &&
|
|
|
|
zp->z_blksz == max_blksz) {
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* This write covers a full block. "Borrow" a buffer
|
|
|
|
* from the dmu so that we can fill it before we enter
|
|
|
|
* a transaction. This avoids the possibility of
|
|
|
|
* holding up the transaction if the data copy hangs
|
|
|
|
* up on a pagefault (e.g., from an NFS server mapping).
|
|
|
|
*/
|
2009-07-03 02:44:48 +04:00
|
|
|
size_t cbytes;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
max_blksz);
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(abuf != NULL);
|
|
|
|
ASSERT(arc_buf_size(abuf) == max_blksz);
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = uiocopy(abuf->b_data, max_blksz,
|
|
|
|
UIO_WRITE, uio, &cbytes))) {
|
2009-07-03 02:44:48 +04:00
|
|
|
dmu_return_arcbuf(abuf);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ASSERT(cbytes == max_blksz);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Start a transaction.
|
|
|
|
*/
|
2018-08-20 19:52:37 +03:00
|
|
|
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
2019-07-30 19:18:30 +03:00
|
|
|
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
|
|
|
|
DB_DNODE_ENTER(db);
|
|
|
|
dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
|
|
|
|
MIN(n, max_blksz));
|
|
|
|
DB_DNODE_EXIT(db);
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
2013-11-23 03:13:18 +04:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
2009-07-03 02:44:48 +04:00
|
|
|
if (abuf != NULL)
|
|
|
|
dmu_return_arcbuf(abuf);
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2018-10-02 01:13:12 +03:00
|
|
|
* If rangelock_enter() over-locked we grow the blocksize
|
2008-11-20 23:01:55 +03:00
|
|
|
* and then reduce the lock range. This will only happen
|
2018-10-02 01:13:12 +03:00
|
|
|
* on the first iteration since rangelock_reduce() will
|
|
|
|
* shrink down lr_length to the appropriate size.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2018-10-02 01:13:12 +03:00
|
|
|
if (lr->lr_length == UINT64_MAX) {
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t new_blksz;
|
|
|
|
|
|
|
|
if (zp->z_blksz > max_blksz) {
|
2014-11-03 23:15:08 +03:00
|
|
|
/*
|
|
|
|
* File's blocksize is already larger than the
|
|
|
|
* "recordsize" property. Only let it grow to
|
|
|
|
* the next power of 2.
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
ASSERT(!ISP2(zp->z_blksz));
|
2014-11-03 23:15:08 +03:00
|
|
|
new_blksz = MIN(end_size,
|
|
|
|
1 << highbit64(zp->z_blksz));
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
new_blksz = MIN(end_size, max_blksz);
|
|
|
|
}
|
|
|
|
zfs_grow_blocksize(zp, new_blksz, tx);
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_reduce(lr, woff, n);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX - should we really limit each write to z_max_blksz?
|
|
|
|
* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
|
|
|
|
*/
|
2018-08-20 19:52:37 +03:00
|
|
|
ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
ssize_t tx_bytes;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (abuf == NULL) {
|
|
|
|
tx_bytes = uio->uio_resid;
|
deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
`mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
thread #2, so it stuck and can't complete, then txg "n" will
not complete.
So thread #1 and thread #2 are deadlocked.
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939
2018-10-16 21:11:24 +03:00
|
|
|
uio->uio_fault_disable = B_TRUE;
|
2010-05-29 00:45:14 +04:00
|
|
|
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
uio, nbytes, tx);
|
2019-05-08 20:04:04 +03:00
|
|
|
uio->uio_fault_disable = B_FALSE;
|
deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
`mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
thread #2, so it stuck and can't complete, then txg "n" will
not complete.
So thread #1 and thread #2 are deadlocked.
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939
2018-10-16 21:11:24 +03:00
|
|
|
if (error == EFAULT) {
|
|
|
|
dmu_tx_commit(tx);
|
2020-04-01 19:48:54 +03:00
|
|
|
/*
|
|
|
|
* Account for partial writes before
|
|
|
|
* continuing the loop.
|
|
|
|
* Update needs to occur before the next
|
|
|
|
* uio_prefaultpages, or prefaultpages may
|
|
|
|
* error, and we may break the loop early.
|
|
|
|
*/
|
|
|
|
if (tx_bytes != uio->uio_resid)
|
|
|
|
n -= tx_bytes - uio->uio_resid;
|
deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
`mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
thread #2, so it stuck and can't complete, then txg "n" will
not complete.
So thread #1 and thread #2 are deadlocked.
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939
2018-10-16 21:11:24 +03:00
|
|
|
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
} else if (error != 0) {
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
break;
|
|
|
|
}
|
2009-07-03 02:44:48 +04:00
|
|
|
tx_bytes -= uio->uio_resid;
|
|
|
|
} else {
|
|
|
|
tx_bytes = nbytes;
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
|
|
|
|
/*
|
|
|
|
* If this is not a full block write, but we are
|
|
|
|
* extending the file past EOF and this data starts
|
|
|
|
* block-aligned, use assign_arcbuf(). Otherwise,
|
|
|
|
* write via dmu_write().
|
|
|
|
*/
|
|
|
|
if (tx_bytes < max_blksz && (!write_eof ||
|
|
|
|
aiov->iov_base != abuf->b_data)) {
|
|
|
|
ASSERT(xuio);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_write(zfsvfs->z_os, zp->z_id, woff,
|
2017-11-19 01:08:00 +03:00
|
|
|
/* cppcheck-suppress nullPointer */
|
2010-05-29 00:45:14 +04:00
|
|
|
aiov->iov_len, aiov->iov_base, tx);
|
|
|
|
dmu_return_arcbuf(abuf);
|
|
|
|
xuio_stat_wbuf_copied();
|
|
|
|
} else {
|
|
|
|
ASSERT(xuio || tx_bytes == max_blksz);
|
2019-01-18 02:47:08 +03:00
|
|
|
error = dmu_assign_arcbuf_by_dbuf(
|
2017-09-28 18:49:13 +03:00
|
|
|
sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
|
2019-01-18 02:47:08 +03:00
|
|
|
if (error != 0) {
|
|
|
|
dmu_return_arcbuf(abuf);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
break;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2009-07-03 02:44:48 +04:00
|
|
|
ASSERT(tx_bytes <= uio->uio_resid);
|
|
|
|
uioskip(uio, tx_bytes);
|
|
|
|
}
|
2017-03-08 03:21:37 +03:00
|
|
|
if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
|
|
|
|
update_pages(ip, woff,
|
|
|
|
tx_bytes, zfsvfs->z_os, zp->z_id);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we made no progress, we're done. If we made even
|
|
|
|
* partial progress, update the znode and ZIL accordingly.
|
|
|
|
*/
|
|
|
|
if (tx_bytes == 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
(void *)&zp->z_size, sizeof (uint64_t), tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
ASSERT(error != 0);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear Set-UID/Set-GID bits on successful write if not
|
2017-01-03 20:31:18 +03:00
|
|
|
* privileged and at least one of the execute bits is set.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2019-09-03 03:56:41 +03:00
|
|
|
* It would be nice to do this after all writes have
|
2008-11-20 23:01:55 +03:00
|
|
|
* been done, but that would still expose the ISUID/ISGID
|
|
|
|
* to another app after the partial write is committed.
|
|
|
|
*
|
2010-08-27 01:24:34 +04:00
|
|
|
* Note: we don't call zfs_fuid_map_id() here because
|
|
|
|
* user 0 is not an ephemeral uid.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
mutex_enter(&zp->z_acl_lock);
|
2018-08-20 19:52:37 +03:00
|
|
|
uint32_t uid = KUID_TO_SUID(ip->i_uid);
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
|
2008-11-20 23:01:55 +03:00
|
|
|
(S_IXUSR >> 6))) != 0 &&
|
2010-05-29 00:45:14 +04:00
|
|
|
(zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
|
2008-11-20 23:01:55 +03:00
|
|
|
secpolicy_vnode_setid_retain(cr,
|
2016-05-22 14:15:57 +03:00
|
|
|
((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t newmode;
|
|
|
|
zp->z_mode &= ~(S_ISUID | S_ISGID);
|
2016-09-28 00:08:52 +03:00
|
|
|
ip->i_mode = newmode = zp->z_mode;
|
2017-03-08 03:21:37 +03:00
|
|
|
(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
(void *)&newmode, sizeof (uint64_t), tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
mutex_exit(&zp->z_acl_lock);
|
|
|
|
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the file size (zp_size) if it has changed;
|
|
|
|
* account for possible concurrent updates.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
while ((end_size = zp->z_size) < uio->uio_loffset) {
|
|
|
|
(void) atomic_cas_64(&zp->z_size, end_size,
|
2008-11-20 23:01:55 +03:00
|
|
|
uio->uio_loffset);
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(error == 0);
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* If we are replaying and eof is non zero then force
|
|
|
|
* the file size to the specified eof. Note, there's no
|
|
|
|
* concurrency during replay.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
|
|
|
|
zp->z_size = zfsvfs->z_replay_eof;
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
|
|
|
|
|
Only commit the ZIL once in zpl_writepages() (msync() case).
Currently, using msync() results in the following code path:
sys_msync -> zpl_fsync -> filemap_write_and_wait_range -> zpl_writepages -> write_cache_pages -> zpl_putpage
In such a code path, zil_commit() is called as part of zpl_putpage().
This means that for each page, the write is handed to the DMU, the ZIL
is committed, and only then do we move on to the next page. As one might
imagine, this results in atrocious performance where there is a large
number of pages to write: instead of committing a batch of N writes,
we do N commits containing one page each. In some extreme cases this
can result in msync() being ~700 times slower than it should be, as well
as very inefficient use of ZIL resources.
This patch fixes this issue by making sure that the requested writes
are batched and then committed only once. Unfortunately, the
implementation is somewhat non-trivial because there is no way to run
write_cache_pages in SYNC mode (so that we get all pages) without
making it wait on the writeback tag for each page.
The solution implemented here is composed of two parts:
- I added a new callback system to the ZIL, which allows the caller to
be notified when its ITX gets written to stable storage. One nice
thing is that the callback is called not only in zil_commit() but
in zil_sync() as well, which means that the caller doesn't have to
care whether the write ended up in the ZIL or the DMU: it will get
notified as soon as it's safe, period. This is an improvement over
dmu_tx_callback_register() that was used previously, which only
supports DMU writes. The rationale for this change is to allow
zpl_putpage() to be notified when a ZIL commit is completed without
having to block on zil_commit() itself.
- zpl_writepages() now calls write_cache_pages in non-SYNC mode, which
will prevent (1) write_cache_pages from blocking, and (2) zpl_putpage
from issuing ZIL commits. zpl_writepages() will issue the commit
itself instead of relying on zpl_putpage() to do it, thus nicely
batching the writes. Note, however, that we still have to call
write_cache_pages() again in SYNC mode because there is an edge case
documented in the implementation of write_cache_pages() whereas it
will not give us all dirty pages when running in non-SYNC mode. Thus
we need to run it at least once in SYNC mode to make sure we honor
persistency guarantees. This only happens when the pages are
modified at the same time msync() is running, which should be rare.
In most cases there won't be any additional pages and this second
call will do nothing.
Note that this change also fixes a bug related to #907 whereas calling
msync() on pages that were already handed over to the DMU in a previous
writepages() call would make msync() block until the next TXG sync
instead of returning as soon as the ZIL commit is complete. The new
callback system fixes that problem.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1849
Closes #907
2013-11-10 19:00:11 +04:00
|
|
|
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
|
|
|
|
NULL, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
|
|
|
if (error != 0)
|
|
|
|
break;
|
|
|
|
ASSERT(tx_bytes == nbytes);
|
|
|
|
n -= nbytes;
|
2010-08-27 01:24:34 +04:00
|
|
|
|
deadlock between mm_sem and tx assign in zfs_write() and page fault
The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
`mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
thread #2, so it stuck and can't complete, then txg "n" will
not complete.
So thread #1 and thread #2 are deadlocked.
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939
2018-10-16 21:11:24 +03:00
|
|
|
if (!xuio && n > 0) {
|
|
|
|
if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
|
|
|
|
error = EFAULT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2015-07-16 23:35:04 +03:00
|
|
|
zfs_inode_update(zp);
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're in replay mode, or we made no progress, return error.
|
|
|
|
* Otherwise, it's at least a partial write, so it's successful.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2019-11-21 20:32:57 +03:00
|
|
|
if (ioflag & (O_SYNC | O_DSYNC) ||
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, zp->z_id);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-08-20 19:52:37 +03:00
|
|
|
int64_t nwritten = start_resid - uio->uio_resid;
|
|
|
|
dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
|
|
|
|
task_io_account_write(nwritten);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2019-12-13 18:54:10 +03:00
|
|
|
/*
|
|
|
|
* Write the bytes to a file.
|
|
|
|
*
|
|
|
|
* IN: zp - znode of file to be written to
|
|
|
|
* data - bytes to write
|
|
|
|
* len - number of bytes to write
|
|
|
|
* pos - offset to start writing at
|
|
|
|
*
|
|
|
|
* OUT: resid - remaining bytes to write
|
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* positive error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
|
|
|
* zp - ctime|mtime updated if byte count > 0
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
zfs_write_simple(znode_t *zp, const void *data, size_t len,
|
|
|
|
loff_t pos, size_t *resid)
|
|
|
|
{
|
|
|
|
ssize_t written;
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
written = zpl_write_common(ZTOI(zp), data, len, &pos,
|
|
|
|
UIO_SYSSPACE, 0, kcred);
|
|
|
|
if (written < 0) {
|
|
|
|
error = -written;
|
|
|
|
} else if (resid == NULL) {
|
|
|
|
if (written < len)
|
|
|
|
error = SET_ERROR(EIO); /* short write */
|
|
|
|
} else {
|
|
|
|
*resid = len - written;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2016-10-18 21:32:59 +03:00
|
|
|
/*
|
|
|
|
* Drop a reference on the passed inode asynchronously. This ensures
|
|
|
|
* that the caller will never drop the last reference on an inode in
|
|
|
|
* the current context. Doing so while holding open a tx could result
|
|
|
|
* in a deadlock if iput_final() re-enters the filesystem code.
|
|
|
|
*/
|
2014-08-05 00:30:20 +04:00
|
|
|
void
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_zrele_async(znode_t *zp)
|
2011-02-08 22:16:06 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
struct inode *ip = ZTOI(zp);
|
2014-08-05 00:30:20 +04:00
|
|
|
objset_t *os = ITOZSB(ip)->z_os;
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
ASSERT(atomic_read(&ip->i_count) > 0);
|
2014-08-05 00:30:20 +04:00
|
|
|
ASSERT(os != NULL);
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (atomic_read(&ip->i_count) == 1)
|
2019-12-11 22:53:57 +03:00
|
|
|
VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
|
2016-10-29 01:40:14 +03:00
|
|
|
(task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
|
2011-02-08 22:16:06 +03:00
|
|
|
else
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2011-02-08 22:16:06 +03:00
|
|
|
}
|
|
|
|
|
OpenZFS 9962 - zil_commit should omit cache thrash
As a result of the changes made in 8585, it's possible for an excessive
amount of vdev flush commands to be issued under some workloads.
Specifically, when the workload consists of mostly async write activity,
interspersed with some sync write and/or fsync activity, we can end up
issuing more flush commands to the underlying storage than is actually
necessary. As a result of these flush commands, the write latency and
overall throughput of the pool can be poorly impacted (latency
increases, throughput decreases).
Currently, any time an lwb completes, the vdev(s) written to as a result
of that lwb will be issued a flush command. The intention is so the data
written to that vdev is on stable storage, prior to communicating to any
waiting threads that their data is safe on disk.
The problem with this scheme, is that sometimes an lwb will not have any
threads waiting for it to complete. This can occur when there's async
activity that gets "converted" to sync requests, as a result of calling
the zil_async_to_sync() function via zil_commit_impl(). When this
occurs, the current code may issue many lwbs that don't have waiters
associated with them, resulting in many flush commands, potentially to
the same vdev(s).
For example, given a pool with a single vdev, and a single fsync() call
that results in 10 lwbs being written out (e.g. due to other async
writes), that will result in 10 flush commands to that single vdev (a
flush issued after each lwb write completes). Ideally, we'd only issue a
single flush command to that vdev, after all 10 lwb writes completed.
Further, and most important as it pertains to this change, since the
flush commands are often very impactful to the performance of the pool's
underlying storage, unnecessarily issuing these flush commands can
poorly impact the performance of the lwb writes themselves. Thus, we
need to avoid issuing flush commands when possible, in order to achieve
the best possible performance out of the pool's underlying storage.
This change attempts to address this problem by changing the ZIL's logic
to only issue a vdev flush command when it detects an lwb that has a
thread waiting for it to complete. When an lwb does not have threads
waiting for it, the responsibility of issuing the flush command to the
vdevs involved with that lwb's write is passed on to the "next" lwb.
It's only once a write for an lwb with waiters completes, do we issue
the vdev flush command(s). As a result, now when we issue the flush(s),
we will issue them to the vdevs involved with that specific lwb's write,
but potentially also to vdevs involved with "previous" lwb writes (i.e.
if the previous lwbs did not have waiters associated with them).
Thus, in our prior example with 10 lwbs, it's only once the last lwb
completes (which will be the lwb containing the waiter for the thread
that called fsync) will we issue the vdev flush command; all of the
other lwbs will find they have no waiters, so they'll pass the
responsibility of the flush to the "next" lwb (until reaching the last
lwb that has the waiter).
Porting Notes:
* Reconciled conflicts with the fastwrite feature.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
Ported-by: Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/9962
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/545190c6
Closes #8188
2018-10-24 00:14:27 +03:00
|
|
|
/* ARGSUSED */
|
2008-11-20 23:01:55 +03:00
|
|
|
void
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_get_done(zgd_t *zgd, int error)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2010-05-29 00:45:14 +04:00
|
|
|
znode_t *zp = zgd->zgd_private;
|
|
|
|
|
|
|
|
if (zgd->zgd_db)
|
|
|
|
dmu_buf_rele(zgd->zgd_db, zgd);
|
|
|
|
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(zgd->zgd_lr);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* Release the vnode asynchronously as we currently have the
|
|
|
|
* txg stopped from syncing.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_zrele_async(zp);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
kmem_free(zgd, sizeof (zgd_t));
|
|
|
|
}
|
|
|
|
|
2009-08-18 22:43:27 +04:00
|
|
|
/*
 * Debug-build-only flag, initially clear.  Its users are not in this
 * part of the file; presumably a fault-injection switch for the ZIL
 * write path -- TODO confirm against the code that reads it.
 */
#ifdef DEBUG
|
|
|
|
static int zil_fault_io = 0;
|
|
|
|
#endif
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Get data to generate a TX_WRITE intent log record.
|
|
|
|
*/
|
|
|
|
int
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently outstanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularities.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new algorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
outstanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = arg;
|
|
|
|
objset_t *os = zfsvfs->z_os;
|
2008-11-20 23:01:55 +03:00
|
|
|
znode_t *zp;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t object = lr->lr_foid;
|
|
|
|
uint64_t offset = lr->lr_offset;
|
|
|
|
uint64_t size = lr->lr_length;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_buf_t *db;
|
|
|
|
zgd_t *zgd;
|
|
|
|
int error = 0;
|
|
|
|
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently outstanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularities.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new algorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
outstanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
ASSERT3P(lwb, !=, NULL);
|
|
|
|
ASSERT3P(zio, !=, NULL);
|
|
|
|
ASSERT3U(size, !=, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Nothing to do if the file has been removed
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfs_zget(zfsvfs, object, &zp) != 0)
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zp->z_unlinked) {
|
2009-07-03 02:44:48 +04:00
|
|
|
/*
|
|
|
|
* Release the vnode asynchronously as we currently have the
|
|
|
|
* txg stopped from syncing.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_zrele_async(zp);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOENT));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2019-07-30 19:18:30 +03:00
|
|
|
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
|
OpenZFS 8585 - improve batching done in zil_commit()
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Dan McDonald <danmcd@joyent.com>
Ported-by: Prakash Surya <prakash.surya@delphix.com>
Problem
=======
The current implementation of zil_commit() can introduce significant
latency, beyond what is inherent due to the latency of the underlying
storage. The additional latency comes from two main problems:
1. When there's outstanding ZIL blocks being written (i.e. there's
already a "writer thread" in progress), then any new calls to
zil_commit() will block waiting for the currently outstanding ZIL
blocks to complete. The blocks written for each "writer thread" is
coined a "batch", and there can only ever be a single "batch" being
written at a time. When a batch is being written, any new ZIL
transactions will have to wait for the next batch to be written,
which won't occur until the current batch finishes.
As a result, the underlying storage may not be used as efficiently
as possible. While "new" threads enter zil_commit() and are blocked
waiting for the next batch, it's possible that the underlying
storage isn't fully utilized by the current batch of ZIL blocks. In
that case, it'd be better to allow these new threads to generate
(and issue) a new ZIL block, such that it could be serviced by the
underlying storage concurrently with the other ZIL blocks that are
being serviced.
2. Any call to zil_commit() must wait for all ZIL blocks in its "batch"
to complete, prior to zil_commit() returning. The size of any given
batch is proportional to the number of ZIL transaction in the queue
at the time that the batch starts processing the queue; which
doesn't occur until the previous batch completes. Thus, if there's a
lot of transactions in the queue, the batch could be composed of
many ZIL blocks, and each call to zil_commit() will have to wait for
all of these writes to complete (even if the thread calling
zil_commit() only cared about one of the transactions in the batch).
To further complicate the situation, these two issues result in the
following side effect:
3. If a given batch takes longer to complete than normal, this results
in larger batch sizes, which then take longer to complete and
further drive up the latency of zil_commit(). This can occur for a
number of reasons, including (but not limited to): transient changes
in the workload, and storage latency irregularities.
Solution
========
The solution attempted by this change has the following goals:
1. no on-disk changes; maintain current on-disk format.
2. modify the "batch size" to be equal to the "ZIL block size".
3. allow new batches to be generated and issued to disk, while there's
already batches being serviced by the disk.
4. allow zil_commit() to wait for as few ZIL blocks as possible.
5. use as few ZIL blocks as possible, for the same amount of ZIL
transactions, without introducing significant latency to any
individual ZIL transaction. i.e. use fewer, but larger, ZIL blocks.
In theory, with these goals met, the new algorithm will allow the
following improvements:
1. new ZIL blocks can be generated and issued, while there's already
outstanding ZIL blocks being serviced by the storage.
2. the latency of zil_commit() should be proportional to the underlying
storage latency, rather than the incoming synchronous workload.
Porting Notes
=============
Due to the changes made in commit 119a394ab0, the lifetime of an itx
structure differs than in OpenZFS. Specifically, the itx structure is
kept around until the data associated with the itx is considered to be
safe on disk; this is so that the itx's callback can be called after the
data is committed to stable storage. Since OpenZFS doesn't have this itx
callback mechanism, it's able to destroy the itx structure immediately
after the itx is committed to an lwb (before the lwb is written to
disk).
To support this difference, and to ensure the itx's callbacks can still
be called after the itx's data is on disk, a few changes had to be made:
* A list of itxs was added to the lwb structure. This list contains
all of the itxs that have been committed to the lwb, such that the
callbacks for these itxs can be called from zil_lwb_flush_vdevs_done(),
after the data for the itxs is committed to disk.
* A list of itxs was added on the stack of the zil_process_commit_list()
function; the "nolwb_itxs" list. In some circumstances, an itx may
not be committed to an lwb (e.g. if allocating the "next" ZIL block
on disk fails), so this list is used to keep track of which itxs
fall into this state, such that their callbacks can be called after
the ZIL's writer pipeline is "stalled".
* The logic to actually call the itx's callback was moved into the
zil_itx_destroy() function. Since all consumers of zil_itx_destroy()
were effectively performing the same logic (i.e. if callback is
non-null, call the callback), it seemed like useful code cleanup to
consolidate this logic into a single function.
Additionally, the existing Linux tracepoint infrastructure dealing with
the ZIL's probes and structures had to be updated to reflect these code
changes. Specifically:
* The "zil__cw1" and "zil__cw2" probes were removed, so they had to be
removed from "trace_zil.h" as well.
* Some of the zilog structure's fields were removed, which affected
the tracepoint definitions of the structure.
* New tracepoints had to be added for the following 3 new probes:
* zil__process__commit__itx
* zil__process__normal__itx
* zil__commit__io__error
OpenZFS-issue: https://www.illumos.org/issues/8585
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/5d95a3a
Closes #6566
2017-12-05 20:39:16 +03:00
|
|
|
zgd->zgd_lwb = lwb;
|
2010-05-29 00:45:14 +04:00
|
|
|
zgd->zgd_private = zp;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Write records come in two flavors: immediate and indirect.
|
|
|
|
* For small writes it's cheaper to store the data with the
|
|
|
|
* log record (immediate); for large writes it's cheaper to
|
|
|
|
* sync the data and get a pointer to it (indirect) so that
|
|
|
|
* we don't have to write the data twice.
|
|
|
|
*/
|
|
|
|
if (buf != NULL) { /* immediate write */
|
2019-10-04 01:54:29 +03:00
|
|
|
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
2018-10-02 01:13:12 +03:00
|
|
|
offset, size, RL_READER);
|
2008-11-20 23:01:55 +03:00
|
|
|
/* test for truncation needs to be done while range locked */
|
2010-05-29 00:45:14 +04:00
|
|
|
if (offset >= zp->z_size) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(ENOENT);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
|
|
|
error = dmu_read(os, object, offset, size, buf,
|
|
|
|
DMU_READ_NO_PREFETCH);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
ASSERT(error == 0 || error == ENOENT);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else { /* indirect write */
|
|
|
|
/*
|
|
|
|
* Have to lock the whole block to ensure when it's
|
2017-08-21 18:59:48 +03:00
|
|
|
* written out and its checksum is being calculated
|
2008-11-20 23:01:55 +03:00
|
|
|
* that no one can change the data. We need to re-check
|
|
|
|
* blocksize after we get the lock in case it's changed!
|
|
|
|
*/
|
|
|
|
for (;;) {
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t blkoff;
|
|
|
|
size = zp->z_blksz;
|
|
|
|
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
|
|
|
|
offset -= blkoff;
|
2019-10-04 01:54:29 +03:00
|
|
|
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
|
2018-10-02 01:13:12 +03:00
|
|
|
offset, size, RL_READER);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->z_blksz == size)
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
2010-05-29 00:45:14 +04:00
|
|
|
offset += blkoff;
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(zgd->zgd_lr);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
/* test for truncation needs to be done while range locked */
|
2010-05-29 00:45:14 +04:00
|
|
|
if (lr->lr_offset >= zp->z_size)
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(ENOENT);
|
2009-08-18 22:43:27 +04:00
|
|
|
#ifdef DEBUG
|
|
|
|
if (zil_fault_io) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EIO);
|
2009-08-18 22:43:27 +04:00
|
|
|
zil_fault_io = 0;
|
|
|
|
}
|
|
|
|
#endif
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error == 0)
|
2010-05-29 00:45:14 +04:00
|
|
|
error = dmu_buf_hold(os, object, offset, zgd, &db,
|
|
|
|
DMU_READ_NO_PREFETCH);
|
|
|
|
|
|
|
|
if (error == 0) {
|
2017-04-14 22:59:18 +03:00
|
|
|
blkptr_t *bp = &lr->lr_blkptr;
|
2013-05-10 23:47:54 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zgd->zgd_db = db;
|
|
|
|
zgd->zgd_bp = bp;
|
|
|
|
|
|
|
|
ASSERT(db->db_offset == offset);
|
|
|
|
ASSERT(db->db_size == size);
|
|
|
|
|
|
|
|
error = dmu_sync(zio, lr->lr_common.lrc_txg,
|
|
|
|
zfs_get_done, zgd);
|
2017-01-18 02:18:59 +03:00
|
|
|
ASSERT(error || lr->lr_length <= size);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* On success, we need to wait for the write I/O
|
|
|
|
* initiated by dmu_sync() to complete before we can
|
|
|
|
* release this dbuf. We will finish everything up
|
|
|
|
* in the zfs_get_done() callback.
|
|
|
|
*/
|
|
|
|
if (error == 0)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (error == EALREADY) {
|
|
|
|
lr->lr_common.lrc_txtype = TX_WRITE2;
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
/*
|
|
|
|
* TX_WRITE2 relies on the data previously
|
|
|
|
* written by the TX_WRITE that caused
|
|
|
|
* EALREADY. We zero out the BP because
|
OpenZFS 9962 - zil_commit should omit cache thrash
As a result of the changes made in 8585, it's possible for an excessive
amount of vdev flush commands to be issued under some workloads.
Specifically, when the workload consists of mostly async write activity,
interspersed with some sync write and/or fsync activity, we can end up
issuing more flush commands to the underlying storage than is actually
necessary. As a result of these flush commands, the write latency and
overall throughput of the pool can be poorly impacted (latency
increases, throughput decreases).
Currently, any time an lwb completes, the vdev(s) written to as a result
of that lwb will be issued a flush command. The intention is so the data
written to that vdev is on stable storage, prior to communicating to any
waiting threads that their data is safe on disk.
The problem with this scheme, is that sometimes an lwb will not have any
threads waiting for it to complete. This can occur when there's async
activity that gets "converted" to sync requests, as a result of calling
the zil_async_to_sync() function via zil_commit_impl(). When this
occurs, the current code may issue many lwbs that don't have waiters
associated with them, resulting in many flush commands, potentially to
the same vdev(s).
For example, given a pool with a single vdev, and a single fsync() call
that results in 10 lwbs being written out (e.g. due to other async
writes), that will result in 10 flush commands to that single vdev (a
flush issued after each lwb write completes). Ideally, we'd only issue a
single flush command to that vdev, after all 10 lwb writes completed.
Further, and most important as it pertains to this change, since the
flush commands are often very impactful to the performance of the pool's
underlying storage, unnecessarily issuing these flush commands can
poorly impact the performance of the lwb writes themselves. Thus, we
need to avoid issuing flush commands when possible, in order to achieve
the best possible performance out of the pool's underlying storage.
This change attempts to address this problem by changing the ZIL's logic
to only issue a vdev flush command when it detects an lwb that has a
thread waiting for it to complete. When an lwb does not have threads
waiting for it, the responsibility of issuing the flush command to the
vdevs involved with that lwb's write is passed on to the "next" lwb.
It's only once a write for an lwb with waiters completes, do we issue
the vdev flush command(s). As a result, now when we issue the flush(s),
we will issue them to the vdevs involved with that specific lwb's write,
but potentially also to vdevs involved with "previous" lwb writes (i.e.
if the previous lwbs did not have waiters associated with them).
Thus, in our prior example with 10 lwbs, it's only once the last lwb
completes (which will be the lwb containing the waiter for the thread
that called fsync) will we issue the vdev flush command; all of the
other lwbs will find they have no waiters, so they'll pass the
responsibility of the flush to the "next" lwb (until reaching the last
lwb that has the waiter).
Porting Notes:
* Reconciled conflicts with the fastwrite feature.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
Ported-by: Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
OpenZFS-issue: https://www.illumos.org/issues/9962
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/545190c6
Closes #8188
2018-10-24 00:14:27 +03:00
|
|
|
* it is the old, currently-on-disk BP.
|
OpenZFS 7614, 9064 - zfs device evacuation/removal
OpenZFS 7614 - zfs device evacuation/removal
OpenZFS 9064 - remove_mirror should wait for device removal to complete
This project allows top-level vdevs to be removed from the storage pool
with "zpool remove", reducing the total amount of storage in the pool.
This operation copies all allocated regions of the device to be removed
onto other devices, recording the mapping from old to new location.
After the removal is complete, read and free operations to the removed
(now "indirect") vdev must be remapped and performed at the new location
on disk. The indirect mapping table is kept in memory whenever the pool
is loaded, so there is minimal performance overhead when doing operations
on the indirect vdev.
The size of the in-memory mapping table will be reduced when its entries
become "obsolete" because they are no longer used by any block pointers
in the pool. An entry becomes obsolete when all the blocks that use
it are freed. An entry can also become obsolete when all the snapshots
that reference it are deleted, and the block pointers that reference it
have been "remapped" in all filesystems/zvols (and clones). Whenever an
indirect block is written, all the block pointers in it will be "remapped"
to their new (concrete) locations if possible. This process can be
accelerated by using the "zfs remap" command to proactively rewrite all
indirect blocks that reference indirect (removed) vdevs.
Note that when a device is removed, we do not verify the checksum of
the data that is copied. This makes the process much faster, but if it
were used on redundant vdevs (i.e. mirror or raidz vdevs), it would be
possible to copy the wrong data, when we have the correct data on e.g.
the other side of the mirror.
At the moment, only mirrors and simple top-level vdevs can be removed
and no removal is allowed if any of the top-level vdevs are raidz.
Porting Notes:
* Avoid zero-sized kmem_alloc() in vdev_compact_children().
The device evacuation code adds a dependency that
vdev_compact_children() be able to properly empty the vdev_child
array by setting it to NULL and zeroing vdev_children. Under Linux,
kmem_alloc() and related functions return a sentinel pointer rather
than NULL for zero-sized allocations.
* Remove comment regarding "mpt" driver where zfs_remove_max_segment
is initialized to SPA_MAXBLOCKSIZE.
Change zfs_condense_indirect_commit_entry_delay_ticks to
zfs_condense_indirect_commit_entry_delay_ms for consistency with
most other tunables in which delays are specified in ms.
* ZTS changes:
Use set_tunable rather than mdb
Use zpool sync as appropriate
Use sync_pool instead of sync
Kill jobs during test_removal_with_operation to allow unmount/export
Don't add non-disk names such as "mirror" or "raidz" to $DISKS
Use $TEST_BASE_DIR instead of /tmp
Increase HZ from 100 to 1000 which is more common on Linux
removal_multiple_indirection.ksh
Reduce iterations in order to not time out on the code
coverage builders.
removal_resume_export:
Functionally, the test case is correct but there exists a race
where the kernel thread hasn't been fully started yet and is
not visible. Wait for up to 1 second for the removal thread
to be started before giving up on it. Also, increase the
amount of data copied in order that the removal not finish
before the export has a chance to fail.
* MMP compatibility, the concept of concrete versus non-concrete devices
has slightly changed the semantics of vdev_writeable(). Update
mmp_random_leaf_impl() accordingly.
* Updated dbuf_remap() to handle the org.zfsonlinux:large_dnode pool
feature which is not supported by OpenZFS.
* Added support for new vdev removal tracepoints.
* Test cases removal_with_zdb and removal_condense_export have been
intentionally disabled. When run manually they pass as intended,
but when running in the automated test environment they produce
unreliable results on the latest Fedora release.
They may work better once the upstream pool import refactoring is
merged into ZoL at which point they will be re-enabled.
Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Alex Reece <alex@delphix.com>
Reviewed-by: George Wilson <george.wilson@delphix.com>
Reviewed-by: John Kennedy <john.kennedy@delphix.com>
Reviewed-by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Tim Chase <tim@chase2k.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Garrett D'Amore <garrett@damore.org>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>
OpenZFS-issue: https://www.illumos.org/issues/7614
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/f539f1eb
Closes #6900
2016-09-22 19:30:13 +03:00
|
|
|
*/
|
|
|
|
zgd->zgd_bp = NULL;
|
|
|
|
BP_ZERO(bp);
|
2010-05-29 00:45:14 +04:00
|
|
|
error = 0;
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
zfs_get_done(zgd, error);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2011-02-08 22:16:06 +03:00
|
|
|
int
|
|
|
|
zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
|
|
|
if (flag & V_ACE_MASK)
|
|
|
|
error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
|
|
|
|
else
|
|
|
|
error = zfs_zaccess_rwx(zp, mode, flag, cr);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2009-08-18 22:43:27 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Lookup an entry in a directory, or an extended attribute directory.
|
2011-02-08 22:16:06 +03:00
|
|
|
* If it exists, return a held inode reference for it.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: zdp - znode of directory to search.
|
2008-11-20 23:01:55 +03:00
|
|
|
* nm - name of entry to lookup.
|
|
|
|
* flags - LOOKUP_XATTR set if looking for an attribute.
|
|
|
|
* cr - credentials of caller.
|
|
|
|
* direntflags - directory lookup flags
|
|
|
|
* realpnp - returned pathname.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* OUT: zpp - znode of located entry, NULL if not found.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
|
|
|
* NA
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags,
|
2011-02-08 22:16:06 +03:00
|
|
|
cred_t *cr, int *direntflags, pathname_t *realpnp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zdp);
|
2011-02-08 22:16:06 +03:00
|
|
|
int error = 0;
|
2009-08-18 22:43:27 +04:00
|
|
|
|
2017-02-03 01:13:41 +03:00
|
|
|
/*
|
|
|
|
* Fast path lookup, however we must skip DNLC lookup
|
|
|
|
* for case folding or normalizing lookups because the
|
|
|
|
* DNLC code only stores the passed in name. This means
|
|
|
|
* creating 'a' and removing 'A' on a case insensitive
|
|
|
|
* file system would work, but DNLC still thinks 'a'
|
|
|
|
* exists and won't let you create it again on the next
|
|
|
|
* pass through fast path.
|
|
|
|
*/
|
2009-08-18 22:43:27 +04:00
|
|
|
if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOTDIR));
|
2010-05-29 00:45:14 +04:00
|
|
|
} else if (zdp->z_sa_hdl == NULL) {
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EIO));
|
2009-08-18 22:43:27 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
|
|
|
|
error = zfs_fastaccesschk_execute(zdp, cr);
|
|
|
|
if (!error) {
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = zdp;
|
|
|
|
zhold(*zpp);
|
2009-08-18 22:43:27 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zdp);
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (flags & LOOKUP_XATTR) {
|
|
|
|
/*
|
|
|
|
* We don't allow recursive attributes..
|
|
|
|
* Maybe someday we will.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zdp->z_pflags & ZFS_XATTR) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do we have permission to get into attribute directory?
|
|
|
|
*/
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
|
2010-12-17 01:05:42 +03:00
|
|
|
B_FALSE, cr))) {
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(*zpp);
|
|
|
|
*zpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENOTDIR));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check accessibility of directory.
|
|
|
|
*/
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
|
2008-11-20 23:01:55 +03:00
|
|
|
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EILSEQ));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
|
|
|
|
if ((error == 0) && (*zpp))
|
|
|
|
zfs_inode_update(*zpp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Attempt to create a new entry in a directory. If the entry
|
|
|
|
* already exists, truncate the file if permissible, else return
|
2011-02-08 22:16:06 +03:00
|
|
|
* an error. Return the ip of the created or trunc'd file.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: dzp - znode of directory to put new file entry in.
|
2008-11-20 23:01:55 +03:00
|
|
|
* name - name of new file entry.
|
|
|
|
* vap - attributes of new file.
|
|
|
|
* excl - flag indicating exclusive or non-exclusive mode.
|
|
|
|
* mode - mode to open file with.
|
|
|
|
* cr - credentials of caller.
|
2019-04-19 22:03:32 +03:00
|
|
|
* flag - file flag.
|
2011-02-08 22:16:06 +03:00
|
|
|
* vsecp - ACL to be set
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* OUT: zpp - znode of created or trunc'd entry.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* dzp - ctime|mtime updated if new entry created
|
|
|
|
* zp - ctime|mtime always, atime if new
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
|
|
|
|
int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
znode_t *zp;
|
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
|
|
|
objset_t *os;
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
2008-12-03 23:09:06 +03:00
|
|
|
uid_t uid;
|
2010-12-17 01:05:42 +03:00
|
|
|
gid_t gid;
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_t acl_ids;
|
2009-07-03 02:44:48 +04:00
|
|
|
boolean_t fuid_dirtied;
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t have_acl = B_FALSE;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have an ephemeral id, ACL, or XVATTR then
|
|
|
|
* make sure file system is at proper version
|
|
|
|
*/
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
gid = crgetgid(cr);
|
2011-02-08 22:16:06 +03:00
|
|
|
uid = crgetuid(cr);
|
2008-12-03 23:09:06 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_use_fuids == B_FALSE &&
|
2011-02-08 22:16:06 +03:00
|
|
|
(vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (name == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(dzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
os = zfsvfs->z_os;
|
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
|
2008-11-20 23:01:55 +03:00
|
|
|
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EILSEQ));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
if (vap->va_mask & ATTR_XVATTR) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if ((error = secpolicy_xvattr((xvattr_t *)vap,
|
2011-02-08 22:16:06 +03:00
|
|
|
crgetuid(cr), cr, vap->va_mode)) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
top:
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
if (*name == '\0') {
|
|
|
|
/*
|
|
|
|
* Null component name refers to the directory itself.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
zhold(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zp = dzp;
|
|
|
|
dl = NULL;
|
|
|
|
error = 0;
|
|
|
|
} else {
|
2011-02-08 22:16:06 +03:00
|
|
|
/* possible igrab(zp) */
|
2008-11-20 23:01:55 +03:00
|
|
|
int zflg = 0;
|
|
|
|
|
|
|
|
if (flag & FIGNORECASE)
|
|
|
|
zflg |= ZCILOOK;
|
|
|
|
|
|
|
|
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
|
|
|
|
NULL, NULL);
|
|
|
|
if (error) {
|
2010-08-27 01:24:34 +04:00
|
|
|
if (have_acl)
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (strcmp(name, "..") == 0)
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EISDIR);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zp == NULL) {
|
|
|
|
uint64_t txtype;
|
2018-02-14 01:54:54 +03:00
|
|
|
uint64_t projid = ZFS_DEFAULT_PROJID;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new file object and update the directory
|
|
|
|
* to reference it.
|
|
|
|
*/
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
|
2010-08-27 01:24:34 +04:00
|
|
|
if (have_acl)
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We only support the creation of regular files in
|
|
|
|
* extended attribute directories.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
|
2010-08-27 01:24:34 +04:00
|
|
|
if (have_acl)
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EINVAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
|
|
|
|
cr, vsecp, &acl_ids)) != 0)
|
2009-07-03 02:44:48 +04:00
|
|
|
goto out;
|
2010-05-29 00:45:14 +04:00
|
|
|
have_acl = B_TRUE;
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
|
|
|
|
projid = zfs_inherit_projid(dzp);
|
|
|
|
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
|
2009-08-18 22:43:27 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EDQUOT);
|
2009-07-03 02:44:48 +04:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
tx = dmu_tx_create(os);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
|
|
|
|
ZFS_SA_BASE_ATTR_SIZE);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
fuid_dirtied = zfsvfs->z_fuid_dirty;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_txhold(zfsvfs, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
|
2017-03-08 03:21:37 +03:00
|
|
|
if (!zfsvfs->z_use_sa &&
|
2010-05-29 00:45:14 +04:00
|
|
|
acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
|
2010-05-29 00:45:14 +04:00
|
|
|
0, acl_ids.z_aclp->z_acl_bytes);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused an ENOSPC error when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC on failure.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issued`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issued` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx,
|
|
|
|
(waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
goto top;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_abort(tx);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused an ENOSPC error when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC on failure.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
error = zfs_link_create(dl, zp, tx, ZNEW);
|
|
|
|
if (error != 0) {
|
|
|
|
/*
|
|
|
|
* Since we failed to add the directory entry for it,
|
|
|
|
* delete the newly created dnode.
|
|
|
|
*/
|
|
|
|
zfs_znode_delete(zp, tx);
|
|
|
|
remove_inode_hash(ZTOI(zp));
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_sync(zfsvfs, tx);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
|
|
|
|
if (flag & FIGNORECASE)
|
|
|
|
txtype |= TX_CI;
|
|
|
|
zfs_log_create(zilog, tx, txtype, dzp, zp, name,
|
2009-07-03 02:44:48 +04:00
|
|
|
vsecp, acl_ids.z_fuidp, vap);
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
} else {
|
2019-11-21 20:32:57 +03:00
|
|
|
int aflags = (flag & O_APPEND) ? V_APPEND : 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (have_acl)
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
have_acl = B_FALSE;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* A directory entry already exists for this name.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Can't truncate an existing file if in exclusive mode.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (excl) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EEXIST);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Can't open a directory for writing.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(ZTOI(zp)->i_mode)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EISDIR);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Verify requested access to file.
|
|
|
|
*/
|
|
|
|
if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&dzp->z_lock);
|
|
|
|
dzp->z_seq++;
|
|
|
|
mutex_exit(&dzp->z_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Truncate regular files if requested.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISREG(ZTOI(zp)->i_mode) &&
|
|
|
|
(vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/* we can't hold any locks when calling zfs_freesp() */
|
2016-09-21 05:09:22 +03:00
|
|
|
if (dl) {
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
dl = NULL;
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
error = zfs_freesp(zp, 0, 0, mode, TRUE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
|
|
|
|
if (dl)
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
|
|
|
|
if (error) {
|
|
|
|
if (zp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2011-01-06 01:27:30 +03:00
|
|
|
zfs_inode_update(dzp);
|
|
|
|
zfs_inode_update(zp);
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2016-01-26 23:29:46 +03:00
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
|
|
|
zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
|
|
|
|
int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
|
|
|
|
{
|
|
|
|
znode_t *zp = NULL, *dzp = ITOZ(dip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(dip);
|
2016-01-26 23:29:46 +03:00
|
|
|
objset_t *os;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
|
|
|
uid_t uid;
|
|
|
|
gid_t gid;
|
|
|
|
zfs_acl_ids_t acl_ids;
|
2018-02-14 01:54:54 +03:00
|
|
|
uint64_t projid = ZFS_DEFAULT_PROJID;
|
2016-01-26 23:29:46 +03:00
|
|
|
boolean_t fuid_dirtied;
|
|
|
|
boolean_t have_acl = B_FALSE;
|
|
|
|
boolean_t waited = B_FALSE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have an ephemeral id, ACL, or XVATTR then
|
|
|
|
* make sure file system is at proper version
|
|
|
|
*/
|
|
|
|
|
|
|
|
gid = crgetgid(cr);
|
|
|
|
uid = crgetuid(cr);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_use_fuids == B_FALSE &&
|
2016-01-26 23:29:46 +03:00
|
|
|
(vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2016-01-26 23:29:46 +03:00
|
|
|
ZFS_VERIFY_ZP(dzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
os = zfsvfs->z_os;
|
2016-01-26 23:29:46 +03:00
|
|
|
|
|
|
|
if (vap->va_mask & ATTR_XVATTR) {
|
|
|
|
if ((error = secpolicy_xvattr((xvattr_t *)vap,
|
|
|
|
crgetuid(cr), cr, vap->va_mode)) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2016-01-26 23:29:46 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
top:
|
|
|
|
*ipp = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new file object and update the directory
|
|
|
|
* to reference it.
|
|
|
|
*/
|
|
|
|
if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
|
|
|
|
if (have_acl)
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
|
|
|
|
cr, vsecp, &acl_ids)) != 0)
|
|
|
|
goto out;
|
|
|
|
have_acl = B_TRUE;
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
|
|
|
|
projid = zfs_inherit_projid(dzp);
|
|
|
|
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
|
2016-01-26 23:29:46 +03:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
error = SET_ERROR(EDQUOT);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
tx = dmu_tx_create(os);
|
|
|
|
|
|
|
|
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
|
|
|
|
ZFS_SA_BASE_ATTR_SIZE);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
2016-01-26 23:29:46 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
fuid_dirtied = zfsvfs->z_fuid_dirty;
|
2016-01-26 23:29:46 +03:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_txhold(zfsvfs, tx);
|
|
|
|
if (!zfsvfs->z_use_sa &&
|
2016-01-26 23:29:46 +03:00
|
|
|
acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
|
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
|
|
|
|
0, acl_ids.z_aclp->z_acl_bytes);
|
|
|
|
}
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, it's `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issued`, leading the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issued` can acheive the semtantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2016-01-26 23:29:46 +03:00
|
|
|
if (error) {
|
|
|
|
if (error == ERESTART) {
|
|
|
|
waited = B_TRUE;
|
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
dmu_tx_abort(tx);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2016-01-26 23:29:46 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
|
|
|
|
|
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_sync(zfsvfs, tx);
|
2016-01-26 23:29:46 +03:00
|
|
|
|
|
|
|
/* Add to unlinked set */
|
2019-08-13 16:58:02 +03:00
|
|
|
zp->z_unlinked = B_TRUE;
|
2016-01-26 23:29:46 +03:00
|
|
|
zfs_unlinked_add(zp, tx);
|
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
out:
|
|
|
|
|
|
|
|
if (error) {
|
|
|
|
if (zp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2016-01-26 23:29:46 +03:00
|
|
|
} else {
|
|
|
|
zfs_inode_update(dzp);
|
|
|
|
zfs_inode_update(zp);
|
|
|
|
*ipp = ZTOI(zp);
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2016-01-26 23:29:46 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Remove an entry from a directory.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: dzp - znode of directory to remove entry from.
|
2008-11-20 23:01:55 +03:00
|
|
|
* name - name of entry to remove.
|
|
|
|
* cr - credentials of caller.
|
2019-05-26 00:29:10 +03:00
|
|
|
* flags - case flags.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* dzp - ctime|mtime
|
2011-02-08 22:16:06 +03:00
|
|
|
* zp - ctime (if nlink > 0)
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/*
 * File-scope zero value.
 * NOTE(review): nothing in the visible part of this file references it;
 * presumably it is the value written to clear a znode's xattr object
 * pointer -- confirm against its users.
 */
uint64_t null_xattr = 0;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
znode_t *zp;
|
2010-08-27 01:24:34 +04:00
|
|
|
znode_t *xzp;
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
2015-08-21 04:43:10 +03:00
|
|
|
uint64_t acl_obj, xattr_obj;
|
2011-02-08 22:16:06 +03:00
|
|
|
uint64_t xattr_obj_unlinked = 0;
|
2010-08-27 01:24:34 +04:00
|
|
|
uint64_t obj = 0;
|
2016-07-14 17:44:38 +03:00
|
|
|
uint64_t links;
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
dmu_tx_t *tx;
|
2015-08-21 04:43:10 +03:00
|
|
|
boolean_t may_delete_now, delete_now = FALSE;
|
|
|
|
boolean_t unlinked, toobig = FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t txtype;
|
|
|
|
pathname_t *realnmp = NULL;
|
|
|
|
pathname_t realnm;
|
|
|
|
int error;
|
|
|
|
int zflg = ZEXISTS;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (name == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(dzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (flags & FIGNORECASE) {
|
|
|
|
zflg |= ZCILOOK;
|
|
|
|
pn_alloc(&realnm);
|
|
|
|
realnmp = &realnm;
|
|
|
|
}
|
|
|
|
|
|
|
|
top:
|
2010-08-27 01:24:34 +04:00
|
|
|
xattr_obj = 0;
|
|
|
|
xzp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Attempt to lock directory; fail if entry doesn't exist.
|
|
|
|
*/
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
|
|
|
|
NULL, realnmp))) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (realnmp)
|
|
|
|
pn_free(realnmp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Need to use rmdir for removing directories.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
if (S_ISDIR(ZTOI(zp)->i_mode)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EPERM);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-07-07 23:49:36 +04:00
|
|
|
mutex_enter(&zp->z_lock);
|
2019-12-11 22:53:57 +03:00
|
|
|
may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
|
|
|
|
!(zp->z_is_mapped);
|
2014-07-07 23:49:36 +04:00
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2015-08-21 04:43:10 +03:00
|
|
|
* We may delete the znode now, or we may put it in the unlinked set;
|
|
|
|
* it depends on whether we're the last link, and on whether there are
|
|
|
|
* other holds on the inode. So we dmu_tx_hold() the right things to
|
|
|
|
* allow for either case.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-08-27 01:24:34 +04:00
|
|
|
obj = zp->z_id;
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
|
|
|
zfs_sa_upgrade_txholds(tx, dzp);
|
2015-08-21 04:43:10 +03:00
|
|
|
if (may_delete_now) {
|
|
|
|
toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
|
|
|
|
/* if the file is too big, only hold_free a token amount */
|
|
|
|
dmu_tx_hold_free(tx, zp->z_id, 0,
|
|
|
|
(toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/* are there any extended attributes? */
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&xattr_obj, sizeof (xattr_obj));
|
2010-08-27 01:24:34 +04:00
|
|
|
if (error == 0 && xattr_obj) {
|
2017-03-08 03:21:37 +03:00
|
|
|
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(error);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
|
|
|
|
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2015-08-21 04:43:10 +03:00
|
|
|
mutex_enter(&zp->z_lock);
|
|
|
|
if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
|
|
|
|
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/* charge as an update -- would be nice not to charge at all */
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2014-07-07 23:49:36 +04:00
|
|
|
/*
|
2016-01-23 03:00:59 +03:00
|
|
|
* Mark this transaction as typically resulting in a net free of space
|
2014-07-07 23:49:36 +04:00
|
|
|
*/
|
2016-01-23 03:00:59 +03:00
|
|
|
dmu_tx_mark_netfree(tx);
|
2014-07-07 23:49:36 +04:00
|
|
|
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2017-02-08 02:57:50 +03:00
|
|
|
if (xzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(xzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
if (realnmp)
|
|
|
|
pn_free(realnmp);
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2017-02-08 02:57:50 +03:00
|
|
|
if (xzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(xzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove the directory entry.
|
|
|
|
*/
|
|
|
|
error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
|
|
|
|
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlinked) {
|
2010-08-27 01:24:34 +04:00
|
|
|
/*
|
|
|
|
* Hold z_lock so that we can make sure that the ACL obj
|
|
|
|
* hasn't changed. Could have been deleted due to
|
|
|
|
* zfs_sa_upgrade().
|
|
|
|
*/
|
|
|
|
mutex_enter(&zp->z_lock);
|
2017-03-08 03:21:37 +03:00
|
|
|
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
|
2015-08-21 04:43:10 +03:00
|
|
|
delete_now = may_delete_now && !toobig &&
|
2019-12-11 22:53:57 +03:00
|
|
|
atomic_read(&ZTOI(zp)->i_count) == 1 &&
|
|
|
|
!(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
|
|
|
|
zfs_external_acl(zp) == acl_obj;
|
2015-08-21 04:43:10 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (delete_now) {
|
|
|
|
if (xattr_obj_unlinked) {
|
2016-07-14 17:44:38 +03:00
|
|
|
ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
|
2015-08-21 04:43:10 +03:00
|
|
|
mutex_enter(&xzp->z_lock);
|
2019-08-13 16:58:02 +03:00
|
|
|
xzp->z_unlinked = B_TRUE;
|
2016-07-14 17:44:38 +03:00
|
|
|
clear_nlink(ZTOI(xzp));
|
|
|
|
links = 0;
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
|
2016-07-14 17:44:38 +03:00
|
|
|
&links, sizeof (links), tx);
|
2015-08-21 04:43:10 +03:00
|
|
|
ASSERT3U(error, ==, 0);
|
|
|
|
mutex_exit(&xzp->z_lock);
|
|
|
|
zfs_unlinked_add(xzp, tx);
|
|
|
|
|
|
|
|
if (zp->z_is_sa)
|
|
|
|
error = sa_remove(zp->z_sa_hdl,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_XATTR(zfsvfs), tx);
|
2015-08-21 04:43:10 +03:00
|
|
|
else
|
|
|
|
error = sa_update(zp->z_sa_hdl,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_XATTR(zfsvfs), &null_xattr,
|
2015-08-21 04:43:10 +03:00
|
|
|
sizeof (uint64_t), tx);
|
|
|
|
ASSERT0(error);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Add to the unlinked set because a new reference could be
|
|
|
|
* taken concurrently resulting in a deferred destruction.
|
|
|
|
*/
|
|
|
|
zfs_unlinked_add(zp, tx);
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
} else if (unlinked) {
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_exit(&zp->z_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_unlinked_add(zp, tx);
|
|
|
|
}
|
|
|
|
|
|
|
|
txtype = TX_REMOVE;
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
txtype |= TX_CI;
|
2019-08-14 06:21:27 +03:00
|
|
|
zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
out:
|
|
|
|
if (realnmp)
|
|
|
|
pn_free(realnmp);
|
|
|
|
|
|
|
|
zfs_dirent_unlock(dl);
|
2011-01-06 01:27:30 +03:00
|
|
|
zfs_inode_update(dzp);
|
2017-02-08 02:57:50 +03:00
|
|
|
zfs_inode_update(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-02-08 02:57:50 +03:00
|
|
|
if (delete_now)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2017-02-08 02:57:50 +03:00
|
|
|
else
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_zrele_async(zp);
|
2015-08-21 04:43:10 +03:00
|
|
|
|
|
|
|
if (xzp) {
|
|
|
|
zfs_inode_update(xzp);
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_zrele_async(xzp);
|
2015-08-21 04:43:10 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2019-12-11 22:53:57 +03:00
|
|
|
* Create a new directory and insert it into dzp using the name
|
2008-11-20 23:01:55 +03:00
|
|
|
* provided. Return a pointer to the inserted directory.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: dzp - znode of directory to add subdir to.
|
2008-11-20 23:01:55 +03:00
|
|
|
* dirname - name of new directory.
|
|
|
|
* vap - attributes of new directory.
|
|
|
|
* cr - credentials of caller.
|
2019-05-26 00:29:10 +03:00
|
|
|
* flags - case flags.
|
2008-11-20 23:01:55 +03:00
|
|
|
* vsecp - ACL to be set
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* OUT: zpp - znode of created directory.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* dzp - ctime|mtime updated
|
|
|
|
* zpp - ctime|mtime|atime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
|
2011-02-08 22:16:06 +03:00
|
|
|
cred_t *cr, int flags, vsecattr_t *vsecp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
znode_t *zp;
|
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
uint64_t txtype;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
|
|
|
int zf = ZNEW;
|
2008-12-03 23:09:06 +03:00
|
|
|
uid_t uid;
|
|
|
|
gid_t gid = crgetgid(cr);
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_t acl_ids;
|
2009-07-03 02:44:48 +04:00
|
|
|
boolean_t fuid_dirtied;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
ASSERT(S_ISDIR(vap->va_mode));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have an ephemeral id, ACL, or XVATTR then
|
|
|
|
* make sure file system is at proper version
|
|
|
|
*/
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
uid = crgetuid(cr);
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_use_fuids == B_FALSE &&
|
2011-02-08 22:16:06 +03:00
|
|
|
(vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (dirname == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(dzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (dzp->z_pflags & ZFS_XATTR) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_utf8 && u8_validate(dirname,
|
2008-11-20 23:01:55 +03:00
|
|
|
strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EILSEQ));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
zf |= ZCILOOK;
|
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
if (vap->va_mask & ATTR_XVATTR) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if ((error = secpolicy_xvattr((xvattr_t *)vap,
|
2011-02-08 22:16:06 +03:00
|
|
|
crgetuid(cr), cr, vap->va_mode)) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
|
|
|
|
vsecp, &acl_ids)) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (error);
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* First make sure the new directory doesn't exist.
|
2010-05-29 00:45:14 +04:00
|
|
|
*
|
|
|
|
* Existence is checked first to make sure we don't return
|
|
|
|
* EACCES instead of EEXIST which can cause some applications
|
|
|
|
* to fail.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
top:
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
|
|
|
|
NULL, NULL))) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
|
2009-08-18 22:43:27 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_dirent_unlock(dl);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EDQUOT));
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Add a new entry to the directory.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
|
|
|
|
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
|
2017-03-08 03:21:37 +03:00
|
|
|
fuid_dirtied = zfsvfs->z_fuid_dirty;
|
2009-07-03 02:44:48 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_txhold(zfsvfs, tx);
|
|
|
|
if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
|
|
|
|
acl_ids.z_aclp->z_acl_bytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
|
|
|
|
ZFS_SA_BASE_ATTR_SIZE);
|
|
|
|
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
goto top;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_abort(tx);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create new node.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Now put new name in parent dir.
|
|
|
|
*/
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC errors when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC when that fails.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
error = zfs_link_create(dl, zp, tx, ZNEW);
|
|
|
|
if (error != 0) {
|
|
|
|
zfs_znode_delete(zp, tx);
|
|
|
|
remove_inode_hash(ZTOI(zp));
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fuid_dirtied)
|
|
|
|
zfs_fuid_sync(zfsvfs, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
txtype |= TX_CI;
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
|
|
|
|
acl_ids.z_fuidp, vap);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC errors when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC when that fails.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
out:
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC errors when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC when that fails.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
if (error != 0) {
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC errors when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC when that fails.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
} else {
|
|
|
|
zfs_inode_update(dzp);
|
|
|
|
zfs_inode_update(zp);
|
|
|
|
}
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC errors when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and returns ENOSPC when that fails.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted, this patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hits any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
return (error);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove a directory subdir entry. If the current working
|
|
|
|
* directory is the same as the subdir to be removed, the
|
|
|
|
* remove will fail.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: dzp - znode of directory to remove from.
|
2008-11-20 23:01:55 +03:00
|
|
|
* name - name of directory to be removed.
|
2011-02-08 22:16:06 +03:00
|
|
|
* cwd - inode of current working directory.
|
2008-11-20 23:01:55 +03:00
|
|
|
* cr - credentials of caller.
|
|
|
|
* flags - case flags
|
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* dzp - ctime|mtime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
|
2011-02-08 22:16:06 +03:00
|
|
|
int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
|
|
|
znode_t *zp;
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
|
|
|
int zflg = ZEXISTS;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (name == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(dzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
zflg |= ZCILOOK;
|
|
|
|
top:
|
|
|
|
zp = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Attempt to lock directory; fail if entry doesn't exist.
|
|
|
|
*/
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
|
|
|
|
NULL, NULL))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if (!S_ISDIR(ZTOI(zp)->i_mode)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(ENOTDIR);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if (zp == cwd) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EINVAL);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2017-01-03 20:31:18 +03:00
|
|
|
* Grab a lock on the directory to make sure that no one is
|
2008-11-20 23:01:55 +03:00
|
|
|
* trying to add (or lookup) entries while we are removing it.
|
|
|
|
*/
|
|
|
|
rw_enter(&zp->z_name_lock, RW_WRITER);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Grab a lock on the parent pointer to make sure we play well
|
|
|
|
* with the treewalk and directory rename code.
|
|
|
|
*/
|
|
|
|
rw_enter(&zp->z_parent_lock, RW_WRITER);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
|
|
|
zfs_sa_upgrade_txholds(tx, dzp);
|
2016-08-30 16:03:05 +03:00
|
|
|
dmu_tx_mark_netfree(tx);
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics were
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
rw_exit(&zp->z_parent_lock);
|
|
|
|
rw_exit(&zp->z_name_lock);
|
|
|
|
zfs_dirent_unlock(dl);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping on a condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
uint64_t txtype = TX_RMDIR;
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
txtype |= TX_CI;
|
2019-08-14 06:21:27 +03:00
|
|
|
zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
|
|
|
|
B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
|
|
|
rw_exit(&zp->z_parent_lock);
|
|
|
|
rw_exit(&zp->z_name_lock);
|
|
|
|
out:
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
|
2012-09-12 22:16:08 +04:00
|
|
|
zfs_inode_update(dzp);
|
|
|
|
zfs_inode_update(zp);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2019-05-26 00:29:10 +03:00
|
|
|
* Read directory entries from the given directory cursor position and emit
|
|
|
|
* name and position for each entry.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2011-02-08 22:16:06 +03:00
|
|
|
* IN: ip - inode of directory to read.
|
2019-05-26 00:29:10 +03:00
|
|
|
* ctx - directory entry context.
|
|
|
|
* cr - credentials of caller.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2011-02-08 22:16:06 +03:00
|
|
|
* ip - atime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Note that the low 4 bits of the cookie returned by zap is always zero.
|
|
|
|
* This allows us to use the low range for "special" directory entries:
|
|
|
|
* We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
|
|
|
|
* we use the offset 2 for the '.zfs' directory.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2011-02-08 22:16:06 +03:00
|
|
|
int
|
2018-05-03 01:01:24 +03:00
|
|
|
zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
objset_t *os;
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t zap;
|
|
|
|
int error;
|
|
|
|
uint8_t prefetch;
|
2013-10-02 19:22:53 +04:00
|
|
|
uint8_t type;
|
2011-02-08 22:16:06 +03:00
|
|
|
int done = 0;
|
|
|
|
uint64_t parent;
|
2013-10-02 19:22:53 +04:00
|
|
|
uint64_t offset; /* must be unsigned; checks for < 1 */
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
|
2011-02-08 22:16:06 +03:00
|
|
|
&parent, sizeof (parent))) != 0)
|
|
|
|
goto out;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Quit if directory has been removed (posix)
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (zp->z_unlinked)
|
|
|
|
goto out;
|
|
|
|
|
2013-10-02 19:22:53 +04:00
|
|
|
error = 0;
|
2017-03-08 03:21:37 +03:00
|
|
|
os = zfsvfs->z_os;
|
2013-10-02 19:22:53 +04:00
|
|
|
offset = ctx->pos;
|
2008-11-20 23:01:55 +03:00
|
|
|
prefetch = zp->z_zn_prefetch;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the iterator cursor.
|
|
|
|
*/
|
2013-10-02 19:22:53 +04:00
|
|
|
if (offset <= 3) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Start iteration from the beginning of the directory.
|
|
|
|
*/
|
|
|
|
zap_cursor_init(&zc, os, zp->z_id);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* The offset is a serialized cursor.
|
|
|
|
*/
|
2013-10-02 19:22:53 +04:00
|
|
|
zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Transform to file-system independent format
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
while (!done) {
|
|
|
|
uint64_t objnum;
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Special case `.', `..', and `.zfs'.
|
|
|
|
*/
|
2013-10-02 19:22:53 +04:00
|
|
|
if (offset == 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strcpy(zap.za_name, ".");
|
|
|
|
zap.za_normalization_conflict = 0;
|
|
|
|
objnum = zp->z_id;
|
2013-10-02 19:22:53 +04:00
|
|
|
type = DT_DIR;
|
|
|
|
} else if (offset == 1) {
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strcpy(zap.za_name, "..");
|
|
|
|
zap.za_normalization_conflict = 0;
|
2010-05-29 00:45:14 +04:00
|
|
|
objnum = parent;
|
2013-10-02 19:22:53 +04:00
|
|
|
type = DT_DIR;
|
|
|
|
} else if (offset == 2 && zfs_show_ctldir(zp)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
|
|
|
|
zap.za_normalization_conflict = 0;
|
|
|
|
objnum = ZFSCTL_INO_ROOT;
|
2013-10-02 19:22:53 +04:00
|
|
|
type = DT_DIR;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Grab next entry.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if ((error = zap_cursor_retrieve(&zc, &zap))) {
|
|
|
|
if (error == ENOENT)
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
else
|
|
|
|
goto update;
|
|
|
|
}
|
|
|
|
|
2012-01-28 01:43:23 +04:00
|
|
|
/*
|
|
|
|
* Allow multiple entries provided the first entry is
|
|
|
|
* the object id. Non-zpl consumers may safely make
|
|
|
|
* use of the additional space.
|
|
|
|
*
|
|
|
|
* XXX: This should be a feature flag for compatibility
|
|
|
|
*/
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zap.za_integer_length != 8 ||
|
2012-01-28 01:43:23 +04:00
|
|
|
zap.za_num_integers == 0) {
|
2008-11-20 23:01:55 +03:00
|
|
|
cmn_err(CE_WARN, "zap_readdir: bad directory "
|
2012-01-28 01:43:23 +04:00
|
|
|
"entry, obj = %lld, offset = %lld, "
|
|
|
|
"length = %d, num = %lld\n",
|
2008-11-20 23:01:55 +03:00
|
|
|
(u_longlong_t)zp->z_id,
|
2013-10-02 19:22:53 +04:00
|
|
|
(u_longlong_t)offset,
|
2012-01-28 01:43:23 +04:00
|
|
|
zap.za_integer_length,
|
|
|
|
(u_longlong_t)zap.za_num_integers);
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(ENXIO);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto update;
|
|
|
|
}
|
|
|
|
|
|
|
|
objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
|
2013-10-02 19:22:53 +04:00
|
|
|
type = ZFS_DIRENT_TYPE(zap.za_first_integer);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2013-08-07 16:53:45 +04:00
|
|
|
|
2018-05-03 01:01:24 +03:00
|
|
|
done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
|
2013-10-02 19:22:53 +04:00
|
|
|
objnum, type);
|
2013-08-07 16:53:45 +04:00
|
|
|
if (done)
|
2008-11-20 23:01:55 +03:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* Prefetch znode */
|
2011-02-08 22:16:06 +03:00
|
|
|
if (prefetch) {
|
2015-12-22 04:31:57 +03:00
|
|
|
dmu_prefetch(os, objnum, 0, 0, 0,
|
|
|
|
ZIO_PRIORITY_SYNC_READ);
|
2011-02-08 22:16:06 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2013-10-02 19:22:53 +04:00
|
|
|
/*
|
|
|
|
* Move to the next entry, fill in the previous offset.
|
|
|
|
*/
|
|
|
|
if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
|
2008-11-20 23:01:55 +03:00
|
|
|
zap_cursor_advance(&zc);
|
2013-10-02 19:22:53 +04:00
|
|
|
offset = zap_cursor_serialize(&zc);
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
2013-10-02 19:22:53 +04:00
|
|
|
offset += 1;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2013-10-02 19:22:53 +04:00
|
|
|
ctx->pos = offset;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
|
|
|
|
|
|
|
|
update:
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
if (error == ENOENT)
|
|
|
|
error = 0;
|
2011-02-08 22:16:06 +03:00
|
|
|
out:
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2012-12-20 21:55:47 +04:00
|
|
|
ulong_t zfs_fsync_sync_cnt = 4;
|
|
|
|
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2012-12-20 21:55:47 +04:00
|
|
|
(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
|
|
|
|
ZFS_ENTER(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zil_commit(zfsvfs->z_log, zp->z_id);
|
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2015-04-01 21:18:05 +03:00
|
|
|
tsd_set(zfs_fsyncer_key, NULL);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2011-07-10 02:44:16 +04:00
|
|
|
/*
|
|
|
|
* Get the basic file attributes and place them in the provided kstat
|
|
|
|
* structure. The inode is assumed to be the authoritative source
|
|
|
|
* for most of the attributes. However, the znode currently has the
|
|
|
|
* authoritative atime, blksize, and block count.
|
|
|
|
*
|
|
|
|
* IN: ip - inode of file.
|
|
|
|
*
|
|
|
|
* OUT: sp - kstat values.
|
|
|
|
*
|
|
|
|
* RETURN: 0 (always succeeds)
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
|
|
|
int
|
|
|
|
zfs_getattr_fast(struct inode *ip, struct kstat *sp)
|
|
|
|
{
|
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2014-01-08 03:17:24 +04:00
|
|
|
uint32_t blksize;
|
|
|
|
u_longlong_t nblocks;
|
2011-07-10 02:44:16 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2011-11-03 09:48:13 +04:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2011-07-10 02:44:16 +04:00
|
|
|
mutex_enter(&zp->z_lock);
|
|
|
|
|
|
|
|
generic_fillattr(ip, sp);
|
2019-05-09 02:40:51 +03:00
|
|
|
/*
|
|
|
|
* +1 link count for root inode with visible '.zfs' directory.
|
|
|
|
*/
|
|
|
|
if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
|
|
|
|
if (sp->nlink < ZFS_LINK_MAX)
|
|
|
|
sp->nlink++;
|
2011-07-10 02:44:16 +04:00
|
|
|
|
2014-01-08 03:17:24 +04:00
|
|
|
sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
|
|
|
|
sp->blksize = blksize;
|
|
|
|
sp->blocks = nblocks;
|
|
|
|
|
2011-07-10 02:44:16 +04:00
|
|
|
if (unlikely(zp->z_blksz == 0)) {
|
|
|
|
/*
|
|
|
|
* Block size hasn't been set; suggest maximal I/O transfers.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
sp->blksize = zfsvfs->z_max_blksz;
|
2011-07-10 02:44:16 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
|
2013-07-30 13:59:34 +04:00
|
|
|
/*
|
|
|
|
* Required to prevent NFS client from detecting different inode
|
|
|
|
* numbers of snapshot root dentry before and after snapshot mount.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_issnap) {
|
2013-07-30 13:59:34 +04:00
|
|
|
if (ip->i_sb->s_root->d_inode == ip)
|
|
|
|
sp->ino = ZFSCTL_INO_SNAPDIRS -
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_objset_id(zfsvfs->z_os);
|
2013-07-30 13:59:34 +04:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2011-11-03 09:48:13 +04:00
|
|
|
|
2011-07-10 02:44:16 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
/*
|
|
|
|
* For the operation of changing file's user/group/project, we need to
|
|
|
|
* handle not only the main object that is assigned to the file directly,
|
|
|
|
* but also the ones that are used by the file via hidden xattr directory.
|
|
|
|
*
|
|
|
|
* Because the xattr directory may contains many EA entries, as to it may
|
|
|
|
* be impossible to change all of them via the transaction of changing the
|
|
|
|
* main object's user/group/project attributes. Then we have to change them
|
|
|
|
* via other multiple independent transactions one by one. It may be not good
|
|
|
|
* solution, but we have no better idea yet.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zfs_setattr_dir(znode_t *dzp)
|
|
|
|
{
|
|
|
|
struct inode *dxip = ZTOI(dzp);
|
|
|
|
struct inode *xip = NULL;
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2018-02-14 01:54:54 +03:00
|
|
|
objset_t *os = zfsvfs->z_os;
|
|
|
|
zap_cursor_t zc;
|
|
|
|
zap_attribute_t zap;
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
znode_t *zp;
|
|
|
|
dmu_tx_t *tx = NULL;
|
|
|
|
uint64_t uid, gid;
|
|
|
|
sa_bulk_attr_t bulk[4];
|
2019-04-11 01:38:21 +03:00
|
|
|
int count;
|
2018-02-14 01:54:54 +03:00
|
|
|
int err;
|
|
|
|
|
|
|
|
zap_cursor_init(&zc, os, dzp->z_id);
|
|
|
|
while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
|
2019-04-11 01:38:21 +03:00
|
|
|
count = 0;
|
2018-02-14 01:54:54 +03:00
|
|
|
if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
|
|
|
|
err = ENXIO;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
|
|
|
|
ZEXISTS, NULL, NULL);
|
|
|
|
if (err == ENOENT)
|
|
|
|
goto next;
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
|
|
|
|
xip = ZTOI(zp);
|
|
|
|
if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
|
|
|
|
KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
|
|
|
|
zp->z_projid == dzp->z_projid)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
tx = dmu_tx_create(os);
|
|
|
|
if (!(zp->z_pflags & ZFS_PROJID))
|
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
|
|
|
|
else
|
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
|
|
|
|
|
|
|
err = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
|
|
|
|
mutex_enter(&dzp->z_lock);
|
|
|
|
|
|
|
|
if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
|
|
|
|
xip->i_uid = dxip->i_uid;
|
|
|
|
uid = zfs_uid_read(dxip);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
|
|
|
|
&uid, sizeof (uid));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
|
|
|
|
xip->i_gid = dxip->i_gid;
|
|
|
|
gid = zfs_gid_read(dxip);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
|
|
|
|
&gid, sizeof (gid));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zp->z_projid != dzp->z_projid) {
|
|
|
|
if (!(zp->z_pflags & ZFS_PROJID)) {
|
|
|
|
zp->z_pflags |= ZFS_PROJID;
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count,
|
|
|
|
SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
|
|
|
|
sizeof (zp->z_pflags));
|
|
|
|
}
|
|
|
|
|
|
|
|
zp->z_projid = dzp->z_projid;
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
|
|
|
|
NULL, &zp->z_projid, sizeof (zp->z_projid));
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(&dzp->z_lock);
|
|
|
|
|
|
|
|
if (likely(count > 0)) {
|
|
|
|
err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
} else {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
}
|
|
|
|
tx = NULL;
|
|
|
|
if (err != 0 && err != ENOENT)
|
|
|
|
break;
|
|
|
|
|
|
|
|
next:
|
2019-12-11 22:53:57 +03:00
|
|
|
if (zp) {
|
|
|
|
zrele(zp);
|
|
|
|
zp = NULL;
|
2018-02-14 01:54:54 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
}
|
|
|
|
zap_cursor_advance(&zc);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tx)
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
if (zp) {
|
|
|
|
zrele(zp);
|
2018-02-14 01:54:54 +03:00
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
}
|
|
|
|
zap_cursor_fini(&zc);
|
|
|
|
|
|
|
|
return (err == ENOENT ? 0 : err);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Set the file attributes to the values contained in the
|
|
|
|
* vattr structure.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: zp - znode of file to be modified.
|
2008-11-20 23:01:55 +03:00
|
|
|
* vap - new attribute values.
|
2011-03-01 23:24:09 +03:00
|
|
|
* If ATTR_XVATTR set, then optional attrs are being set
|
2008-11-20 23:01:55 +03:00
|
|
|
* flags - ATTR_UTIME set if non-default time values provided.
|
|
|
|
* - ATTR_NOACLCHECK (CIFS context only).
|
|
|
|
* cr - credentials of caller.
|
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2011-02-08 22:16:06 +03:00
|
|
|
* ip - ctime updated, mtime updated if size changed.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
struct inode *ip;
|
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2018-02-14 01:54:54 +03:00
|
|
|
objset_t *os = zfsvfs->z_os;
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
vattr_t oldva;
|
2011-03-03 01:18:40 +03:00
|
|
|
xvattr_t *tmpxvattr;
|
2011-03-01 23:24:09 +03:00
|
|
|
uint_t mask = vap->va_mask;
|
2013-02-11 10:21:05 +04:00
|
|
|
uint_t saved_mask = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
int trim_mask = 0;
|
|
|
|
uint64_t new_mode;
|
2016-08-04 00:31:08 +03:00
|
|
|
uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
|
2010-08-27 01:24:34 +04:00
|
|
|
uint64_t xattr_obj;
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfsutil mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
uint64_t mtime[2], ctime[2], atime[2];
|
2018-02-14 01:54:54 +03:00
|
|
|
uint64_t projid = ZFS_INVALID_PROJID;
|
2008-11-20 23:01:55 +03:00
|
|
|
znode_t *attrzp;
|
|
|
|
int need_policy = FALSE;
|
2018-02-14 01:54:54 +03:00
|
|
|
int err, err2 = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_fuid_info_t *fuidp = NULL;
|
2011-03-01 23:24:09 +03:00
|
|
|
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
|
|
|
|
xoptattr_t *xoap;
|
|
|
|
zfs_acl_t *aclp;
|
2008-11-20 23:01:55 +03:00
|
|
|
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
|
2010-05-29 00:45:14 +04:00
|
|
|
boolean_t fuid_dirtied = B_FALSE;
|
2018-02-14 01:54:54 +03:00
|
|
|
boolean_t handle_eadir = B_FALSE;
|
2011-03-09 21:48:49 +03:00
|
|
|
sa_bulk_attr_t *bulk, *xattr_bulk;
|
2018-02-14 01:54:54 +03:00
|
|
|
int count = 0, xattr_count = 0, bulks = 8;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (mask == 0)
|
|
|
|
return (0);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
2019-12-11 22:53:57 +03:00
|
|
|
ip = ZTOI(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
/*
|
|
|
|
* If this is a xvattr_t, then get a pointer to the structure of
|
|
|
|
* optional attributes. If this is NULL, then we have a vattr_t.
|
|
|
|
*/
|
|
|
|
xoap = xva_getxoptattr(xvap);
|
|
|
|
if (xoap != NULL && (mask & ATTR_XVATTR)) {
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
|
|
|
|
if (!dmu_objset_projectquota_enabled(os) ||
|
|
|
|
(!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
|
|
|
return (SET_ERROR(ENOTSUP));
|
|
|
|
}
|
|
|
|
|
|
|
|
projid = xoap->xoa_projid;
|
|
|
|
if (unlikely(projid == ZFS_INVALID_PROJID)) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
|
|
|
|
projid = ZFS_INVALID_PROJID;
|
|
|
|
else
|
|
|
|
need_policy = TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
|
2018-03-05 23:56:27 +03:00
|
|
|
(xoap->xoa_projinherit !=
|
|
|
|
((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
|
2018-02-14 01:54:54 +03:00
|
|
|
(!dmu_objset_projectquota_enabled(os) ||
|
|
|
|
(!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
|
2018-03-05 23:56:27 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
|
|
|
return (SET_ERROR(ENOTSUP));
|
2018-02-14 01:54:54 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure that if we have ephemeral uid/gid or xvattr specified
|
|
|
|
* that file system is at proper version level
|
|
|
|
*/
|
2011-03-01 23:24:09 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_use_fuids == B_FALSE &&
|
2011-03-01 23:24:09 +03:00
|
|
|
(((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
|
|
|
|
((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
|
|
|
|
(mask & ATTR_XVATTR))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EISDIR));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
|
2011-03-03 01:18:40 +03:00
|
|
|
xva_init(tmpxvattr);
|
2011-03-01 23:24:09 +03:00
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
|
|
|
|
xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
|
2011-03-09 21:48:49 +03:00
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
/*
|
|
|
|
* Immutable files can only alter immutable bit and atime
|
|
|
|
*/
|
|
|
|
if ((zp->z_pflags & ZFS_IMMUTABLE) &&
|
|
|
|
((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
|
|
|
|
((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EPERM);
|
2011-03-03 01:18:40 +03:00
|
|
|
goto out3;
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EPERM);
|
2011-03-03 01:18:40 +03:00
|
|
|
goto out3;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
/*
|
|
|
|
* Verify timestamps doesn't overflow 32 bits.
|
|
|
|
* ZFS can handle large timestamps, but 32bit syscalls can't
|
|
|
|
* handle times greater than 2039. This check should be removed
|
|
|
|
* once large timestamps are fully supported.
|
|
|
|
*/
|
|
|
|
if (mask & (ATTR_ATIME | ATTR_MTIME)) {
|
2013-11-01 23:26:11 +04:00
|
|
|
if (((mask & ATTR_ATIME) &&
|
|
|
|
TIMESPEC_OVERFLOW(&vap->va_atime)) ||
|
|
|
|
((mask & ATTR_MTIME) &&
|
|
|
|
TIMESPEC_OVERFLOW(&vap->va_mtime))) {
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EOVERFLOW);
|
2011-03-03 01:18:40 +03:00
|
|
|
goto out3;
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
top:
|
|
|
|
attrzp = NULL;
|
2010-08-27 01:24:34 +04:00
|
|
|
aclp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2009-08-18 22:43:27 +04:00
|
|
|
/* Can this be moved to before the top label? */
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfs_is_readonly(zfsvfs)) {
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EROFS);
|
2011-03-03 01:18:40 +03:00
|
|
|
goto out3;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First validate permissions
|
|
|
|
*/
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_SIZE) {
|
2008-11-20 23:01:55 +03:00
|
|
|
err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
|
2011-03-03 01:18:40 +03:00
|
|
|
if (err)
|
|
|
|
goto out3;
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* XXX - Note, we are not providing any open
|
|
|
|
* mode flags here (like FNDELAY), so we may
|
|
|
|
* block if there are locks present... this
|
|
|
|
* should be addressed in openat().
|
|
|
|
*/
|
2008-12-03 23:09:06 +03:00
|
|
|
/* XXX - would it be OK to generate a log record here? */
|
2011-03-01 23:24:09 +03:00
|
|
|
err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
|
2011-03-03 01:18:40 +03:00
|
|
|
if (err)
|
|
|
|
goto out3;
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
if (mask & (ATTR_ATIME|ATTR_MTIME) ||
|
|
|
|
((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_READONLY) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
|
|
|
|
need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
|
|
|
|
skipaclchk, cr);
|
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & (ATTR_UID|ATTR_GID)) {
|
|
|
|
int idmask = (mask & (ATTR_UID|ATTR_GID));
|
2008-11-20 23:01:55 +03:00
|
|
|
int take_owner;
|
|
|
|
int take_group;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NOTE: even if a new mode is being set,
|
|
|
|
* we may clear S_ISUID/S_ISGID bits.
|
|
|
|
*/
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (!(mask & ATTR_MODE))
|
2011-03-01 23:24:09 +03:00
|
|
|
vap->va_mode = zp->z_mode;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Take ownership or chgrp to group we are a member of
|
|
|
|
*/
|
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
|
2011-02-08 22:16:06 +03:00
|
|
|
take_group = (mask & ATTR_GID) &&
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_groupmember(zfsvfs, vap->va_gid, cr);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2011-03-01 23:24:09 +03:00
|
|
|
* If both ATTR_UID and ATTR_GID are set then take_owner and
|
2008-11-20 23:01:55 +03:00
|
|
|
* take_group must both be set in order to allow taking
|
|
|
|
* ownership.
|
|
|
|
*
|
|
|
|
* Otherwise, send the check through secpolicy_vnode_setattr()
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (((idmask == (ATTR_UID|ATTR_GID)) &&
|
|
|
|
take_owner && take_group) ||
|
|
|
|
((idmask == ATTR_UID) && take_owner) ||
|
|
|
|
((idmask == ATTR_GID) && take_group)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
|
|
|
|
skipaclchk, cr) == 0) {
|
|
|
|
/*
|
|
|
|
* Remove setuid/setgid for non-privileged users
|
|
|
|
*/
|
2011-03-01 23:24:09 +03:00
|
|
|
(void) secpolicy_setid_clear(vap, cr);
|
2011-02-08 22:16:06 +03:00
|
|
|
trim_mask = (mask & (ATTR_UID|ATTR_GID));
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
need_policy = TRUE;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
need_policy = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&zp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
oldva.va_mode = zp->z_mode;
|
2010-08-27 01:24:34 +04:00
|
|
|
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
|
2011-03-01 23:24:09 +03:00
|
|
|
if (mask & ATTR_XVATTR) {
|
|
|
|
/*
|
|
|
|
* Update xvattr mask to include only those attributes
|
|
|
|
* that are actually changing.
|
|
|
|
*
|
|
|
|
* the bits will be restored prior to actually setting
|
|
|
|
* the attributes so the caller thinks they were set.
|
|
|
|
*/
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
|
|
|
|
if (xoap->xoa_appendonly !=
|
|
|
|
((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_APPENDONLY);
|
2011-03-03 01:18:40 +03:00
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
|
|
|
|
if (xoap->xoa_projinherit !=
|
|
|
|
((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
|
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
|
|
|
|
if (xoap->xoa_nounlink !=
|
|
|
|
((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_NOUNLINK);
|
2011-03-03 01:18:40 +03:00
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
|
|
|
|
if (xoap->xoa_immutable !=
|
|
|
|
((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
|
2011-03-03 01:18:40 +03:00
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
|
|
|
|
if (xoap->xoa_nodump !=
|
|
|
|
((zp->z_pflags & ZFS_NODUMP) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_NODUMP);
|
2011-03-03 01:18:40 +03:00
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
|
|
|
|
if (xoap->xoa_av_modified !=
|
|
|
|
((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
|
2011-03-03 01:18:40 +03:00
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
|
|
|
|
if ((!S_ISREG(ip->i_mode) &&
|
|
|
|
xoap->xoa_av_quarantined) ||
|
|
|
|
xoap->xoa_av_quarantined !=
|
|
|
|
((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
} else {
|
|
|
|
XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
|
2011-03-03 01:18:40 +03:00
|
|
|
XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
|
|
|
|
mutex_exit(&zp->z_lock);
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EPERM);
|
2011-03-03 01:18:40 +03:00
|
|
|
goto out3;
|
2011-03-01 23:24:09 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (need_policy == FALSE &&
|
|
|
|
(XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
|
|
|
|
need_policy = TRUE;
|
|
|
|
}
|
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_MODE) {
|
2008-11-20 23:01:55 +03:00
|
|
|
if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
|
2011-03-01 23:24:09 +03:00
|
|
|
err = secpolicy_setid_setsticky_clear(ip, vap,
|
2008-11-20 23:01:55 +03:00
|
|
|
&oldva, cr);
|
2011-03-03 01:18:40 +03:00
|
|
|
if (err)
|
|
|
|
goto out3;
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
trim_mask |= ATTR_MODE;
|
2008-11-20 23:01:55 +03:00
|
|
|
} else {
|
|
|
|
need_policy = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (need_policy) {
|
|
|
|
/*
|
|
|
|
* If trim_mask is set then take ownership
|
|
|
|
* has been granted or write_acl is present and user
|
|
|
|
* has the ability to modify mode. In that case remove
|
|
|
|
* UID|GID and or MODE from mask so that
|
|
|
|
* secpolicy_vnode_setattr() doesn't revoke it.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (trim_mask) {
|
2011-03-01 23:24:09 +03:00
|
|
|
saved_mask = vap->va_mask;
|
|
|
|
vap->va_mask &= ~trim_mask;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2011-03-01 23:24:09 +03:00
|
|
|
err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
|
2008-11-20 23:01:55 +03:00
|
|
|
(int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
|
2011-03-03 01:18:40 +03:00
|
|
|
if (err)
|
|
|
|
goto out3;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
if (trim_mask)
|
2011-03-01 23:24:09 +03:00
|
|
|
vap->va_mask |= saved_mask;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* secpolicy_vnode_setattr, or take ownership may have
|
|
|
|
* changed va_mask
|
|
|
|
*/
|
2011-03-01 23:24:09 +03:00
|
|
|
mask = vap->va_mask;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
|
|
|
|
handle_eadir = B_TRUE;
|
2017-03-08 03:21:37 +03:00
|
|
|
err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
|
2010-08-27 01:24:34 +04:00
|
|
|
&xattr_obj, sizeof (xattr_obj));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (err == 0 && xattr_obj) {
|
2011-02-08 22:16:06 +03:00
|
|
|
err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (err)
|
|
|
|
goto out2;
|
|
|
|
}
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_UID) {
|
2017-03-08 03:21:37 +03:00
|
|
|
new_kuid = zfs_fuid_create(zfsvfs,
|
2011-03-01 23:24:09 +03:00
|
|
|
(uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
|
2016-08-04 00:31:08 +03:00
|
|
|
if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
|
2018-02-14 01:54:54 +03:00
|
|
|
zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
|
|
|
|
new_kuid)) {
|
2010-08-27 01:24:34 +04:00
|
|
|
if (attrzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(attrzp);
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EDQUOT);
|
2010-05-29 00:45:14 +04:00
|
|
|
goto out2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_GID) {
|
2017-03-08 03:21:37 +03:00
|
|
|
new_kgid = zfs_fuid_create(zfsvfs,
|
|
|
|
(uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
|
2016-08-04 00:31:08 +03:00
|
|
|
if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
|
2018-02-14 01:54:54 +03:00
|
|
|
zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
|
|
|
|
new_kgid)) {
|
2010-08-27 01:24:34 +04:00
|
|
|
if (attrzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(attrzp);
|
2017-08-03 07:16:12 +03:00
|
|
|
err = SET_ERROR(EDQUOT);
|
2010-05-29 00:45:14 +04:00
|
|
|
goto out2;
|
|
|
|
}
|
|
|
|
}
|
2018-02-14 01:54:54 +03:00
|
|
|
|
|
|
|
if (projid != ZFS_INVALID_PROJID &&
|
|
|
|
zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
|
|
|
|
if (attrzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(attrzp);
|
2018-02-14 01:54:54 +03:00
|
|
|
err = EDQUOT;
|
|
|
|
goto out2;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2018-02-14 01:54:54 +03:00
|
|
|
tx = dmu_tx_create(os);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_MODE) {
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t pmode = zp->z_mode;
|
2010-08-27 01:24:34 +04:00
|
|
|
uint64_t acl_obj;
|
2011-03-01 23:24:09 +03:00
|
|
|
new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2019-11-01 20:19:11 +03:00
|
|
|
if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
|
|
|
|
goto out;
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_enter(&zp->z_lock);
|
|
|
|
if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* Are we upgrading ACL from old V0 format
|
|
|
|
* to V1 format?
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
|
2010-08-27 01:24:34 +04:00
|
|
|
zfs_znode_acl_version(zp) ==
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_ACL_VERSION_INITIAL) {
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_tx_hold_free(tx, acl_obj, 0,
|
2008-11-20 23:01:55 +03:00
|
|
|
DMU_OBJECT_END);
|
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
|
|
|
|
0, aclp->z_acl_bytes);
|
|
|
|
} else {
|
2010-08-27 01:24:34 +04:00
|
|
|
dmu_tx_hold_write(tx, acl_obj, 0,
|
2008-11-20 23:01:55 +03:00
|
|
|
aclp->z_acl_bytes);
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
|
|
|
|
0, aclp->z_acl_bytes);
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_exit(&zp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
|
|
|
|
} else {
|
2018-02-14 01:54:54 +03:00
|
|
|
if (((mask & ATTR_XVATTR) &&
|
|
|
|
XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
|
|
|
|
(projid != ZFS_INVALID_PROJID &&
|
|
|
|
!(zp->z_pflags & ZFS_PROJID)))
|
2011-03-01 23:24:09 +03:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
|
|
|
|
else
|
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (attrzp) {
|
|
|
|
dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
fuid_dirtied = zfsvfs->z_fuid_dirty;
|
2010-05-29 00:45:14 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_txhold(zfsvfs, tx);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
|
|
|
|
2013-11-23 03:13:18 +04:00
|
|
|
err = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (err)
|
2009-07-03 02:44:48 +04:00
|
|
|
goto out;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
count = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Set each attribute requested.
|
|
|
|
* We group settings according to the locks they need to acquire.
|
|
|
|
*
|
|
|
|
* Note: you cannot set ctime directly, although it will be
|
|
|
|
* updated as a side-effect of calling this function.
|
|
|
|
*/
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
|
|
|
|
/*
|
|
|
|
* For the existed object that is upgraded from old system,
|
|
|
|
* its on-disk layout has no slot for the project ID attribute.
|
|
|
|
* But quota accounting logic needs to access related slots by
|
|
|
|
* offset directly. So we need to adjust old objects' layout
|
|
|
|
* to make the project ID to some unified and fixed offset.
|
|
|
|
*/
|
|
|
|
if (attrzp)
|
|
|
|
err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
|
|
|
|
if (err == 0)
|
|
|
|
err = sa_add_projid(zp->z_sa_hdl, tx, projid);
|
|
|
|
|
|
|
|
if (unlikely(err == EEXIST))
|
|
|
|
err = 0;
|
|
|
|
else if (err != 0)
|
|
|
|
goto out;
|
|
|
|
else
|
|
|
|
projid = ZFS_INVALID_PROJID;
|
|
|
|
}
|
2010-08-27 01:24:34 +04:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_enter(&zp->z_acl_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&zp->z_lock);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&zp->z_pflags, sizeof (zp->z_pflags));
|
|
|
|
|
|
|
|
if (attrzp) {
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_enter(&attrzp->z_acl_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
mutex_enter(&attrzp->z_lock);
|
|
|
|
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
|
2010-05-29 00:45:14 +04:00
|
|
|
sizeof (attrzp->z_pflags));
|
2018-02-14 01:54:54 +03:00
|
|
|
if (projid != ZFS_INVALID_PROJID) {
|
|
|
|
attrzp->z_projid = projid;
|
|
|
|
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
|
|
|
|
SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
|
|
|
|
sizeof (attrzp->z_projid));
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & (ATTR_UID|ATTR_GID)) {
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_UID) {
|
2016-08-04 00:31:08 +03:00
|
|
|
ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
|
|
|
|
new_uid = zfs_uid_read(ZTOI(zp));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&new_uid, sizeof (new_uid));
|
|
|
|
if (attrzp) {
|
|
|
|
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_UID(zfsvfs), NULL, &new_uid,
|
2010-05-29 00:45:14 +04:00
|
|
|
sizeof (new_uid));
|
2016-05-22 14:15:57 +03:00
|
|
|
ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_GID) {
|
2016-08-04 00:31:08 +03:00
|
|
|
ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
|
|
|
|
new_gid = zfs_gid_read(ZTOI(zp));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, &new_gid, sizeof (new_gid));
|
|
|
|
if (attrzp) {
|
|
|
|
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_GID(zfsvfs), NULL, &new_gid,
|
2010-05-29 00:45:14 +04:00
|
|
|
sizeof (new_gid));
|
2016-08-04 00:31:08 +03:00
|
|
|
ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
}
|
2011-02-08 22:16:06 +03:00
|
|
|
if (!(mask & ATTR_MODE)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
NULL, &new_mode, sizeof (new_mode));
|
|
|
|
new_mode = zp->z_mode;
|
|
|
|
}
|
|
|
|
err = zfs_acl_chown_setattr(zp);
|
|
|
|
ASSERT(err == 0);
|
|
|
|
if (attrzp) {
|
|
|
|
err = zfs_acl_chown_setattr(attrzp);
|
|
|
|
ASSERT(err == 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & ATTR_MODE) {
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
&new_mode, sizeof (new_mode));
|
2016-09-28 00:08:52 +03:00
|
|
|
zp->z_mode = ZTOI(zp)->i_mode = new_mode;
|
2011-02-24 02:03:30 +03:00
|
|
|
ASSERT3P(aclp, !=, NULL);
|
2009-07-03 02:44:48 +04:00
|
|
|
err = zfs_aclset_common(zp, aclp, cr, tx);
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(err);
|
2010-08-27 01:24:34 +04:00
|
|
|
if (zp->z_acl_cached)
|
|
|
|
zfs_acl_free(zp->z_acl_cached);
|
2009-08-18 22:43:27 +04:00
|
|
|
zp->z_acl_cached = aclp;
|
|
|
|
aclp = NULL;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2016-04-01 02:52:03 +03:00
|
|
|
if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
|
2019-08-13 16:58:02 +03:00
|
|
|
zp->z_atime_dirty = B_FALSE;
|
2016-04-01 02:52:03 +03:00
|
|
|
ZFS_TIME_ENCODE(&ip->i_atime, atime);
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
&atime, sizeof (atime));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-11-13 20:24:26 +03:00
|
|
|
if (mask & (ATTR_MTIME | ATTR_SIZE)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
|
2020-02-06 23:37:25 +03:00
|
|
|
ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
|
|
|
|
vap->va_mtime, ZTOI(zp));
|
2016-09-12 22:35:56 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
|
2010-05-29 00:45:14 +04:00
|
|
|
mtime, sizeof (mtime));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-11-13 20:24:26 +03:00
|
|
|
if (mask & (ATTR_CTIME | ATTR_SIZE)) {
|
2016-09-12 22:35:56 +03:00
|
|
|
ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
|
2020-02-06 23:37:25 +03:00
|
|
|
ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
|
|
|
|
ZTOI(zp));
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
|
2016-09-12 22:35:56 +03:00
|
|
|
ctime, sizeof (ctime));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
2016-09-12 22:35:56 +03:00
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (projid != ZFS_INVALID_PROJID) {
|
|
|
|
zp->z_projid = projid;
|
|
|
|
SA_ADD_BULK_ATTR(bulk, count,
|
|
|
|
SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
|
|
|
|
sizeof (zp->z_projid));
|
|
|
|
}
|
|
|
|
|
2016-09-12 22:35:56 +03:00
|
|
|
if (attrzp && mask) {
|
|
|
|
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
|
2016-09-12 22:35:56 +03:00
|
|
|
sizeof (ctime));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Do this after setting timestamps to prevent timestamp
|
|
|
|
* update from toggling bit
|
|
|
|
*/
|
|
|
|
|
2011-03-01 23:24:09 +03:00
|
|
|
if (xoap && (mask & ATTR_XVATTR)) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* restore trimmed off masks
|
|
|
|
* so that return masks can be set for caller.
|
|
|
|
*/
|
|
|
|
|
2011-03-03 01:18:40 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
XVA_SET_REQ(xvap, XAT_APPENDONLY);
|
|
|
|
}
|
2011-03-03 01:18:40 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
XVA_SET_REQ(xvap, XAT_NOUNLINK);
|
|
|
|
}
|
2011-03-03 01:18:40 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
XVA_SET_REQ(xvap, XAT_IMMUTABLE);
|
|
|
|
}
|
2011-03-03 01:18:40 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
XVA_SET_REQ(xvap, XAT_NODUMP);
|
|
|
|
}
|
2011-03-03 01:18:40 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
|
|
|
|
}
|
2011-03-03 01:18:40 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
|
2011-03-01 23:24:09 +03:00
|
|
|
XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
|
|
|
|
}
|
2018-02-14 01:54:54 +03:00
|
|
|
if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
|
|
|
|
XVA_SET_REQ(xvap, XAT_PROJINHERIT);
|
|
|
|
}
|
2011-03-01 23:24:09 +03:00
|
|
|
|
|
|
|
if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
|
|
|
|
ASSERT(S_ISREG(ip->i_mode));
|
|
|
|
|
|
|
|
zfs_xvattr_set(zp, xvap, tx);
|
|
|
|
}
|
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_sync(zfsvfs, tx);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (mask != 0)
|
2011-03-01 23:24:09 +03:00
|
|
|
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
mutex_exit(&zp->z_lock);
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_exit(&zp->z_acl_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
if (attrzp) {
|
2011-02-08 22:16:06 +03:00
|
|
|
if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_exit(&attrzp->z_acl_lock);
|
|
|
|
mutex_exit(&attrzp->z_lock);
|
|
|
|
}
|
2009-07-03 02:44:48 +04:00
|
|
|
out:
|
2018-02-14 01:54:54 +03:00
|
|
|
if (err == 0 && xattr_count > 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
|
|
|
|
xattr_count, tx);
|
|
|
|
ASSERT(err2 == 0);
|
|
|
|
}
|
|
|
|
|
2009-08-18 22:43:27 +04:00
|
|
|
if (aclp)
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_free(aclp);
|
|
|
|
|
|
|
|
if (fuidp) {
|
|
|
|
zfs_fuid_info_free(fuidp);
|
|
|
|
fuidp = NULL;
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (err) {
|
2009-07-03 02:44:48 +04:00
|
|
|
dmu_tx_abort(tx);
|
2017-02-08 02:57:50 +03:00
|
|
|
if (attrzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(attrzp);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (err == ERESTART)
|
|
|
|
goto top;
|
|
|
|
} else {
|
2018-02-14 01:54:54 +03:00
|
|
|
if (count > 0)
|
|
|
|
err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
|
2009-07-03 02:44:48 +04:00
|
|
|
dmu_tx_commit(tx);
|
2018-02-14 01:54:54 +03:00
|
|
|
if (attrzp) {
|
|
|
|
if (err2 == 0 && handle_eadir)
|
|
|
|
err2 = zfs_setattr_dir(attrzp);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(attrzp);
|
2018-02-14 01:54:54 +03:00
|
|
|
}
|
2011-02-18 01:17:44 +03:00
|
|
|
zfs_inode_update(zp);
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
out2:
|
2018-02-14 01:54:54 +03:00
|
|
|
if (os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-03-03 01:18:40 +03:00
|
|
|
out3:
|
2018-02-14 01:54:54 +03:00
|
|
|
kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
|
|
|
|
kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
|
2013-11-01 23:26:11 +04:00
|
|
|
kmem_free(tmpxvattr, sizeof (xvattr_t));
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef struct zfs_zlock {
|
|
|
|
krwlock_t *zl_rwlock; /* lock we acquired */
|
|
|
|
znode_t *zl_znode; /* znode we held */
|
|
|
|
struct zfs_zlock *zl_next; /* next in list */
|
|
|
|
} zfs_zlock_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Drop locks and release vnodes that were held by zfs_rename_lock().
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
zfs_rename_unlock(zfs_zlock_t **zlpp)
|
|
|
|
{
|
|
|
|
zfs_zlock_t *zl;
|
|
|
|
|
|
|
|
while ((zl = *zlpp) != NULL) {
|
|
|
|
if (zl->zl_znode != NULL)
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_zrele_async(zl->zl_znode);
|
2008-11-20 23:01:55 +03:00
|
|
|
rw_exit(zl->zl_rwlock);
|
|
|
|
*zlpp = zl->zl_next;
|
|
|
|
kmem_free(zl, sizeof (*zl));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search back through the directory tree, using the ".." entries.
|
|
|
|
* Lock each directory in the chain to prevent concurrent renames.
|
|
|
|
* Fail any attempt to move a directory into one of its own descendants.
|
|
|
|
* XXX - z_parent_lock can overlap with map or grow locks
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
|
|
|
|
{
|
|
|
|
zfs_zlock_t *zl;
|
|
|
|
znode_t *zp = tdzp;
|
2011-02-08 22:16:06 +03:00
|
|
|
uint64_t rootid = ZTOZSB(zp)->z_root;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t oidp = zp->z_id;
|
2008-11-20 23:01:55 +03:00
|
|
|
krwlock_t *rwlp = &szp->z_parent_lock;
|
|
|
|
krw_t rw = RW_WRITER;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* First pass write-locks szp and compares to zp->z_id.
|
|
|
|
* Later passes read-lock zp and compare to zp->z_parent.
|
|
|
|
*/
|
|
|
|
do {
|
|
|
|
if (!rw_tryenter(rwlp, rw)) {
|
|
|
|
/*
|
|
|
|
* Another thread is renaming in this path.
|
|
|
|
* Note that if we are a WRITER, we don't have any
|
|
|
|
* parent_locks held yet.
|
|
|
|
*/
|
|
|
|
if (rw == RW_READER && zp->z_id > szp->z_id) {
|
|
|
|
/*
|
|
|
|
* Drop our locks and restart
|
|
|
|
*/
|
|
|
|
zfs_rename_unlock(&zl);
|
|
|
|
*zlpp = NULL;
|
|
|
|
zp = tdzp;
|
2010-05-29 00:45:14 +04:00
|
|
|
oidp = zp->z_id;
|
2008-11-20 23:01:55 +03:00
|
|
|
rwlp = &szp->z_parent_lock;
|
|
|
|
rw = RW_WRITER;
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Wait for other thread to drop its locks
|
|
|
|
*/
|
|
|
|
rw_enter(rwlp, rw);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
|
|
|
|
zl->zl_rwlock = rwlp;
|
|
|
|
zl->zl_znode = NULL;
|
|
|
|
zl->zl_next = *zlpp;
|
|
|
|
*zlpp = zl;
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (oidp == szp->z_id) /* We're a descendant of szp */
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (oidp == rootid) /* We've hit the top */
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
if (rw == RW_READER) { /* i.e. not the first pass */
|
2011-02-08 22:16:06 +03:00
|
|
|
int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error)
|
|
|
|
return (error);
|
|
|
|
zl->zl_znode = zp;
|
|
|
|
}
|
2011-02-08 22:16:06 +03:00
|
|
|
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
|
2010-05-29 00:45:14 +04:00
|
|
|
&oidp, sizeof (oidp));
|
2008-11-20 23:01:55 +03:00
|
|
|
rwlp = &zp->z_parent_lock;
|
|
|
|
rw = RW_READER;
|
|
|
|
|
|
|
|
} while (zp->z_id != sdzp->z_id);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move an entry from the provided source directory to the target
|
|
|
|
* directory. Change the entry name as indicated.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: sdzp - Source directory containing the "old entry".
|
2008-11-20 23:01:55 +03:00
|
|
|
* snm - Old entry name.
|
2019-12-11 22:53:57 +03:00
|
|
|
* tdzp - Target directory to contain the "new entry".
|
2008-11-20 23:01:55 +03:00
|
|
|
* tnm - New entry name.
|
|
|
|
* cr - credentials of caller.
|
|
|
|
* flags - case flags
|
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* sdzp,tdzp - ctime|mtime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
|
2011-02-08 22:16:06 +03:00
|
|
|
cred_t *cr, int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
znode_t *szp, *tzp;
|
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
|
|
|
zfs_dirlock_t *sdl, *tdl;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
zfs_zlock_t *zl;
|
|
|
|
int cmp, serr, terr;
|
|
|
|
int error = 0;
|
|
|
|
int zflg = 0;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (snm == NULL || tnm == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(sdzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-01-14 02:35:55 +03:00
|
|
|
ZFS_VERIFY_ZP(tdzp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We check i_sb because snapshots and the ctldir must have different
|
|
|
|
* super blocks.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
|
|
|
|
zfsctl_is_node(ZTOI(tdzp))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EXDEV));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_utf8 && u8_validate(tnm,
|
2008-11-20 23:01:55 +03:00
|
|
|
strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EILSEQ));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
zflg |= ZCILOOK;
|
|
|
|
|
|
|
|
top:
|
|
|
|
szp = NULL;
|
|
|
|
tzp = NULL;
|
|
|
|
zl = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is to prevent the creation of links into attribute space
|
|
|
|
* by renaming a linked file into/outof an attribute directory.
|
|
|
|
* See the comment in zfs_link() for why this is considered bad.
|
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lock source and target directory entries. To prevent deadlock,
|
|
|
|
* a lock ordering must be defined. We lock the directory with
|
|
|
|
* the smallest object id first, or if it's a tie, the one with
|
|
|
|
* the lexically first name.
|
|
|
|
*/
|
|
|
|
if (sdzp->z_id < tdzp->z_id) {
|
|
|
|
cmp = -1;
|
|
|
|
} else if (sdzp->z_id > tdzp->z_id) {
|
|
|
|
cmp = 1;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* First compare the two name arguments without
|
|
|
|
* considering any case folding.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
|
2017-03-08 03:21:37 +03:00
|
|
|
ASSERT(error == 0 || !zfsvfs->z_utf8);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (cmp == 0) {
|
|
|
|
/*
|
|
|
|
* POSIX: "If the old argument and the new argument
|
|
|
|
* both refer to links to the same existing file,
|
|
|
|
* the rename() function shall return successfully
|
|
|
|
* and perform no other action."
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If the file system is case-folding, then we may
|
|
|
|
* have some more checking to do. A case-folding file
|
|
|
|
* system is either supporting mixed case sensitivity
|
|
|
|
* access or is completely case-insensitive. Note
|
|
|
|
* that the file system is always case preserving.
|
|
|
|
*
|
|
|
|
* In mixed sensitivity mode case sensitive behavior
|
|
|
|
* is the default. FIGNORECASE must be used to
|
|
|
|
* explicitly request case insensitive behavior.
|
|
|
|
*
|
|
|
|
* If the source and target names provided differ only
|
|
|
|
* by case (e.g., a request to rename 'tim' to 'Tim'),
|
|
|
|
* we will treat this as a special case in the
|
|
|
|
* case-insensitive mode: as long as the source name
|
|
|
|
* is an exact match, we will allow this to proceed as
|
|
|
|
* a name-change request.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
|
|
|
|
(zfsvfs->z_case == ZFS_CASE_MIXED &&
|
2008-11-20 23:01:55 +03:00
|
|
|
flags & FIGNORECASE)) &&
|
2017-03-08 03:21:37 +03:00
|
|
|
u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
|
2008-11-20 23:01:55 +03:00
|
|
|
&error) == 0) {
|
|
|
|
/*
|
|
|
|
* case preserving rename request, require exact
|
|
|
|
* name matches
|
|
|
|
*/
|
|
|
|
zflg |= ZCIEXACT;
|
|
|
|
zflg &= ~ZCILOOK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* If the source and destination directories are the same, we should
|
|
|
|
* grab the z_name_lock of that directory only once.
|
|
|
|
*/
|
|
|
|
if (sdzp == tdzp) {
|
|
|
|
zflg |= ZHAVELOCK;
|
|
|
|
rw_enter(&sdzp->z_name_lock, RW_READER);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (cmp < 0) {
|
|
|
|
serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
|
|
|
|
ZEXISTS | zflg, NULL, NULL);
|
|
|
|
terr = zfs_dirent_lock(&tdl,
|
|
|
|
tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
|
|
|
|
} else {
|
|
|
|
terr = zfs_dirent_lock(&tdl,
|
|
|
|
tdzp, tnm, &tzp, zflg, NULL, NULL);
|
|
|
|
serr = zfs_dirent_lock(&sdl,
|
|
|
|
sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
|
|
|
|
NULL, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (serr) {
|
|
|
|
/*
|
|
|
|
* Source entry invalid or not there.
|
|
|
|
*/
|
|
|
|
if (!terr) {
|
|
|
|
zfs_dirent_unlock(tdl);
|
|
|
|
if (tzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(tzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (sdzp == tdzp)
|
|
|
|
rw_exit(&sdzp->z_name_lock);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (strcmp(snm, "..") == 0)
|
|
|
|
serr = EINVAL;
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (serr);
|
|
|
|
}
|
|
|
|
if (terr) {
|
|
|
|
zfs_dirent_unlock(sdl);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(szp);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (sdzp == tdzp)
|
|
|
|
rw_exit(&sdzp->z_name_lock);
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (strcmp(tnm, "..") == 0)
|
|
|
|
terr = EINVAL;
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (terr);
|
|
|
|
}
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
/*
|
|
|
|
* If we are using project inheritance, means if the directory has
|
|
|
|
* ZFS_PROJINHERIT set, then its descendant directories will inherit
|
|
|
|
* not only the project ID, but also the ZFS_PROJINHERIT flag. Under
|
|
|
|
* such case, we only allow renames into our tree when the project
|
|
|
|
* IDs are the same.
|
|
|
|
*/
|
|
|
|
if (tdzp->z_pflags & ZFS_PROJINHERIT &&
|
|
|
|
tdzp->z_projid != szp->z_projid) {
|
|
|
|
error = SET_ERROR(EXDEV);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Must have write access at the source to remove the old entry
|
|
|
|
* and write access at the target to create the new entry.
|
|
|
|
* Note that if target and source are the same, this can be
|
|
|
|
* done in a single check.
|
|
|
|
*/
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(ZTOI(szp)->i_mode)) {
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Check to make sure rename is valid.
|
|
|
|
* Can't do a move like this: /usr/a/b to /usr/a/b/c/d
|
|
|
|
*/
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Does target exist?
|
|
|
|
*/
|
|
|
|
if (tzp) {
|
|
|
|
/*
|
|
|
|
* Source and target must be the same type.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(ZTOI(szp)->i_mode)) {
|
|
|
|
if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(ENOTDIR);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
} else {
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(ZTOI(tzp)->i_mode)) {
|
2013-03-08 22:41:28 +04:00
|
|
|
error = SET_ERROR(EISDIR);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* POSIX dictates that when the source and target
|
|
|
|
* entries refer to the same file object, rename
|
|
|
|
* must do nothing and exit without error.
|
|
|
|
*/
|
|
|
|
if (szp->z_id == tzp->z_id) {
|
|
|
|
error = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
|
|
|
|
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
|
|
|
|
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (sdzp != tdzp) {
|
|
|
|
dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
|
|
|
|
zfs_sa_upgrade_txholds(tx, tdzp);
|
|
|
|
}
|
|
|
|
if (tzp) {
|
|
|
|
dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
|
|
|
|
zfs_sa_upgrade_txholds(tx, tzp);
|
|
|
|
}
|
|
|
|
|
|
|
|
zfs_sa_upgrade_txholds(tx, szp);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issued`, leading the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issued` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
if (zl != NULL)
|
|
|
|
zfs_rename_unlock(&zl);
|
|
|
|
zfs_dirent_unlock(sdl);
|
|
|
|
zfs_dirent_unlock(tdl);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
if (sdzp == tdzp)
|
|
|
|
rw_exit(&sdzp->z_name_lock);
|
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithm (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(szp);
|
2017-02-08 02:57:50 +03:00
|
|
|
if (tzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(tzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
dmu_tx_abort(tx);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(szp);
|
2017-02-08 02:57:50 +03:00
|
|
|
if (tzp)
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(tzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (tzp) /* Attempt to remove the existing target */
|
|
|
|
error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
error = zfs_link_create(tdl, szp, tx, ZRENAMING);
|
|
|
|
if (error == 0) {
|
2010-05-29 00:45:14 +04:00
|
|
|
szp->z_pflags |= ZFS_AV_MODIFIED;
|
2018-02-14 01:54:54 +03:00
|
|
|
if (tdzp->z_pflags & ZFS_PROJINHERIT)
|
|
|
|
szp->z_pflags |= ZFS_PROJINHERIT;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
(void *)&szp->z_pflags, sizeof (uint64_t), tx);
|
2013-05-11 01:17:03 +04:00
|
|
|
ASSERT0(error);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
|
|
|
|
if (error == 0) {
|
|
|
|
zfs_log_rename(zilog, tx, TX_RENAME |
|
2010-08-27 01:24:34 +04:00
|
|
|
(flags & FIGNORECASE ? TX_CI : 0), sdzp,
|
|
|
|
sdl->dl_name, tdzp, tdl->dl_name, szp);
|
2010-05-29 00:45:14 +04:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* At this point, we have successfully created
|
|
|
|
* the target name, but have failed to remove
|
|
|
|
* the source name. Since the create was done
|
|
|
|
* with the ZRENAMING flag, there are
|
|
|
|
* complications; for one, the link count is
|
|
|
|
* wrong. The easiest way to deal with this
|
|
|
|
* is to remove the newly created target, and
|
|
|
|
* return the original error. This must
|
|
|
|
* succeed; fortunately, it is very unlikely to
|
|
|
|
* fail, since we just created it.
|
|
|
|
*/
|
|
|
|
VERIFY3U(zfs_link_destroy(tdl, szp, tx,
|
|
|
|
ZRENAMING, NULL), ==, 0);
|
|
|
|
}
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC errors when copying a large number of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turns out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphans. This patch adds a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* If we had removed the existing target, subsequent
|
|
|
|
* call to zfs_link_create() to add back the same entry
|
|
|
|
* but, the new dnode (szp) should not fail.
|
|
|
|
*/
|
|
|
|
ASSERT(tzp == NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
out:
|
|
|
|
if (zl != NULL)
|
|
|
|
zfs_rename_unlock(&zl);
|
|
|
|
|
|
|
|
zfs_dirent_unlock(sdl);
|
|
|
|
zfs_dirent_unlock(tdl);
|
|
|
|
|
2011-01-06 01:27:30 +03:00
|
|
|
zfs_inode_update(sdzp);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (sdzp == tdzp)
|
|
|
|
rw_exit(&sdzp->z_name_lock);
|
|
|
|
|
2011-01-06 01:27:30 +03:00
|
|
|
if (sdzp != tdzp)
|
|
|
|
zfs_inode_update(tdzp);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2011-01-06 01:27:30 +03:00
|
|
|
zfs_inode_update(szp);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(szp);
|
2011-01-06 01:27:30 +03:00
|
|
|
if (tzp) {
|
|
|
|
zfs_inode_update(tzp);
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(tzp);
|
2011-01-06 01:27:30 +03:00
|
|
|
}
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert the indicated symbolic reference entry into the directory.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: dzp - Directory to contain new symbolic link.
|
2019-05-26 00:29:10 +03:00
|
|
|
* name - Name of directory entry in dzp.
|
2008-11-20 23:01:55 +03:00
|
|
|
* vap - Attributes of new entry.
|
2019-05-26 00:29:10 +03:00
|
|
|
* link - Name for new symlink entry.
|
2008-11-20 23:01:55 +03:00
|
|
|
* cr - credentials of caller.
|
|
|
|
* flags - case flags
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* OUT: zpp - Znode for new symbolic link.
|
2019-05-26 00:29:10 +03:00
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
2011-02-08 22:16:06 +03:00
|
|
|
* dzp - ctime|mtime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
|
|
|
|
znode_t **zpp, cred_t *cr, int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
znode_t *zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
dmu_tx_t *tx;
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(dzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t len = strlen(link);
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
int zflg = ZNEW;
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_ids_t acl_ids;
|
|
|
|
boolean_t fuid_dirtied;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t txtype = TX_SYMLINK;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
ASSERT(S_ISLNK(vap->va_mode));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (name == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(dzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
|
2008-11-20 23:01:55 +03:00
|
|
|
NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EILSEQ));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
zflg |= ZCILOOK;
|
|
|
|
|
|
|
|
if (len > MAXPATHLEN) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENAMETOOLONG));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if ((error = zfs_acl_ids_create(dzp, 0,
|
|
|
|
vap, cr, NULL, &acl_ids)) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
top:
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = NULL;
|
2011-02-08 22:16:06 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Attempt to lock directory; fail if entry already exists.
|
|
|
|
*/
|
|
|
|
error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
|
|
|
|
if (error) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2010-12-17 01:05:42 +03:00
|
|
|
if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
zfs_dirent_unlock(dl);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
|
|
|
zfs_dirent_unlock(dl);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EDQUOT));
|
2009-07-03 02:44:48 +04:00
|
|
|
}
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
|
|
|
fuid_dirtied = zfsvfs->z_fuid_dirty;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
|
|
|
|
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
|
|
|
|
ZFS_SA_BASE_ATTR_SIZE + len);
|
|
|
|
dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
|
2017-03-08 03:21:37 +03:00
|
|
|
if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
|
|
|
|
acl_ids.z_aclp->z_acl_bytes);
|
|
|
|
}
|
2009-07-03 02:44:48 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_txhold(zfsvfs, tx);
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, it's `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issued`, leading the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issued` can acheive the semtantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
goto top;
|
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_abort(tx);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new object for the symlink.
|
2010-05-29 00:45:14 +04:00
|
|
|
* for version 4 ZPL datsets the symlink will be an SA attribute
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
|
2009-07-03 02:44:48 +04:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
if (fuid_dirtied)
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_fuid_sync(zfsvfs, tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_enter(&zp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->z_is_sa)
|
2017-03-08 03:21:37 +03:00
|
|
|
error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
link, len, tx);
|
|
|
|
else
|
|
|
|
zfs_sa_symlink(zp, link, len, tx);
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_exit(&zp->z_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zp->z_size = len;
|
2017-03-08 03:21:37 +03:00
|
|
|
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&zp->z_size, sizeof (zp->z_size), tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Insert the new object into the directory.
|
|
|
|
*/
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
error = zfs_link_create(dl, zp, tx, ZNEW);
|
|
|
|
if (error != 0) {
|
|
|
|
zfs_znode_delete(zp, tx);
|
|
|
|
remove_inode_hash(ZTOI(zp));
|
|
|
|
} else {
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
txtype |= TX_CI;
|
|
|
|
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
|
2018-04-10 00:24:46 +03:00
|
|
|
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
zfs_inode_update(dzp);
|
|
|
|
zfs_inode_update(zp);
|
|
|
|
}
|
2011-01-06 01:27:30 +03:00
|
|
|
|
2009-07-03 02:44:48 +04:00
|
|
|
zfs_acl_ids_free(&acl_ids);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
if (error == 0) {
|
2019-12-11 22:53:57 +03:00
|
|
|
*zpp = zp;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
|
|
|
zil_commit(zilog, 0);
|
|
|
|
} else {
|
2019-12-11 22:53:57 +03:00
|
|
|
zrele(zp);
|
Fix ENOSPC in "Handle zap_add() failures in ..."
Commit cc63068 caused ENOSPC error when copy a large amount of files
between two directories. The reason is that the patch limits zap leaf
expansion to 2 retries, and return ENOSPC when failed.
The intent for limiting retries is to prevent pointlessly growing table
to max size when adding a block full of entries with same name in
different case in mixed mode. However, it turns out we cannot use any
limit on the retry. When we copy files from one directory in readdir
order, we are copying in hash order, one leaf block at a time. Which
means that if the leaf block in source directory has expanded 6 times,
and you copy those entries in that block, by the time you need to expand
the leaf in destination directory, you need to expand it 6 times in one
go. So any limit on the retry will result in error where it shouldn't.
Note that while we do use different salt for different directories, it
seems that the salt/hash function doesn't provide enough randomization
to the hash distance to prevent this from happening.
Since cc63068 has already been reverted. This patch adds it back and
removes the retry limit.
Also, as it turn out, failing on zap_add() has a serious side effect for
mzap_upgrade(). When upgrading from micro zap to fat zap, it will
call zap_add() to transfer entries one at a time. If it hit any error
halfway through, the remaining entries will be lost, causing those files
to become orphan. This patch add a VERIFY to catch it.
Reviewed-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
Reviewed-by: Richard Yao <ryao@gentoo.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Albert Lee <trisk@forkgnu.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Closes #7401
Closes #7421
2018-04-19 00:19:50 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return, in the buffer contained in the provided uio structure,
|
2011-02-08 22:16:06 +03:00
|
|
|
* the symbolic path referred to by ip.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2011-02-17 02:54:55 +03:00
|
|
|
* IN: ip - inode of symbolic link
|
|
|
|
* uio - structure to contain the link path.
|
|
|
|
* cr - credentials of caller.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2011-02-08 22:16:06 +03:00
|
|
|
* ip - atime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2011-02-17 02:54:55 +03:00
|
|
|
zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_enter(&zp->z_lock);
|
2010-05-29 00:45:14 +04:00
|
|
|
if (zp->z_is_sa)
|
2011-02-17 02:54:55 +03:00
|
|
|
error = sa_lookup_uio(zp->z_sa_hdl,
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ZPL_SYMLINK(zfsvfs), uio);
|
2010-05-29 00:45:14 +04:00
|
|
|
else
|
2011-02-17 02:54:55 +03:00
|
|
|
error = zfs_sa_readlink(zp, uio);
|
2010-08-27 01:24:34 +04:00
|
|
|
mutex_exit(&zp->z_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2019-12-11 22:53:57 +03:00
|
|
|
* Insert a new entry into directory tdzp referencing szp.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: tdzp - Directory to contain new entry.
|
|
|
|
* szp - znode of new entry.
|
2008-11-20 23:01:55 +03:00
|
|
|
* name - name of new entry.
|
|
|
|
* cr - credentials of caller.
|
2019-05-26 00:29:10 +03:00
|
|
|
* flags - case flags.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* tdzp - ctime|mtime updated
|
|
|
|
* szp - ctime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
|
2016-04-13 18:55:35 +03:00
|
|
|
int flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
struct inode *sip = ZTOI(szp);
|
|
|
|
znode_t *tzp;
|
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
|
2008-11-20 23:01:55 +03:00
|
|
|
zilog_t *zilog;
|
|
|
|
zfs_dirlock_t *dl;
|
|
|
|
dmu_tx_t *tx;
|
|
|
|
int error;
|
|
|
|
int zf = ZNEW;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t parent;
|
2010-08-27 01:24:34 +04:00
|
|
|
uid_t owner;
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
boolean_t waited = B_FALSE;
|
2016-01-26 23:29:46 +03:00
|
|
|
boolean_t is_tmpfile = 0;
|
|
|
|
uint64_t txg;
|
|
|
|
#ifdef HAVE_TMPFILE
|
|
|
|
is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
|
|
|
|
#endif
|
2019-12-11 22:53:57 +03:00
|
|
|
ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2016-11-10 19:50:32 +03:00
|
|
|
if (name == NULL)
|
|
|
|
return (SET_ERROR(EINVAL));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2019-12-11 22:53:57 +03:00
|
|
|
ZFS_VERIFY_ZP(tdzp);
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
|
|
|
|
* POSIX dictates that we return EPERM here.
|
|
|
|
* Better choices include ENOTSUP or EISDIR.
|
|
|
|
*/
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(sip->i_mode)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EPERM));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2016-01-14 02:35:55 +03:00
|
|
|
ZFS_VERIFY_ZP(szp);
|
|
|
|
|
2018-02-14 01:54:54 +03:00
|
|
|
/*
|
|
|
|
* If we are using project inheritance, means if the directory has
|
|
|
|
* ZFS_PROJINHERIT set, then its descendant directories will inherit
|
|
|
|
* not only the project ID, but also the ZFS_PROJINHERIT flag. Under
|
|
|
|
* such case, we only allow hard link creation in our tree when the
|
|
|
|
* project IDs are the same.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
if (tdzp->z_pflags & ZFS_PROJINHERIT &&
|
|
|
|
tdzp->z_projid != szp->z_projid) {
|
2018-02-14 01:54:54 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
|
|
|
return (SET_ERROR(EXDEV));
|
|
|
|
}
|
|
|
|
|
2016-01-14 02:35:55 +03:00
|
|
|
/*
|
|
|
|
* We check i_sb because snapshots and the ctldir must have different
|
|
|
|
* super blocks.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EXDEV));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2010-05-29 00:45:14 +04:00
|
|
|
|
|
|
|
/* Prevent links to .zfs/shares files */
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&parent, sizeof (uint64_t))) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (error);
|
|
|
|
}
|
2017-03-08 03:21:37 +03:00
|
|
|
if (parent == zfsvfs->z_shares_dir) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EPERM));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_utf8 && u8_validate(name,
|
2008-11-20 23:01:55 +03:00
|
|
|
strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EILSEQ));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
zf |= ZCILOOK;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We do not support links between attributes and non-attributes
|
|
|
|
* because of the potential security risk of creating links
|
|
|
|
* into "normal" file space in order to circumvent restrictions
|
|
|
|
* imposed in attribute space.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
|
|
|
|
cr, ZFS_OWNER);
|
2010-08-27 01:24:34 +04:00
|
|
|
if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EPERM));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
top:
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
|
|
|
* Attempt to lock directory; fail if entry already exists.
|
|
|
|
*/
|
2019-12-11 22:53:57 +03:00
|
|
|
error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
|
2019-12-11 22:53:57 +03:00
|
|
|
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
|
2016-01-26 23:29:46 +03:00
|
|
|
if (is_tmpfile)
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
|
2016-01-26 23:29:46 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
zfs_sa_upgrade_txholds(tx, szp);
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_sa_upgrade_txholds(tx, tdzp);
|
OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue
PROBLEM
=======
When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.
If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.
If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, it's `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e. it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
SOLUTION
========
This change modifies the `dmu_tx_assign` function such that `txg_how` is
a bitmask, rather than of the `txg_how_t` enum type. Now, the previous
`TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with
specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.
Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was
automatically invoked. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issued`, leading the problem described above. Rather, we
want to achieve the semantics of `TXG_WAIT`, while also preventing the
`tx` from being penalized via the dirty delay throttling.
With this change, `zil_lwb_write_issued` can acheive the semtantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.
Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`
OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084
2018-01-09 00:45:53 +03:00
|
|
|
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (error) {
|
|
|
|
zfs_dirent_unlock(dl);
|
2009-01-16 00:59:39 +03:00
|
|
|
if (error == ERESTART) {
|
Illumos #4045 write throttle & i/o scheduler performance work
4045 zfs write throttle & i/o scheduler performance work
1. The ZFS i/o scheduler (vdev_queue.c) now divides i/os into 5 classes: sync
read, sync write, async read, async write, and scrub/resilver. The scheduler
issues a number of concurrent i/os from each class to the device. Once a class
has been selected, an i/o is selected from this class using either an elevator
algorithem (async, scrub classes) or FIFO (sync classes). The number of
concurrent async write i/os is tuned dynamically based on i/o load, to achieve
good sync i/o latency when there is not a high load of writes, and good write
throughput when there is. See the block comment in vdev_queue.c (reproduced
below) for more details.
2. The write throttle (dsl_pool_tempreserve_space() and
txg_constrain_throughput()) is rewritten to produce much more consistent delays
when under constant load. The new write throttle is based on the amount of
dirty data, rather than guesses about future performance of the system. When
there is a lot of dirty data, each transaction (e.g. write() syscall) will be
delayed by the same small amount. This eliminates the "brick wall of wait"
that the old write throttle could hit, causing all transactions to wait several
seconds until the next txg opens. One of the keys to the new write throttle is
decrementing the amount of dirty data as i/o completes, rather than at the end
of spa_sync(). Note that the write throttle is only applied once the i/o
scheduler is issuing the maximum number of outstanding async writes. See the
block comments in dsl_pool.c and above dmu_tx_delay() (reproduced below) for
more details.
This diff has several other effects, including:
* the commonly-tuned global variable zfs_vdev_max_pending has been removed;
use per-class zfs_vdev_*_max_active values or zfs_vdev_max_active instead.
* the size of each txg (meaning the amount of dirty data written, and thus the
time it takes to write out) is now controlled differently. There is no longer
an explicit time goal; the primary determinant is amount of dirty data.
Systems that are under light or medium load will now often see that a txg is
always syncing, but the impact to performance (e.g. read latency) is minimal.
Tune zfs_dirty_data_max and zfs_dirty_data_sync to control this.
* zio_taskq_batch_pct = 75 -- Only use 75% of all CPUs for compression,
checksum, etc. This improves latency by not allowing these CPU-intensive tasks
to consume all CPU (on machines with at least 4 CPU's; the percentage is
rounded up).
--matt
APPENDIX: problems with the current i/o scheduler
The current ZFS i/o scheduler (vdev_queue.c) is deadline based. The problem
with this is that if there are always i/os pending, then certain classes of
i/os can see very long delays.
For example, if there are always synchronous reads outstanding, then no async
writes will be serviced until they become "past due". One symptom of this
situation is that each pass of the txg sync takes at least several seconds
(typically 3 seconds).
If many i/os become "past due" (their deadline is in the past), then we must
service all of these overdue i/os before any new i/os. This happens when we
enqueue a batch of async writes for the txg sync, with deadlines 2.5 seconds in
the future. If we can't complete all the i/os in 2.5 seconds (e.g. because
there were always reads pending), then these i/os will become past due. Now we
must service all the "async" writes (which could be hundreds of megabytes)
before we service any reads, introducing considerable latency to synchronous
i/os (reads or ZIL writes).
Notes on porting to ZFS on Linux:
- zio_t gained new members io_physdone and io_phys_children. Because
object caches in the Linux port call the constructor only once at
allocation time, objects may contain residual data when retrieved
from the cache. Therefore zio_create() was updated to zero out the two
new fields.
- vdev_mirror_pending() relied on the depth of the per-vdev pending queue
(vq->vq_pending_tree) to select the least-busy leaf vdev to read from.
This tree has been replaced by vq->vq_active_tree which is now used
for the same purpose.
- vdev_queue_init() used the value of zfs_vdev_max_pending to determine
the number of vdev I/O buffers to pre-allocate. That global no longer
exists, so we instead use the sum of the *_max_active values for each of
the five I/O classes described above.
- The Illumos implementation of dmu_tx_delay() delays a transaction by
sleeping in condition variable embedded in the thread
(curthread->t_delay_cv). We do not have an equivalent CV to use in
Linux, so this change replaced the delay logic with a wrapper called
zfs_sleep_until(). This wrapper could be adopted upstream and in other
downstream ports to abstract away operating system-specific delay logic.
- These tunables are added as module parameters, and descriptions added
to the zfs-module-parameters.5 man page.
spa_asize_inflation
zfs_deadman_synctime_ms
zfs_vdev_max_active
zfs_vdev_async_write_active_min_dirty_percent
zfs_vdev_async_write_active_max_dirty_percent
zfs_vdev_async_read_max_active
zfs_vdev_async_read_min_active
zfs_vdev_async_write_max_active
zfs_vdev_async_write_min_active
zfs_vdev_scrub_max_active
zfs_vdev_scrub_min_active
zfs_vdev_sync_read_max_active
zfs_vdev_sync_read_min_active
zfs_vdev_sync_write_max_active
zfs_vdev_sync_write_min_active
zfs_dirty_data_max_percent
zfs_delay_min_dirty_percent
zfs_dirty_data_max_max_percent
zfs_dirty_data_max
zfs_dirty_data_max_max
zfs_dirty_data_sync
zfs_delay_scale
The latter four have type unsigned long, whereas they are uint64_t in
Illumos. This accommodates Linux's module_param() supported types, but
means they may overflow on 32-bit architectures.
The values zfs_dirty_data_max and zfs_dirty_data_max_max are the most
likely to overflow on 32-bit systems, since they express physical RAM
sizes in bytes. In fact, Illumos initializes zfs_dirty_data_max_max to
2^32 which does overflow. To resolve that, this port instead initializes
it in arc_init() to 25% of physical RAM, and adds the tunable
zfs_dirty_data_max_max_percent to override that percentage. While this
solution doesn't completely avoid the overflow issue, it should be a
reasonable default for most systems, and the minority of affected
systems can work around the issue by overriding the defaults.
- Fixed reversed logic in comment above zfs_delay_scale declaration.
- Clarified comments in vdev_queue.c regarding when per-queue minimums take
effect.
- Replaced dmu_tx_write_limit in the dmu_tx kstat file
with dmu_tx_dirty_delay and dmu_tx_dirty_over_max. The first counts
how many times a transaction has been delayed because the pool dirty
data has exceeded zfs_delay_min_dirty_percent. The latter counts how
many times the pool dirty data has exceeded zfs_dirty_data_max (which
we expect to never happen).
- The original patch would have regressed the bug fixed in
zfsonlinux/zfs@c418410, which prevented users from setting the
zfs_vdev_aggregation_limit tuning larger than SPA_MAXBLOCKSIZE.
A similar fix is added to vdev_queue_aggregate().
- In vdev_queue_io_to_issue(), dynamically allocate 'zio_t search' on the
heap instead of the stack. In Linux we can't afford such large
structures on the stack.
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Ned Bass <bass6@llnl.gov>
Reviewed by: Brendan Gregg <brendan.gregg@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References:
http://www.illumos.org/issues/4045
illumos/illumos-gate@69962b5647e4a8b9b14998733b765925381b727e
Ported-by: Ned Bass <bass6@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1913
2013-08-29 07:01:20 +04:00
|
|
|
waited = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
goto top;
|
|
|
|
}
|
|
|
|
dmu_tx_abort(tx);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2016-01-26 23:29:46 +03:00
|
|
|
/* unmark z_unlinked so zfs_link_create will not reject */
|
|
|
|
if (is_tmpfile)
|
2019-08-13 16:58:02 +03:00
|
|
|
szp->z_unlinked = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
error = zfs_link_create(dl, szp, tx, 0);
|
|
|
|
|
|
|
|
if (error == 0) {
|
|
|
|
uint64_t txtype = TX_LINK;
|
2016-01-26 23:29:46 +03:00
|
|
|
/*
|
|
|
|
* tmpfile is created to be in z_unlinkedobj, so remove it.
|
2019-09-03 03:56:41 +03:00
|
|
|
* Also, we don't log in ZIL, because all previous file
|
2016-01-26 23:29:46 +03:00
|
|
|
* operation on the tmpfile are ignored by ZIL. Instead we
|
|
|
|
* always wait for txg to sync to make sure all previous
|
|
|
|
* operation are sync safe.
|
|
|
|
*/
|
|
|
|
if (is_tmpfile) {
|
2017-03-08 03:21:37 +03:00
|
|
|
VERIFY(zap_remove_int(zfsvfs->z_os,
|
|
|
|
zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
|
2016-01-26 23:29:46 +03:00
|
|
|
} else {
|
|
|
|
if (flags & FIGNORECASE)
|
|
|
|
txtype |= TX_CI;
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
|
2016-01-26 23:29:46 +03:00
|
|
|
}
|
|
|
|
} else if (is_tmpfile) {
|
|
|
|
/* restore z_unlinked since when linking failed */
|
2019-08-13 16:58:02 +03:00
|
|
|
szp->z_unlinked = B_TRUE;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
2016-01-26 23:29:46 +03:00
|
|
|
txg = dmu_tx_get_txg(tx);
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_commit(tx);
|
|
|
|
|
|
|
|
zfs_dirent_unlock(dl);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2020-02-16 23:44:08 +03:00
|
|
|
if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
|
2017-03-08 03:21:37 +03:00
|
|
|
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
|
2016-01-26 23:29:46 +03:00
|
|
|
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_inode_update(tdzp);
|
2011-01-06 01:27:30 +03:00
|
|
|
zfs_inode_update(szp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2011-08-02 08:28:51 +04:00
|
|
|
static void
|
Only commit the ZIL once in zpl_writepages() (msync() case).
Currently, using msync() results in the following code path:
sys_msync -> zpl_fsync -> filemap_write_and_wait_range -> zpl_writepages -> write_cache_pages -> zpl_putpage
In such a code path, zil_commit() is called as part of zpl_putpage().
This means that for each page, the write is handed to the DMU, the ZIL
is committed, and only then do we move on to the next page. As one might
imagine, this results in atrocious performance where there is a large
number of pages to write: instead of committing a batch of N writes,
we do N commits containing one page each. In some extreme cases this
can result in msync() being ~700 times slower than it should be, as well
as very inefficient use of ZIL resources.
This patch fixes this issue by making sure that the requested writes
are batched and then committed only once. Unfortunately, the
implementation is somewhat non-trivial because there is no way to run
write_cache_pages in SYNC mode (so that we get all pages) without
making it wait on the writeback tag for each page.
The solution implemented here is composed of two parts:
- I added a new callback system to the ZIL, which allows the caller to
be notified when its ITX gets written to stable storage. One nice
thing is that the callback is called not only in zil_commit() but
in zil_sync() as well, which means that the caller doesn't have to
care whether the write ended up in the ZIL or the DMU: it will get
notified as soon as it's safe, period. This is an improvement over
dmu_tx_callback_register() that was used previously, which only
supports DMU writes. The rationale for this change is to allow
zpl_putpage() to be notified when a ZIL commit is completed without
having to block on zil_commit() itself.
- zpl_writepages() now calls write_cache_pages in non-SYNC mode, which
will prevent (1) write_cache_pages from blocking, and (2) zpl_putpage
from issuing ZIL commits. zpl_writepages() will issue the commit
itself instead of relying on zpl_putpage() to do it, thus nicely
batching the writes. Note, however, that we still have to call
write_cache_pages() again in SYNC mode because there is an edge case
documented in the implementation of write_cache_pages() whereas it
will not give us all dirty pages when running in non-SYNC mode. Thus
we need to run it at least once in SYNC mode to make sure we honor
persistency guarantees. This only happens when the pages are
modified at the same time msync() is running, which should be rare.
In most cases there won't be any additional pages and this second
call will do nothing.
Note that this change also fixes a bug related to #907 whereas calling
msync() on pages that were already handed over to the DMU in a previous
writepages() call would make msync() block until the next TXG sync
instead of returning as soon as the ZIL commit is complete. The new
callback system fixes that problem.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1849
Closes #907
2013-11-10 19:00:11 +04:00
|
|
|
zfs_putpage_commit_cb(void *arg)
|
2011-08-02 08:28:51 +04:00
|
|
|
{
|
|
|
|
struct page *pp = arg;
|
|
|
|
|
Only commit the ZIL once in zpl_writepages() (msync() case).
Currently, using msync() results in the following code path:
sys_msync -> zpl_fsync -> filemap_write_and_wait_range -> zpl_writepages -> write_cache_pages -> zpl_putpage
In such a code path, zil_commit() is called as part of zpl_putpage().
This means that for each page, the write is handed to the DMU, the ZIL
is committed, and only then do we move on to the next page. As one might
imagine, this results in atrocious performance where there is a large
number of pages to write: instead of committing a batch of N writes,
we do N commits containing one page each. In some extreme cases this
can result in msync() being ~700 times slower than it should be, as well
as very inefficient use of ZIL resources.
This patch fixes this issue by making sure that the requested writes
are batched and then committed only once. Unfortunately, the
implementation is somewhat non-trivial because there is no way to run
write_cache_pages in SYNC mode (so that we get all pages) without
making it wait on the writeback tag for each page.
The solution implemented here is composed of two parts:
- I added a new callback system to the ZIL, which allows the caller to
be notified when its ITX gets written to stable storage. One nice
thing is that the callback is called not only in zil_commit() but
in zil_sync() as well, which means that the caller doesn't have to
care whether the write ended up in the ZIL or the DMU: it will get
notified as soon as it's safe, period. This is an improvement over
dmu_tx_callback_register() that was used previously, which only
supports DMU writes. The rationale for this change is to allow
zpl_putpage() to be notified when a ZIL commit is completed without
having to block on zil_commit() itself.
- zpl_writepages() now calls write_cache_pages in non-SYNC mode, which
will prevent (1) write_cache_pages from blocking, and (2) zpl_putpage
from issuing ZIL commits. zpl_writepages() will issue the commit
itself instead of relying on zpl_putpage() to do it, thus nicely
batching the writes. Note, however, that we still have to call
write_cache_pages() again in SYNC mode because there is an edge case
documented in the implementation of write_cache_pages() whereby it
will not give us all dirty pages when running in non-SYNC mode. Thus
we need to run it at least once in SYNC mode to make sure we honor
persistency guarantees. This only happens when the pages are
modified at the same time msync() is running, which should be rare.
In most cases there won't be any additional pages and this second
call will do nothing.
Note that this change also fixes a bug related to #907 where calling
msync() on pages that were already handed over to the DMU in a previous
writepages() call would make msync() block until the next TXG sync
instead of returning as soon as the ZIL commit is complete. The new
callback system fixes that problem.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1849
Closes #907
2013-11-10 19:00:11 +04:00
|
|
|
ClearPageError(pp);
|
2011-08-02 08:28:51 +04:00
|
|
|
end_page_writeback(pp);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2011-08-02 08:28:51 +04:00
|
|
|
* Push a page out to disk, once the page is on stable storage the
|
|
|
|
* registered commit callback will be run as notification of completion.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2011-08-02 08:28:51 +04:00
|
|
|
* IN: ip - page mapped for inode.
|
|
|
|
* pp - page to push (page is locked)
|
|
|
|
* wbc - writeback control data
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
|
|
|
*
|
2011-08-02 08:28:51 +04:00
|
|
|
* Timestamps:
|
|
|
|
* ip - ctime|mtime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2011-08-02 08:28:51 +04:00
|
|
|
int
|
|
|
|
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-08-02 08:28:51 +04:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2011-08-02 08:28:51 +04:00
|
|
|
loff_t offset;
|
|
|
|
loff_t pgoff;
|
2012-10-31 21:06:34 +04:00
|
|
|
unsigned int pglen;
|
2011-08-02 08:28:51 +04:00
|
|
|
dmu_tx_t *tx;
|
|
|
|
caddr_t va;
|
|
|
|
int err = 0;
|
|
|
|
uint64_t mtime[2], ctime[2];
|
|
|
|
sa_bulk_attr_t bulk[3];
|
|
|
|
int cnt = 0;
|
2015-03-05 22:52:26 +03:00
|
|
|
struct address_space *mapping;
|
2011-08-02 08:28:51 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2012-10-31 21:06:34 +04:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2011-08-02 08:28:51 +04:00
|
|
|
ASSERT(PageLocked(pp));
|
|
|
|
|
2013-11-01 23:26:11 +04:00
|
|
|
pgoff = page_offset(pp); /* Page byte-offset in file */
|
|
|
|
offset = i_size_read(ip); /* File length in bytes */
|
2016-04-05 22:39:37 +03:00
|
|
|
pglen = MIN(PAGE_SIZE, /* Page length in bytes */
|
|
|
|
P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
|
2011-08-02 08:28:51 +04:00
|
|
|
|
|
|
|
/* Page is beyond end of file */
|
|
|
|
if (pgoff >= offset) {
|
|
|
|
unlock_page(pp);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2011-08-02 08:28:51 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Truncate page length to end of file */
|
|
|
|
if (pgoff + pglen > offset)
|
|
|
|
pglen = offset - pgoff;
|
|
|
|
|
|
|
|
#if 0
|
2008-11-20 23:01:55 +03:00
|
|
|
/*
|
2011-08-02 08:28:51 +04:00
|
|
|
* FIXME: Allow mmap writes past its quota. The correct fix
|
|
|
|
* is to register a page_mkwrite() handler to count the page
|
|
|
|
* against its quota when it is about to be dirtied.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2018-02-14 01:54:54 +03:00
|
|
|
if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
|
|
|
|
KUID_TO_SUID(ip->i_uid)) ||
|
|
|
|
zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
|
|
|
|
KGID_TO_SGID(ip->i_gid)) ||
|
|
|
|
(zp->z_projid != ZFS_DEFAULT_PROJID &&
|
|
|
|
zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
|
|
|
|
zp->z_projid))) {
|
2009-07-03 02:44:48 +04:00
|
|
|
err = EDQUOT;
|
|
|
|
}
|
2011-08-02 08:28:51 +04:00
|
|
|
#endif
|
|
|
|
|
2015-01-07 03:54:57 +03:00
|
|
|
/*
|
|
|
|
* The ordering here is critical and must adhere to the following
|
|
|
|
* rules in order to avoid deadlocking in either zfs_read() or
|
|
|
|
* zfs_free_range() due to a lock inversion.
|
|
|
|
*
|
|
|
|
* 1) The page must be unlocked prior to acquiring the range lock.
|
|
|
|
* This is critical because zfs_read() calls find_lock_page()
|
|
|
|
* which may block on the page lock while holding the range lock.
|
|
|
|
*
|
|
|
|
* 2) Before setting or clearing write back on a page the range lock
|
|
|
|
* must be held in order to prevent a lock inversion with the
|
|
|
|
* zfs_free_range() function.
|
2015-03-05 22:52:26 +03:00
|
|
|
*
|
|
|
|
* This presents a problem because upon entering this function the
|
|
|
|
* page lock is already held. To safely acquire the range lock the
|
|
|
|
* page lock must be dropped. This creates a window where another
|
|
|
|
* process could truncate, invalidate, dirty, or write out the page.
|
|
|
|
*
|
|
|
|
* Therefore, after successfully reacquiring the range and page locks
|
|
|
|
* the current page state is checked. In the common case everything
|
|
|
|
* will be as is expected and it can be written out. However, if
|
|
|
|
* the page state has changed it must be handled accordingly.
|
2015-01-07 03:54:57 +03:00
|
|
|
*/
|
2015-03-05 22:52:26 +03:00
|
|
|
mapping = pp->mapping;
|
|
|
|
redirty_page_for_writepage(wbc, pp);
|
2015-01-07 03:54:57 +03:00
|
|
|
unlock_page(pp);
|
2015-03-05 22:52:26 +03:00
|
|
|
|
2019-11-01 20:37:33 +03:00
|
|
|
zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
|
2018-10-02 01:13:12 +03:00
|
|
|
pgoff, pglen, RL_WRITER);
|
2015-03-05 22:52:26 +03:00
|
|
|
lock_page(pp);
|
|
|
|
|
|
|
|
/* Page mapping changed or it was no longer dirty, we're done */
|
|
|
|
if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
|
|
|
|
unlock_page(pp);
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2015-03-05 22:52:26 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Another process started write block if required */
|
|
|
|
if (PageWriteback(pp)) {
|
|
|
|
unlock_page(pp);
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2015-03-05 22:52:26 +03:00
|
|
|
|
2019-05-25 23:42:09 +03:00
|
|
|
if (wbc->sync_mode != WB_SYNC_NONE) {
|
|
|
|
if (PageWriteback(pp))
|
|
|
|
wait_on_page_bit(pp, PG_writeback);
|
|
|
|
}
|
2015-03-05 22:52:26 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2015-03-05 22:52:26 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Clear the dirty flag the required locks are held */
|
|
|
|
if (!clear_page_dirty_for_io(pp)) {
|
|
|
|
unlock_page(pp);
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2015-03-05 22:52:26 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Counterpart for redirty_page_for_writepage() above. This page
|
|
|
|
* was in fact not skipped and should not be counted as if it were.
|
|
|
|
*/
|
|
|
|
wbc->pages_skipped--;
|
2011-08-02 08:28:51 +04:00
|
|
|
set_page_writeback(pp);
|
2015-03-05 22:52:26 +03:00
|
|
|
unlock_page(pp);
|
2011-08-02 08:28:51 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2011-08-02 08:28:51 +04:00
|
|
|
dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
2015-01-07 03:54:57 +03:00
|
|
|
|
2009-01-16 00:59:39 +03:00
|
|
|
err = dmu_tx_assign(tx, TXG_NOWAIT);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (err != 0) {
|
2011-08-02 08:28:51 +04:00
|
|
|
if (err == ERESTART)
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_wait(tx);
|
2011-08-02 08:28:51 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
dmu_tx_abort(tx);
|
Only commit the ZIL once in zpl_writepages() (msync() case).
Currently, using msync() results in the following code path:
sys_msync -> zpl_fsync -> filemap_write_and_wait_range -> zpl_writepages -> write_cache_pages -> zpl_putpage
In such a code path, zil_commit() is called as part of zpl_putpage().
This means that for each page, the write is handed to the DMU, the ZIL
is committed, and only then do we move on to the next page. As one might
imagine, this results in atrocious performance where there is a large
number of pages to write: instead of committing a batch of N writes,
we do N commits containing one page each. In some extreme cases this
can result in msync() being ~700 times slower than it should be, as well
as very inefficient use of ZIL resources.
This patch fixes this issue by making sure that the requested writes
are batched and then committed only once. Unfortunately, the
implementation is somewhat non-trivial because there is no way to run
write_cache_pages in SYNC mode (so that we get all pages) without
making it wait on the writeback tag for each page.
The solution implemented here is composed of two parts:
- I added a new callback system to the ZIL, which allows the caller to
be notified when its ITX gets written to stable storage. One nice
thing is that the callback is called not only in zil_commit() but
in zil_sync() as well, which means that the caller doesn't have to
care whether the write ended up in the ZIL or the DMU: it will get
notified as soon as it's safe, period. This is an improvement over
dmu_tx_callback_register() that was used previously, which only
supports DMU writes. The rationale for this change is to allow
zpl_putpage() to be notified when a ZIL commit is completed without
having to block on zil_commit() itself.
- zpl_writepages() now calls write_cache_pages in non-SYNC mode, which
will prevent (1) write_cache_pages from blocking, and (2) zpl_putpage
from issuing ZIL commits. zpl_writepages() will issue the commit
itself instead of relying on zpl_putpage() to do it, thus nicely
batching the writes. Note, however, that we still have to call
write_cache_pages() again in SYNC mode because there is an edge case
documented in the implementation of write_cache_pages() whereas it
will not give us all dirty pages when running in non-SYNC mode. Thus
we need to run it at least once in SYNC mode to make sure we honor
persistency guarantees. This only happens when the pages are
modified at the same time msync() is running, which should be rare.
In most cases there won't be any additional pages and this second
call will do nothing.
Note that this change also fixes a bug related to #907 whereas calling
msync() on pages that were already handed over to the DMU in a previous
writepages() call would make msync() block until the next TXG sync
instead of returning as soon as the ZIL commit is complete. The new
callback system fixes that problem.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1849
Closes #907
2013-11-10 19:00:11 +04:00
|
|
|
__set_page_dirty_nobuffers(pp);
|
|
|
|
ClearPageError(pp);
|
|
|
|
end_page_writeback(pp);
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2011-08-02 08:28:51 +04:00
|
|
|
return (err);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-05-28 05:53:07 +04:00
|
|
|
va = kmap(pp);
|
2016-04-05 22:39:37 +03:00
|
|
|
ASSERT3U(pglen, <=, PAGE_SIZE);
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
|
2011-05-28 05:53:07 +04:00
|
|
|
kunmap(pp);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
|
|
|
|
&zp->z_pflags, 8);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2012-12-05 00:11:02 +04:00
|
|
|
/* Preserve the mtime and ctime provided by the inode */
|
|
|
|
ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
|
|
|
|
ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
|
2019-08-13 16:58:02 +03:00
|
|
|
zp->z_atime_dirty = B_FALSE;
|
2012-12-05 00:11:02 +04:00
|
|
|
zp->z_seq++;
|
|
|
|
|
|
|
|
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
|
Only commit the ZIL once in zpl_writepages() (msync() case).
Currently, using msync() results in the following code path:
sys_msync -> zpl_fsync -> filemap_write_and_wait_range -> zpl_writepages -> write_cache_pages -> zpl_putpage
In such a code path, zil_commit() is called as part of zpl_putpage().
This means that for each page, the write is handed to the DMU, the ZIL
is committed, and only then do we move on to the next page. As one might
imagine, this results in atrocious performance where there is a large
number of pages to write: instead of committing a batch of N writes,
we do N commits containing one page each. In some extreme cases this
can result in msync() being ~700 times slower than it should be, as well
as very inefficient use of ZIL resources.
This patch fixes this issue by making sure that the requested writes
are batched and then committed only once. Unfortunately, the
implementation is somewhat non-trivial because there is no way to run
write_cache_pages in SYNC mode (so that we get all pages) without
making it wait on the writeback tag for each page.
The solution implemented here is composed of two parts:
- I added a new callback system to the ZIL, which allows the caller to
be notified when its ITX gets written to stable storage. One nice
thing is that the callback is called not only in zil_commit() but
in zil_sync() as well, which means that the caller doesn't have to
care whether the write ended up in the ZIL or the DMU: it will get
notified as soon as it's safe, period. This is an improvement over
dmu_tx_callback_register() that was used previously, which only
supports DMU writes. The rationale for this change is to allow
zpl_putpage() to be notified when a ZIL commit is completed without
having to block on zil_commit() itself.
- zpl_writepages() now calls write_cache_pages in non-SYNC mode, which
will prevent (1) write_cache_pages from blocking, and (2) zpl_putpage
from issuing ZIL commits. zpl_writepages() will issue the commit
itself instead of relying on zpl_putpage() to do it, thus nicely
batching the writes. Note, however, that we still have to call
write_cache_pages() again in SYNC mode because there is an edge case
documented in the implementation of write_cache_pages() whereas it
will not give us all dirty pages when running in non-SYNC mode. Thus
we need to run it at least once in SYNC mode to make sure we honor
persistency guarantees. This only happens when the pages are
modified at the same time msync() is running, which should be rare.
In most cases there won't be any additional pages and this second
call will do nothing.
Note that this change also fixes a bug related to #907 whereas calling
msync() on pages that were already handed over to the DMU in a previous
writepages() call would make msync() block until the next TXG sync
instead of returning as soon as the ZIL commit is complete. The new
callback system fixes that problem.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1849
Closes #907
2013-11-10 19:00:11 +04:00
|
|
|
zfs_putpage_commit_cb, pp);
|
2009-08-18 22:43:27 +04:00
|
|
|
dmu_tx_commit(tx);
|
2012-12-05 00:11:02 +04:00
|
|
|
|
2019-10-04 01:54:29 +03:00
|
|
|
zfs_rangelock_exit(lr);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
Only commit the ZIL once in zpl_writepages() (msync() case).
Currently, using msync() results in the following code path:
sys_msync -> zpl_fsync -> filemap_write_and_wait_range -> zpl_writepages -> write_cache_pages -> zpl_putpage
In such a code path, zil_commit() is called as part of zpl_putpage().
This means that for each page, the write is handed to the DMU, the ZIL
is committed, and only then do we move on to the next page. As one might
imagine, this results in atrocious performance where there is a large
number of pages to write: instead of committing a batch of N writes,
we do N commits containing one page each. In some extreme cases this
can result in msync() being ~700 times slower than it should be, as well
as very inefficient use of ZIL resources.
This patch fixes this issue by making sure that the requested writes
are batched and then committed only once. Unfortunately, the
implementation is somewhat non-trivial because there is no way to run
write_cache_pages in SYNC mode (so that we get all pages) without
making it wait on the writeback tag for each page.
The solution implemented here is composed of two parts:
- I added a new callback system to the ZIL, which allows the caller to
be notified when its ITX gets written to stable storage. One nice
thing is that the callback is called not only in zil_commit() but
in zil_sync() as well, which means that the caller doesn't have to
care whether the write ended up in the ZIL or the DMU: it will get
notified as soon as it's safe, period. This is an improvement over
dmu_tx_callback_register() that was used previously, which only
supports DMU writes. The rationale for this change is to allow
zpl_putpage() to be notified when a ZIL commit is completed without
having to block on zil_commit() itself.
- zpl_writepages() now calls write_cache_pages in non-SYNC mode, which
will prevent (1) write_cache_pages from blocking, and (2) zpl_putpage
from issuing ZIL commits. zpl_writepages() will issue the commit
itself instead of relying on zpl_putpage() to do it, thus nicely
batching the writes. Note, however, that we still have to call
write_cache_pages() again in SYNC mode because there is an edge case
documented in the implementation of write_cache_pages() whereas it
will not give us all dirty pages when running in non-SYNC mode. Thus
we need to run it at least once in SYNC mode to make sure we honor
persistency guarantees. This only happens when the pages are
modified at the same time msync() is running, which should be rare.
In most cases there won't be any additional pages and this second
call will do nothing.
Note that this change also fixes a bug related to #907 whereas calling
msync() on pages that were already handed over to the DMU in a previous
writepages() call would make msync() block until the next TXG sync
instead of returning as soon as the ZIL commit is complete. The new
callback system fixes that problem.
Signed-off-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #1849
Closes #907
2013-11-10 19:00:11 +04:00
|
|
|
if (wbc->sync_mode != WB_SYNC_NONE) {
|
|
|
|
/*
|
|
|
|
* Note that this is rarely called under writepages(), because
|
|
|
|
* writepages() normally handles the entire commit for
|
|
|
|
* performance reasons.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
zil_commit(zfsvfs->z_log, zp->z_id);
|
2012-08-29 22:52:47 +04:00
|
|
|
}
|
2011-08-02 08:28:51 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2011-08-02 08:28:51 +04:00
|
|
|
return (err);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2012-12-12 04:58:44 +04:00
|
|
|
/*
|
|
|
|
* Update the system attributes when the inode has been dirtied. For the
|
2013-10-28 20:22:15 +04:00
|
|
|
* moment we only update the mode, atime, mtime, and ctime.
|
2012-12-12 04:58:44 +04:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
zfs_dirty_inode(struct inode *ip, int flags)
|
|
|
|
{
|
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2012-12-12 04:58:44 +04:00
|
|
|
dmu_tx_t *tx;
|
2013-10-28 20:22:15 +04:00
|
|
|
uint64_t mode, atime[2], mtime[2], ctime[2];
|
|
|
|
sa_bulk_attr_t bulk[4];
|
2016-04-01 02:52:03 +03:00
|
|
|
int error = 0;
|
2012-12-12 04:58:44 +04:00
|
|
|
int cnt = 0;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
|
2014-10-21 01:37:47 +04:00
|
|
|
return (0);
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2012-12-12 04:58:44 +04:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2016-04-01 02:52:03 +03:00
|
|
|
#ifdef I_DIRTY_TIME
|
|
|
|
/*
|
2019-09-03 03:56:41 +03:00
|
|
|
* This is the lazytime semantic introduced in Linux 4.0
|
2016-04-01 02:52:03 +03:00
|
|
|
* This flag will only be called from update_time when lazytime is set.
|
|
|
|
* (Note, I_DIRTY_SYNC will also set if not lazytime)
|
|
|
|
* Fortunately mtime and ctime are managed within ZFS itself, so we
|
|
|
|
* only need to dirty atime.
|
|
|
|
*/
|
|
|
|
if (flags == I_DIRTY_TIME) {
|
2019-08-13 16:58:02 +03:00
|
|
|
zp->z_atime_dirty = B_TRUE;
|
2016-04-01 02:52:03 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
tx = dmu_tx_create(zfsvfs->z_os);
|
2012-12-12 04:58:44 +04:00
|
|
|
|
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
|
|
|
|
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_enter(&zp->z_lock);
|
2019-08-13 16:58:02 +03:00
|
|
|
zp->z_atime_dirty = B_FALSE;
|
2016-04-01 02:52:03 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
|
|
|
|
SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
|
2012-12-12 04:58:44 +04:00
|
|
|
|
2013-10-28 20:22:15 +04:00
|
|
|
/* Preserve the mode, mtime and ctime provided by the inode */
|
2012-12-12 04:58:44 +04:00
|
|
|
ZFS_TIME_ENCODE(&ip->i_atime, atime);
|
|
|
|
ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
|
|
|
|
ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
|
2013-10-28 20:22:15 +04:00
|
|
|
mode = ip->i_mode;
|
|
|
|
|
|
|
|
zp->z_mode = mode;
|
2012-12-12 04:58:44 +04:00
|
|
|
|
|
|
|
error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
|
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
out:
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2012-12-12 04:58:44 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
/*ARGSUSED*/
|
|
|
|
void
|
2011-02-03 21:34:05 +03:00
|
|
|
zfs_inactive(struct inode *ip)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-03 21:34:05 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale), but it doesn't touch z_atime. So after read(2) and write(2),
you'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
uint64_t atime[2];
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
2015-09-01 15:02:48 +03:00
|
|
|
int need_unlock = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2015-09-01 15:02:48 +03:00
|
|
|
/* Only read lock if we haven't already write locked, e.g. rollback */
|
2017-03-08 03:21:37 +03:00
|
|
|
if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
|
2015-09-01 15:02:48 +03:00
|
|
|
need_unlock = 1;
|
2017-03-08 03:21:37 +03:00
|
|
|
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
|
2015-09-01 15:02:48 +03:00
|
|
|
}
|
2011-02-03 21:34:05 +03:00
|
|
|
if (zp->z_sa_hdl == NULL) {
|
2015-09-01 15:02:48 +03:00
|
|
|
if (need_unlock)
|
2017-03-08 03:21:37 +03:00
|
|
|
rw_exit(&zfsvfs->z_teardown_inactive_lock);
|
2011-02-03 21:34:05 +03:00
|
|
|
return;
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2019-08-13 16:58:02 +03:00
|
|
|
if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
|
2017-03-08 03:21:37 +03:00
|
|
|
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2010-05-29 00:45:14 +04:00
|
|
|
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
|
|
|
|
zfs_sa_upgrade_txholds(tx, zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
error = dmu_tx_assign(tx, TXG_WAIT);
|
|
|
|
if (error) {
|
|
|
|
dmu_tx_abort(tx);
|
|
|
|
} else {
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also, zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale), but doesn't touch z_atime. So after read(2) and write(2),
you'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seem to want to override the mount option. Not to mention that the
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have an effect. The setting in
the dataset won't do anything. We will make zfsutil mount ZFS with MS_RELATIME
set according to the dataset setting in a future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
ZFS_TIME_ENCODE(&ip->i_atime, atime);
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_enter(&zp->z_lock);
|
2017-03-08 03:21:37 +03:00
|
|
|
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
|
Fix atime handling and relatime
The problem for atime:
We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.
zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.
Also, zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale), but doesn't touch z_atime. So after read(2) and write(2),
you'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.
Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside z_atime. So now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all be gone after umount, and you'll
be left with a stale atime.
The problem for relatime:
We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seem to want to override the mount option. Not to mention that the
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.
How Linux handles atime:
The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().
And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.
What this patch tries to do:
We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().
After this patch, only the MS_RELATIME flag will have an effect. The setting in
the dataset won't do anything. We will make zfsutil mount ZFS with MS_RELATIME
set according to the dataset setting in a future patch.
Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4482
2016-03-30 03:53:34 +03:00
|
|
|
(void *)&atime, sizeof (atime), tx);
|
2019-08-13 16:58:02 +03:00
|
|
|
zp->z_atime_dirty = B_FALSE;
|
2008-11-20 23:01:55 +03:00
|
|
|
mutex_exit(&zp->z_lock);
|
|
|
|
dmu_tx_commit(tx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
zfs_zinactive(zp);
|
2015-09-01 15:02:48 +03:00
|
|
|
if (need_unlock)
|
2017-03-08 03:21:37 +03:00
|
|
|
rw_exit(&zfsvfs->z_teardown_inactive_lock);
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bounds-check the seek operation.
|
|
|
|
*
|
2011-02-08 22:16:06 +03:00
|
|
|
* IN: ip - inode seeking within
|
2008-11-20 23:01:55 +03:00
|
|
|
* ooff - old file offset
|
|
|
|
* noffp - pointer to new file offset
|
|
|
|
*
|
|
|
|
* RETURN: 0 if success
|
|
|
|
* EINVAL if new offset invalid
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2011-02-08 22:16:06 +03:00
|
|
|
int
|
2011-03-02 03:24:21 +03:00
|
|
|
zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
if (S_ISDIR(ip->i_mode))
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-05-28 05:53:07 +04:00
|
|
|
* Fill pages with data from the disk.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
static int
|
2011-05-28 05:53:07 +04:00
|
|
|
zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2013-11-01 23:26:11 +04:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2013-11-01 23:26:11 +04:00
|
|
|
objset_t *os;
|
2011-05-28 05:53:07 +04:00
|
|
|
struct page *cur_pp;
|
2013-11-01 23:26:11 +04:00
|
|
|
u_offset_t io_off, total;
|
|
|
|
size_t io_len;
|
|
|
|
loff_t i_size;
|
|
|
|
unsigned page_idx;
|
|
|
|
int err;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
os = zfsvfs->z_os;
|
2016-04-05 22:39:37 +03:00
|
|
|
io_len = nr_pages << PAGE_SHIFT;
|
2011-05-28 05:53:07 +04:00
|
|
|
i_size = i_size_read(ip);
|
|
|
|
io_off = page_offset(pl[0]);
|
|
|
|
|
|
|
|
if (io_off + io_len > i_size)
|
|
|
|
io_len = i_size - io_off;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
/*
|
2011-05-28 05:53:07 +04:00
|
|
|
* Iterate over list of pages and read each page individually.
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
2011-05-28 05:53:07 +04:00
|
|
|
page_idx = 0;
|
2008-11-20 23:01:55 +03:00
|
|
|
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
|
2009-02-18 23:51:31 +03:00
|
|
|
caddr_t va;
|
|
|
|
|
2016-05-28 01:39:34 +03:00
|
|
|
cur_pp = pl[page_idx++];
|
2011-05-28 05:53:07 +04:00
|
|
|
va = kmap(cur_pp);
|
2009-07-03 02:44:48 +04:00
|
|
|
err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
|
|
|
|
DMU_READ_PREFETCH);
|
2011-05-28 05:53:07 +04:00
|
|
|
kunmap(cur_pp);
|
2008-11-20 23:01:55 +03:00
|
|
|
if (err) {
|
2008-12-03 23:09:06 +03:00
|
|
|
/* convert checksum errors into IO errors */
|
|
|
|
if (err == ECKSUM)
|
2013-03-08 22:41:28 +04:00
|
|
|
err = SET_ERROR(EIO);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
}
|
2009-02-18 23:51:31 +03:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-05-28 05:53:07 +04:00
|
|
|
* Uses zfs_fillpage to read data from the file and fill the pages.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2011-05-28 05:53:07 +04:00
|
|
|
* IN: ip - inode of file to get data from.
|
|
|
|
* pl - list of pages to read
|
|
|
|
* nr_pages - number of pages to read
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
|
|
|
* vp - atime updated
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2011-05-28 05:53:07 +04:00
|
|
|
int
|
|
|
|
zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-05-28 05:53:07 +04:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2011-05-28 05:53:07 +04:00
|
|
|
int err;
|
2009-02-18 23:51:31 +03:00
|
|
|
|
|
|
|
if (pl == NULL)
|
|
|
|
return (0);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2011-05-28 05:53:07 +04:00
|
|
|
err = zfs_fillpage(ip, pl, nr_pages);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (err);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-07-01 22:25:07 +04:00
|
|
|
* Check ZFS specific permissions to memory map a section of a file.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2011-07-01 22:25:07 +04:00
|
|
|
* IN: ip - inode of the file to mmap
|
|
|
|
* off - file offset
|
|
|
|
* addrp - start address in memory region
|
|
|
|
* len - length of memory region
|
|
|
|
* vm_flags- address flags
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2011-07-01 22:25:07 +04:00
|
|
|
* RETURN: 0 if success
|
|
|
|
* error code if failure
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
2011-07-01 22:25:07 +04:00
|
|
|
int
|
|
|
|
zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
|
|
|
|
unsigned long vm_flags)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-07-01 22:25:07 +04:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
2011-07-01 22:25:07 +04:00
|
|
|
if ((vm_flags & VM_WRITE) && (zp->z_pflags &
|
2010-05-29 00:45:14 +04:00
|
|
|
(ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EPERM));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-07-01 22:25:07 +04:00
|
|
|
if ((vm_flags & (VM_READ | VM_EXEC)) &&
|
2010-05-29 00:45:14 +04:00
|
|
|
(zp->z_pflags & ZFS_AV_QUARANTINED)) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EACCES));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (off < 0 || len > MAXOFFSET_T - off) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(ENXIO));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free or allocate space in a file. Currently, this function only
|
|
|
|
* supports the `F_FREESP' command. However, this command is somewhat
|
|
|
|
* misnamed, as its functionality includes the ability to allocate as
|
|
|
|
* well as free space.
|
|
|
|
*
|
2019-12-11 22:53:57 +03:00
|
|
|
* IN: zp - znode of file to free data in.
|
2008-11-20 23:01:55 +03:00
|
|
|
* cmd - action to take (only F_FREESP supported).
|
|
|
|
* bfp - section of file to free/alloc.
|
|
|
|
* flag - current file open mode flags.
|
|
|
|
* offset - current file offset.
|
2019-04-19 22:03:32 +03:00
|
|
|
* cr - credentials of caller.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
2013-06-11 21:12:34 +04:00
|
|
|
* RETURN: 0 on success, error code on failure.
|
2008-11-20 23:01:55 +03:00
|
|
|
*
|
|
|
|
* Timestamps:
|
2019-12-11 22:53:57 +03:00
|
|
|
* zp - ctime|mtime updated
|
2008-11-20 23:01:55 +03:00
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
|
2011-02-08 22:16:06 +03:00
|
|
|
offset_t offset, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t off, len;
|
|
|
|
int error;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
|
|
|
|
if (cmd != F_FREESP) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2016-01-11 01:31:24 +03:00
|
|
|
/*
|
|
|
|
* Callers might not be able to detect properly that we are read-only,
|
|
|
|
* so check it explicitly here.
|
|
|
|
*/
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfs_is_readonly(zfsvfs)) {
|
|
|
|
ZFS_EXIT(zfsvfs);
|
2016-01-11 01:31:24 +03:00
|
|
|
return (SET_ERROR(EROFS));
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
if (bfp->l_len < 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2008-11-20 23:01:55 +03:00
|
|
|
}
|
|
|
|
|
2011-09-02 11:37:53 +04:00
|
|
|
/*
|
|
|
|
* Permissions aren't checked on Solaris because on this OS
|
|
|
|
* zfs_space() can only be called with an opened file handle.
|
|
|
|
* On Linux we can get here through truncate_range() which
|
|
|
|
* operates directly on inodes, so we need to check access rights.
|
|
|
|
*/
|
|
|
|
if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2011-09-02 11:37:53 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
off = bfp->l_start;
|
|
|
|
len = bfp->l_len; /* 0 means from off to end of file */
|
|
|
|
|
2008-12-03 23:09:06 +03:00
|
|
|
error = zfs_freesp(zp, off, len, flag, TRUE);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_fid(struct inode *ip, fid_t *fidp)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
uint32_t gen;
|
2010-05-29 00:45:14 +04:00
|
|
|
uint64_t gen64;
|
2008-11-20 23:01:55 +03:00
|
|
|
uint64_t object = zp->z_id;
|
|
|
|
zfid_short_t *zfid;
|
2010-05-29 00:45:14 +04:00
|
|
|
int size, i, error;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
|
2010-05-29 00:45:14 +04:00
|
|
|
&gen64, sizeof (uint64_t))) != 0) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
gen = (uint32_t)gen64;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 20:26:33 +03:00
|
|
|
size = SHORT_FID_LEN;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
zfid = (zfid_short_t *)fidp;
|
|
|
|
|
|
|
|
zfid->zf_len = size;
|
|
|
|
|
|
|
|
for (i = 0; i < sizeof (zfid->zf_object); i++)
|
|
|
|
zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
|
|
|
|
|
|
|
|
/* Must have a non-zero generation number to distinguish from .zfs */
|
|
|
|
if (gen == 0)
|
|
|
|
gen = 1;
|
|
|
|
for (i = 0; i < sizeof (zfid->zf_gen); i++)
|
|
|
|
zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
|
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
error = zfs_getacl(zp, vsecp, skipaclchk, cr);
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
2010-12-17 22:18:08 +03:00
|
|
|
int
|
2019-12-11 22:53:57 +03:00
|
|
|
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
|
2008-11-20 23:01:55 +03:00
|
|
|
{
|
2019-12-11 22:53:57 +03:00
|
|
|
zfsvfs_t *zfsvfs = ZTOZSB(zp);
|
2008-11-20 23:01:55 +03:00
|
|
|
int error;
|
|
|
|
boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
|
2017-03-08 03:21:37 +03:00
|
|
|
zilog_t *zilog = zfsvfs->z_log;
|
2008-11-20 23:01:55 +03:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2008-11-20 23:01:55 +03:00
|
|
|
error = zfs_setacl(zp, vsecp, skipaclchk, cr);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
|
2010-08-27 01:24:34 +04:00
|
|
|
zil_commit(zilog, 0);
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2008-11-20 23:01:55 +03:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
#ifdef HAVE_UIO_ZEROCOPY
|
2010-05-29 00:45:14 +04:00
|
|
|
/*
 * Lower bound on the block size used when deciding whether to loan out
 * an arcbuf for a read.  Must be a power of 2.
 */
int zcr_blksz_min = 1024;		/* 1K */

/*
 * Upper bound on that block size.  When this is set below the file block
 * size, an arcbuf may be loaned out for a partial block read.  Must be a
 * power of 2.
 */
int zcr_blksz_max = 128 * 1024;		/* 128K */
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
2011-02-08 22:16:06 +03:00
|
|
|
znode_t *zp = ITOZ(ip);
|
2017-03-08 03:21:37 +03:00
|
|
|
zfsvfs_t *zfsvfs = ITOZSB(ip);
|
|
|
|
int max_blksz = zfsvfs->z_max_blksz;
|
2010-05-29 00:45:14 +04:00
|
|
|
uio_t *uio = &xuio->xu_uio;
|
|
|
|
ssize_t size = uio->uio_resid;
|
|
|
|
offset_t offset = uio->uio_loffset;
|
|
|
|
int blksz;
|
|
|
|
int fullblk, i;
|
|
|
|
arc_buf_t *abuf;
|
|
|
|
ssize_t maxsize;
|
|
|
|
int preamble, postamble;
|
|
|
|
|
|
|
|
if (xuio->xu_type != UIOTYPE_ZEROCOPY)
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-05-29 00:45:14 +04:00
|
|
|
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_ENTER(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
ZFS_VERIFY_ZP(zp);
|
|
|
|
switch (ioflag) {
|
|
|
|
case UIO_WRITE:
|
|
|
|
/*
|
|
|
|
* Loan out an arc_buf for write if write size is bigger than
|
|
|
|
* max_blksz, and the file's block size is also max_blksz.
|
|
|
|
*/
|
|
|
|
blksz = max_blksz;
|
|
|
|
if (size < blksz || zp->z_blksz != blksz) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Caller requests buffers for write before knowing where the
|
|
|
|
* write offset might be (e.g. NFS TCP write).
|
|
|
|
*/
|
|
|
|
if (offset == -1) {
|
|
|
|
preamble = 0;
|
|
|
|
} else {
|
|
|
|
preamble = P2PHASE(offset, blksz);
|
|
|
|
if (preamble) {
|
|
|
|
preamble = blksz - preamble;
|
|
|
|
size -= preamble;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
postamble = P2PHASE(size, blksz);
|
|
|
|
size -= postamble;
|
|
|
|
|
|
|
|
fullblk = size / blksz;
|
|
|
|
(void) dmu_xuio_init(xuio,
|
|
|
|
(preamble != 0) + fullblk + (postamble != 0));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Have to fix iov base/len for partial buffers. They
|
|
|
|
* currently represent full arc_buf's.
|
|
|
|
*/
|
|
|
|
if (preamble) {
|
|
|
|
/* data begins in the middle of the arc_buf */
|
|
|
|
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
blksz);
|
|
|
|
ASSERT(abuf);
|
|
|
|
(void) dmu_xuio_add(xuio, abuf,
|
|
|
|
blksz - preamble, preamble);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < fullblk; i++) {
|
|
|
|
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
blksz);
|
|
|
|
ASSERT(abuf);
|
|
|
|
(void) dmu_xuio_add(xuio, abuf, 0, blksz);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (postamble) {
|
|
|
|
/* data ends in the middle of the arc_buf */
|
|
|
|
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
|
|
|
|
blksz);
|
|
|
|
ASSERT(abuf);
|
|
|
|
(void) dmu_xuio_add(xuio, abuf, 0, postamble);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case UIO_READ:
|
|
|
|
/*
|
|
|
|
* Loan out an arc_buf for read if the read size is larger than
|
|
|
|
* the current file block size. Block alignment is not
|
|
|
|
* considered. Partial arc_buf will be loaned out for read.
|
|
|
|
*/
|
|
|
|
blksz = zp->z_blksz;
|
|
|
|
if (blksz < zcr_blksz_min)
|
|
|
|
blksz = zcr_blksz_min;
|
|
|
|
if (blksz > zcr_blksz_max)
|
|
|
|
blksz = zcr_blksz_max;
|
|
|
|
/* avoid potential complexity of dealing with it */
|
|
|
|
if (blksz > max_blksz) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
maxsize = zp->z_size - uio->uio_loffset;
|
|
|
|
if (size > maxsize)
|
|
|
|
size = maxsize;
|
|
|
|
|
2011-02-08 22:16:06 +03:00
|
|
|
if (size < blksz) {
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2013-03-08 22:41:28 +04:00
|
|
|
return (SET_ERROR(EINVAL));
|
2010-05-29 00:45:14 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
uio->uio_extflg = UIO_XUIO;
|
|
|
|
XUIO_XUZC_RW(xuio) = ioflag;
|
2017-03-08 03:21:37 +03:00
|
|
|
ZFS_EXIT(zfsvfs);
|
2010-05-29 00:45:14 +04:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
2011-02-08 22:16:06 +03:00
|
|
|
zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
|
2010-05-29 00:45:14 +04:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
arc_buf_t *abuf;
|
|
|
|
int ioflag = XUIO_XUZC_RW(xuio);
|
|
|
|
|
|
|
|
ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
|
|
|
|
|
|
|
|
i = dmu_xuio_cnt(xuio);
|
|
|
|
while (i-- > 0) {
|
|
|
|
abuf = dmu_xuio_arcbuf(xuio, i);
|
|
|
|
/*
|
|
|
|
* if abuf == NULL, it must be a write buffer
|
|
|
|
* that has been returned in zfs_write().
|
|
|
|
*/
|
|
|
|
if (abuf)
|
|
|
|
dmu_return_arcbuf(abuf);
|
|
|
|
ASSERT(abuf || ioflag == UIO_WRITE);
|
|
|
|
}
|
|
|
|
|
|
|
|
dmu_xuio_fini(xuio);
|
|
|
|
return (0);
|
|
|
|
}
|
2011-02-08 22:16:06 +03:00
|
|
|
#endif /* HAVE_UIO_ZEROCOPY */
|
2011-05-04 02:09:28 +04:00
|
|
|
|
2018-02-16 04:53:18 +03:00
|
|
|
#ifdef _KERNEL
/*
 * Symbols from this file that are exported from the kernel module.
 */
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
EXPORT_SYMBOL(zfs_symlink);
EXPORT_SYMBOL(zfs_readlink);
EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
EXPORT_SYMBOL(zfs_map);

/*
 * Tunables exposed as module parameters (type ulong, mode 0644).
 */
/* BEGIN CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");

module_param(zfs_read_chunk_size, ulong, 0644);
MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
/* END CSTYLED */

#endif /* _KERNEL */
|