2023-05-16 06:30:26 +03:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or https://opensource.org/licenses/CDDL-1.0.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
|
|
|
|
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
|
|
|
|
* Copyright (c) 2022 by Pawel Jakub Dawidek
|
|
|
|
* Copyright (c) 2023, Klara Inc.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <sys/zfs_context.h>
|
|
|
|
#include <sys/spa.h>
|
|
|
|
#include <sys/spa_impl.h>
|
|
|
|
#include <sys/ddt.h>
|
2023-06-30 06:35:18 +03:00
|
|
|
#include <sys/ddt_impl.h>
|
2023-05-16 06:30:26 +03:00
|
|
|
|
|
|
|
static void
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
|
|
|
|
ddt_stat_t *dds)
|
2023-05-16 06:30:26 +03:00
|
|
|
{
|
|
|
|
spa_t *spa = ddt->ddt_spa;
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key);
|
|
|
|
uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key);
|
2023-05-16 06:30:26 +03:00
|
|
|
|
|
|
|
memset(dds, 0, sizeof (*dds));
|
|
|
|
|
ddt: dedup log
Adds a log/journal to dedup. At the end of txg, instead of writing the
entry directly to the ZAP, instead its adding to an in-memory tree and
appended to an on-disk object. The on-disk object is only read at
import, to reload the in-memory tree.
Lookups first go the the log tree before going to the ZAP, so
recently-used entries will remain close by in memory. This vastly
reduces overhead from dedup IO, as it will not have to do so many
read/update/write cycles on ZAP leaf nodes.
A flushing facility is added at end of txg, to push logged entries out
to the ZAP. There's actually two separate "logs" (in-memory tree and
on-disk object), one active (recieving updated entries) and one flushing
(writing out to disk). These are swapped (ie flushing begins) based on
memory used by the in-memory log trees and time since we last flushed
something.
The flushing facility monitors the amount of entries coming in and being
flushed out, and calibrates itself to try to flush enough each txg to
keep up with the ingest rate without competing too much with other IO.
Multiple tuneables are provided to control the flushing facility.
All the histograms and stats are update to accomodate the log as a
separate entry store. zdb gains knowledge of how to count them and dump
them. Documentation included!
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-22 10:46:22 +03:00
|
|
|
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
|
ddt: add "flat phys" feature
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, copies=) parameter. Each of these are tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk, but also a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.
This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.
Because a single phys is no longer an all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.
Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
2023-06-20 04:09:48 +03:00
|
|
|
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
|
2023-07-03 08:16:02 +03:00
|
|
|
|
ddt: add "flat phys" feature
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, copies=) parameter. Each of these are tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk, but also a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.
This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.
Because a single phys is no longer an all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.
Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
2023-06-20 04:09:48 +03:00
|
|
|
if (ddt_phys_birth(ddp, v) == 0)
|
2023-05-16 06:30:26 +03:00
|
|
|
continue;
|
|
|
|
|
ddt: add "flat phys" feature
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, copies=) parameter. Each of these are tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk, but also a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.
This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.
Because a single phys is no longer an all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.
Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
2023-06-20 04:09:48 +03:00
|
|
|
int ndvas = ddt_phys_dva_count(ddp, v,
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
DDK_GET_CRYPT(&ddlwe->ddlwe_key));
|
ddt: add "flat phys" feature
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, copies=) parameter. Each of these are tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk, but also a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.
This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.
Because a single phys is no longer an all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.
Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
2023-06-20 04:09:48 +03:00
|
|
|
const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ?
|
|
|
|
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva;
|
|
|
|
|
|
|
|
uint64_t dsize = 0;
|
2023-07-03 08:25:06 +03:00
|
|
|
for (int d = 0; d < ndvas; d++)
|
ddt: add "flat phys" feature
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, copies=) parameter. Each of these are tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk, but also a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.
This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.
Because a single phys is no longer an all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.
Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
2023-06-20 04:09:48 +03:00
|
|
|
dsize += dva_get_dsize_sync(spa, &dvas[d]);
|
|
|
|
|
|
|
|
uint64_t refcnt = ddt_phys_refcnt(ddp, v);
|
2023-05-16 06:30:26 +03:00
|
|
|
|
|
|
|
dds->dds_blocks += 1;
|
|
|
|
dds->dds_lsize += lsize;
|
|
|
|
dds->dds_psize += psize;
|
|
|
|
dds->dds_dsize += dsize;
|
|
|
|
|
|
|
|
dds->dds_ref_blocks += refcnt;
|
|
|
|
dds->dds_ref_lsize += lsize * refcnt;
|
|
|
|
dds->dds_ref_psize += psize * refcnt;
|
|
|
|
dds->dds_ref_dsize += dsize * refcnt;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
static void
|
|
|
|
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src)
|
|
|
|
{
|
|
|
|
dst->dds_blocks += src->dds_blocks;
|
|
|
|
dst->dds_lsize += src->dds_lsize;
|
|
|
|
dst->dds_psize += src->dds_psize;
|
|
|
|
dst->dds_dsize += src->dds_dsize;
|
|
|
|
dst->dds_ref_blocks += src->dds_ref_blocks;
|
|
|
|
dst->dds_ref_lsize += src->dds_ref_lsize;
|
|
|
|
dst->dds_ref_psize += src->dds_ref_psize;
|
|
|
|
dst->dds_ref_dsize += src->dds_ref_dsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src)
|
|
|
|
{
|
|
|
|
/* This caught more during development than you might expect... */
|
|
|
|
ASSERT3U(dst->dds_blocks, >=, src->dds_blocks);
|
|
|
|
ASSERT3U(dst->dds_lsize, >=, src->dds_lsize);
|
|
|
|
ASSERT3U(dst->dds_psize, >=, src->dds_psize);
|
|
|
|
ASSERT3U(dst->dds_dsize, >=, src->dds_dsize);
|
|
|
|
ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks);
|
|
|
|
ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize);
|
|
|
|
ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize);
|
|
|
|
ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize);
|
|
|
|
|
|
|
|
dst->dds_blocks -= src->dds_blocks;
|
|
|
|
dst->dds_lsize -= src->dds_lsize;
|
|
|
|
dst->dds_psize -= src->dds_psize;
|
|
|
|
dst->dds_dsize -= src->dds_dsize;
|
|
|
|
dst->dds_ref_blocks -= src->dds_ref_blocks;
|
|
|
|
dst->dds_ref_lsize -= src->dds_ref_lsize;
|
|
|
|
dst->dds_ref_psize -= src->dds_ref_psize;
|
|
|
|
dst->dds_ref_dsize -= src->dds_ref_dsize;
|
|
|
|
}
|
|
|
|
|
2023-05-16 06:30:26 +03:00
|
|
|
void
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
|
|
|
|
const ddt_lightweight_entry_t *ddlwe)
|
2023-05-16 06:30:26 +03:00
|
|
|
{
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_t dds;
|
|
|
|
int bucket;
|
2023-05-16 06:30:26 +03:00
|
|
|
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_generate(ddt, ddlwe, &dds);
|
|
|
|
|
|
|
|
bucket = highbit64(dds.dds_ref_blocks) - 1;
|
|
|
|
if (bucket < 0)
|
|
|
|
return;
|
2023-05-16 06:30:26 +03:00
|
|
|
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_add(&ddh->ddh_stat[bucket], &dds);
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
|
|
|
|
const ddt_lightweight_entry_t *ddlwe)
|
2023-05-16 06:30:26 +03:00
|
|
|
{
|
|
|
|
ddt_stat_t dds;
|
|
|
|
int bucket;
|
|
|
|
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_generate(ddt, ddlwe, &dds);
|
2023-05-16 06:30:26 +03:00
|
|
|
|
|
|
|
bucket = highbit64(dds.dds_ref_blocks) - 1;
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
if (bucket < 0)
|
|
|
|
return;
|
2023-05-16 06:30:26 +03:00
|
|
|
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_sub(&ddh->ddh_stat[bucket], &dds);
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
|
|
|
|
{
|
|
|
|
for (int h = 0; h < 64; h++)
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]);
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh)
|
2023-05-16 06:30:26 +03:00
|
|
|
{
|
|
|
|
memset(dds, 0, sizeof (*dds));
|
|
|
|
|
|
|
|
for (int h = 0; h < 64; h++)
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_stat_add(dds, &ddh->ddh_stat[h]);
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
boolean_t
|
|
|
|
ddt_histogram_empty(const ddt_histogram_t *ddh)
|
|
|
|
{
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
for (int h = 0; h < 64; h++) {
|
|
|
|
const ddt_stat_t *dds = &ddh->ddh_stat[h];
|
|
|
|
|
|
|
|
if (dds->dds_blocks == 0 &&
|
|
|
|
dds->dds_lsize == 0 &&
|
|
|
|
dds->dds_psize == 0 &&
|
|
|
|
dds->dds_dsize == 0 &&
|
|
|
|
dds->dds_ref_blocks == 0 &&
|
|
|
|
dds->dds_ref_lsize == 0 &&
|
|
|
|
dds->dds_ref_psize == 0 &&
|
|
|
|
dds->dds_ref_dsize == 0)
|
|
|
|
continue;
|
2023-05-16 06:30:26 +03:00
|
|
|
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
return (B_FALSE);
|
|
|
|
}
|
2023-05-16 06:30:26 +03:00
|
|
|
|
|
|
|
return (B_TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
|
|
|
|
{
|
ddt: dedup table quota enforcement
This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool
When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (ie their
refcounts changed), as that reuses the space rather than requiring more.
dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation device. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.
Sponsored-by: iXsystems, Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
2024-07-25 19:47:36 +03:00
|
|
|
memset(ddo_total, 0, sizeof (*ddo_total));
|
|
|
|
|
2023-05-16 06:30:26 +03:00
|
|
|
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
|
|
|
|
ddt_t *ddt = spa->spa_ddt[c];
|
2023-06-15 09:10:00 +03:00
|
|
|
if (!ddt)
|
|
|
|
continue;
|
|
|
|
|
2023-07-03 05:32:53 +03:00
|
|
|
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
|
|
|
|
for (ddt_class_t class = 0; class < DDT_CLASSES;
|
2023-05-16 06:30:26 +03:00
|
|
|
class++) {
|
ddt: dedup table quota enforcement
This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool
When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (ie their
refcounts changed), as that reuses the space rather than requiring more.
dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation device. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.
Sponsored-by: iXsystems, Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
2024-07-25 19:47:36 +03:00
|
|
|
dmu_object_info_t doi;
|
|
|
|
uint64_t cnt;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These stats were originally calculated
|
|
|
|
* during ddt_object_load().
|
|
|
|
*/
|
|
|
|
|
|
|
|
err = ddt_object_info(ddt, type, class, &doi);
|
|
|
|
if (err != 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
err = ddt_object_count(ddt, type, class, &cnt);
|
|
|
|
if (err != 0)
|
|
|
|
continue;
|
|
|
|
|
2023-05-16 06:30:26 +03:00
|
|
|
ddt_object_t *ddo =
|
|
|
|
&ddt->ddt_object_stats[type][class];
|
ddt: dedup table quota enforcement
This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool
When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (ie their
refcounts changed), as that reuses the space rather than requiring more.
dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation device. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.
Sponsored-by: iXsystems, Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
2024-07-25 19:47:36 +03:00
|
|
|
|
|
|
|
ddo->ddo_count = cnt;
|
|
|
|
ddo->ddo_dspace =
|
|
|
|
doi.doi_physical_blocks_512 << 9;
|
|
|
|
ddo->ddo_mspace = doi.doi_fill_count *
|
|
|
|
doi.doi_data_block_size;
|
|
|
|
|
2023-05-16 06:30:26 +03:00
|
|
|
ddo_total->ddo_count += ddo->ddo_count;
|
|
|
|
ddo_total->ddo_dspace += ddo->ddo_dspace;
|
|
|
|
ddo_total->ddo_mspace += ddo->ddo_mspace;
|
|
|
|
}
|
|
|
|
}
|
ddt: dedup log
Adds a log/journal to dedup. At the end of txg, instead of writing the
entry directly to the ZAP, instead its adding to an in-memory tree and
appended to an on-disk object. The on-disk object is only read at
import, to reload the in-memory tree.
Lookups first go the the log tree before going to the ZAP, so
recently-used entries will remain close by in memory. This vastly
reduces overhead from dedup IO, as it will not have to do so many
read/update/write cycles on ZAP leaf nodes.
A flushing facility is added at end of txg, to push logged entries out
to the ZAP. There's actually two separate "logs" (in-memory tree and
on-disk object), one active (recieving updated entries) and one flushing
(writing out to disk). These are swapped (ie flushing begins) based on
memory used by the in-memory log trees and time since we last flushed
something.
The flushing facility monitors the amount of entries coming in and being
flushed out, and calibrates itself to try to flush enough each txg to
keep up with the ingest rate without competing too much with other IO.
Multiple tuneables are provided to control the flushing facility.
All the histograms and stats are update to accomodate the log as a
separate entry store. zdb gains knowledge of how to count them and dump
them. Documentation included!
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-22 10:46:22 +03:00
|
|
|
|
|
|
|
ddt_object_t *ddo = &ddt->ddt_log_stats;
|
|
|
|
ddo_total->ddo_count += ddo->ddo_count;
|
|
|
|
ddo_total->ddo_dspace += ddo->ddo_dspace;
|
|
|
|
ddo_total->ddo_mspace += ddo->ddo_mspace;
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
|
ddt: dedup table quota enforcement
This adds two new pool properties:
- dedup_table_size, the total size of all DDTs on the pool; and
- dedup_table_quota, the maximum possible size of all DDTs in the pool
When set, quota will be enforced by checking when a new entry is about
to be created. If the pool is over its dedup quota, the entry won't be
created, and the corresponding write will be converted to a regular
non-dedup write. Note that existing entries can be updated (ie their
refcounts changed), as that reuses the space rather than requiring more.
dedup_table_quota can be set to 'auto', which will set it based on the
size of the devices backing the "dedup" allocation device. This makes it
possible to limit the DDTs to the size of a dedup vdev only, such that
when the device fills, no new blocks are deduplicated.
Sponsored-by: iXsystems, Inc.
Sponsored-By: Klara Inc.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Don Brady <don.brady@klarasystems.com>
Co-authored-by: Rob Wing <rob.wing@klarasystems.com>
Co-authored-by: Sean Eric Fagan <sean.fagan@klarasystems.com>
Closes #15889
2024-07-25 19:47:36 +03:00
|
|
|
/*
|
|
|
|
* This returns raw counts (not averages). One of the consumers,
|
|
|
|
* print_dedup_stats(), historically has expected raw counts.
|
|
|
|
*/
|
|
|
|
|
|
|
|
spa->spa_dedup_dsize = ddo_total->ddo_dspace;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
ddt_get_ddt_dsize(spa_t *spa)
|
|
|
|
{
|
|
|
|
ddt_object_t ddo_total;
|
|
|
|
|
|
|
|
/* recalculate after each txg sync */
|
|
|
|
if (spa->spa_dedup_dsize == ~0ULL)
|
|
|
|
ddt_get_dedup_object_stats(spa, &ddo_total);
|
|
|
|
|
|
|
|
return (spa->spa_dedup_dsize);
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
|
|
|
|
{
|
|
|
|
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
|
|
|
|
ddt_t *ddt = spa->spa_ddt[c];
|
2023-06-15 09:10:00 +03:00
|
|
|
if (!ddt)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
|
2023-07-03 05:32:53 +03:00
|
|
|
for (ddt_class_t class = 0; class < DDT_CLASSES;
|
2023-05-16 06:30:26 +03:00
|
|
|
class++) {
|
|
|
|
ddt_histogram_add(ddh,
|
|
|
|
&ddt->ddt_histogram_cache[type][class]);
|
|
|
|
}
|
|
|
|
}
|
ddt: dedup log
Adds a log/journal to dedup. At the end of txg, instead of writing the
entry directly to the ZAP, instead its adding to an in-memory tree and
appended to an on-disk object. The on-disk object is only read at
import, to reload the in-memory tree.
Lookups first go the the log tree before going to the ZAP, so
recently-used entries will remain close by in memory. This vastly
reduces overhead from dedup IO, as it will not have to do so many
read/update/write cycles on ZAP leaf nodes.
A flushing facility is added at end of txg, to push logged entries out
to the ZAP. There's actually two separate "logs" (in-memory tree and
on-disk object), one active (recieving updated entries) and one flushing
(writing out to disk). These are swapped (ie flushing begins) based on
memory used by the in-memory log trees and time since we last flushed
something.
The flushing facility monitors the amount of entries coming in and being
flushed out, and calibrates itself to try to flush enough each txg to
keep up with the ingest rate without competing too much with other IO.
Multiple tuneables are provided to control the flushing facility.
All the histograms and stats are update to accomodate the log as a
separate entry store. zdb gains knowledge of how to count them and dump
them. Documentation included!
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Co-authored-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-22 10:46:22 +03:00
|
|
|
|
|
|
|
ddt_histogram_add(ddh, &ddt->ddt_log_histogram);
|
2023-05-16 06:30:26 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
|
|
|
|
{
|
|
|
|
ddt_histogram_t *ddh_total;
|
|
|
|
|
|
|
|
ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
|
|
|
|
ddt_get_dedup_histogram(spa, ddh_total);
|
ddt: cleanup the stats & histogram code
Both the API and the code were kinda mangled and I was really struggling
to follow it. The worst offender was the old ddt_stat_add(); after
fixing it up the rest of the changes are mostly knock-on effects and
targets of opportunity.
Note that the old ddt_stat_add() was safe against overflows - it could
produce crazy numbers, but the compiler wouldn't do anything stupid. The
assertions in ddt_stat_sub() go a lot of the way to protecting against
this; getting in a position where overflows are a problem is definitely
a programming error.
Also expanding ddt_stat_add() and ddt_histogram_empty() produces less
efficient assembly. I'm not bothered about this right now though; these
should not be hot functions, and if they are we'll optimise them later.
If we have to go back to the old form, we'll comment it like crazy.
Finally, I've removed the assertion that the bucket will never be
negative, as it will soon be possible to have entries with zero
refcounts: an entry for a block that is no longer on the pool, but is on
the log waiting to be synced out. It might be better to have a separate
bucket for these, since they're still using real space on disk, but
ultimately these stats are driving UI, and for now I've chosen to keep
them matching how they've looked in the past, as well as match the
operators mental model - pool usage is managed elsewhere.
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
2023-06-15 10:19:41 +03:00
|
|
|
ddt_histogram_total(dds_total, ddh_total);
|
2023-05-16 06:30:26 +03:00
|
|
|
kmem_free(ddh_total, sizeof (ddt_histogram_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
ddt_get_dedup_dspace(spa_t *spa)
|
|
|
|
{
|
|
|
|
ddt_stat_t dds_total;
|
|
|
|
|
|
|
|
if (spa->spa_dedup_dspace != ~0ULL)
|
|
|
|
return (spa->spa_dedup_dspace);
|
|
|
|
|
|
|
|
memset(&dds_total, 0, sizeof (ddt_stat_t));
|
|
|
|
|
|
|
|
/* Calculate and cache the stats */
|
|
|
|
ddt_get_dedup_stats(spa, &dds_total);
|
|
|
|
spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize;
|
|
|
|
return (spa->spa_dedup_dspace);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
ddt_get_pool_dedup_ratio(spa_t *spa)
|
|
|
|
{
|
|
|
|
ddt_stat_t dds_total = { 0 };
|
|
|
|
|
|
|
|
ddt_get_dedup_stats(spa, &dds_total);
|
|
|
|
if (dds_total.dds_dsize == 0)
|
|
|
|
return (100);
|
|
|
|
|
|
|
|
return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
|
|
|
|
}
|
2024-07-26 19:16:18 +03:00
|
|
|
|
|
|
|
int
|
|
|
|
ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize)
|
|
|
|
{
|
|
|
|
uint64_t l1sz, l1tot, l2sz, l2tot;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
l1tot = l2tot = 0;
|
|
|
|
*psize = 0;
|
|
|
|
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
|
|
|
|
ddt_t *ddt = spa->spa_ddt[c];
|
|
|
|
if (ddt == NULL)
|
|
|
|
continue;
|
|
|
|
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
|
|
|
|
for (ddt_class_t class = 0; class < DDT_CLASSES;
|
|
|
|
class++) {
|
|
|
|
err = dmu_object_cached_size(ddt->ddt_os,
|
|
|
|
ddt->ddt_object[type][class], &l1sz, &l2sz);
|
|
|
|
if (err != 0)
|
|
|
|
return (err);
|
|
|
|
l1tot += l1sz;
|
|
|
|
l2tot += l2sz;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*psize = l1tot + l2tot;
|
|
|
|
return (err);
|
|
|
|
}
|