Fast Clone Deletion

Deleting a clone requires finding the blocks that are clone-only, not
shared with the snapshot. Previously this was done by traversing the
entire block tree, which incurs a large performance penalty for
sparsely written clones.

This new method keeps track of clone blocks as they are modified in a
"Livelist" so that, when it's time to delete, the clone-specific
blocks are already at hand.

We see performance improvements because deletion work is now
proportional to the number of clone-modified blocks rather than the
size of the original dataset.
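
As a rough illustration of the idea only (this sketch is not the code added
by this commit; ll_entry_t, livelist_delete and the blk_id field are made-up
stand-ins for the real bpobj/bplist machinery), matching FREE records against
earlier ALLOC records leaves exactly the clone-only blocks to free:

/*
 * A clone appends an entry to its livelist for every clone-unique block
 * it allocates or frees.  At deletion time, each FREE cancels its matching
 * ALLOC and only the surviving ALLOCs still need to be freed, so the work
 * is bounded by the number of clone-modified blocks.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    uint64_t blk_id;    /* stand-in for a block pointer */
    bool freed;         /* true = FREE record, false = ALLOC record */
} ll_entry_t;

static void
livelist_delete(const ll_entry_t *ll, size_t n)
{
    bool *cancelled = calloc(n, sizeof (bool));
    if (cancelled == NULL)
        return;

    /*
     * Walk backwards; a FREE cancels the most recent matching ALLOC.
     * (A real implementation would match with a hash or AVL tree of
     * FREEs instead of this quadratic scan.)
     */
    for (size_t i = n; i-- > 0; ) {
        if (!ll[i].freed || cancelled[i])
            continue;
        for (size_t j = i; j-- > 0; ) {
            if (!ll[j].freed && !cancelled[j] &&
                ll[j].blk_id == ll[i].blk_id) {
                cancelled[i] = cancelled[j] = true;
                break;
            }
        }
    }

    /* Whatever is still an uncancelled ALLOC is clone-only: free it. */
    for (size_t i = 0; i < n; i++) {
        if (!ll[i].freed && !cancelled[i])
            printf("free block %llu\n",
                (unsigned long long)ll[i].blk_id);
    }
    free(cancelled);
}

int
main(void)
{
    /*
     * The clone allocates blocks 10, 11 and 12, then overwrites the data
     * in block 11: the old block is freed and new block 13 is born.
     */
    ll_entry_t ll[] = {
        { 10, false }, { 11, false }, { 12, false },
        { 11, true }, { 13, false },
    };
    livelist_delete(ll, sizeof (ll) / sizeof (ll[0]));
    return (0);
}

In the actual change these entries are stored in on-disk bpobjs split into
sub-livelists, so the matching and freeing can be done incrementally, one
sub-livelist per sync.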

Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Signed-off-by: Sara Hartse <sara.hartse@delphix.com>
Closes #8416
Author: Sara Hartse
Date: 2019-07-26 10:54:14 -07:00
Committed by: Brian Behlendorf
Parent: d274ac5460
Commit: 37f03da8ba
38 changed files with 2583 additions and 205 deletions


@ -115,7 +115,8 @@ uint64_t max_inflight = 1000;
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
@ -552,12 +553,16 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
(void) printf("\t\tcomp = %s\n", comp);
(void) printf("\t\tuncomp = %s\n", uncomp);
}
if (size >= sizeof (*bpop)) {
if (size >= BPOBJ_SIZE_V2) {
(void) printf("\t\tsubobjs = %llu\n",
(u_longlong_t)bpop->bpo_subobjs);
(void) printf("\t\tnum_subobjs = %llu\n",
(u_longlong_t)bpop->bpo_num_subobjs);
}
if (size >= sizeof (*bpop)) {
(void) printf("\t\tnum_freed = %llu\n",
(u_longlong_t)bpop->bpo_num_freed);
}
if (dump_opt['d'] < 5)
return;
@ -572,7 +577,8 @@ dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
(void) printf("got error %u from dmu_read\n", err);
break;
}
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
BP_GET_FREE(&bp));
(void) printf("\t%s\n", blkbuf);
}
}
@ -1508,7 +1514,8 @@ blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
}
static void
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
boolean_t bp_freed)
{
const dva_t *dva = bp->blk_dva;
int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
@ -1516,6 +1523,10 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
if (dump_opt['b'] >= 6) {
snprintf_blkptr(blkbuf, buflen, bp);
if (bp_freed) {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), " %s", "FREE");
}
return;
}
@ -1553,6 +1564,9 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
(u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
if (bp_freed)
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), " %s", "FREE");
}
}
@ -1580,7 +1594,7 @@ print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
}
}
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
(void) printf("%s\n", blkbuf);
}
@ -1815,12 +1829,12 @@ dump_bptree(objset_t *os, uint64_t obj, const char *name)
/* ARGSUSED */
static int
dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
char blkbuf[BP_SPRINTF_LEN];
ASSERT(bp->blk_birth != 0);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
(void) printf("\t%s\n", blkbuf);
return (0);
}
@ -1845,14 +1859,28 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
(void) printf(" %*s: object %llu, %llu local blkptrs, "
"%llu subobjs in object, %llu, %s (%s/%s comp)\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
bytes, comp, uncomp);
if (bpo->bpo_havefreed) {
(void) printf(" %*s: object %llu, %llu local "
"blkptrs, %llu freed, %llu subobjs in object %llu, "
"%s (%s/%s comp)\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
(u_longlong_t)bpo->bpo_phys->bpo_num_freed,
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
bytes, comp, uncomp);
} else {
(void) printf(" %*s: object %llu, %llu local "
"blkptrs, %llu subobjs in object %llu, "
"%s (%s/%s comp)\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
(u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
(u_longlong_t)bpo->bpo_phys->bpo_subobjs,
bytes, comp, uncomp);
}
for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
uint64_t subobj;
@ -1872,11 +1900,22 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
bpobj_close(&subbpo);
}
} else {
(void) printf(" %*s: object %llu, %llu blkptrs, %s\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
bytes);
if (bpo->bpo_havefreed) {
(void) printf(" %*s: object %llu, %llu blkptrs, "
"%llu freed, %s\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
(u_longlong_t)bpo->bpo_phys->bpo_num_freed,
bytes);
} else {
(void) printf(" %*s: object %llu, %llu blkptrs, "
"%s\n",
indent * 8, name,
(u_longlong_t)bpo->bpo_object,
(u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
bytes);
}
}
if (dump_opt['d'] < 5)
@ -2038,36 +2077,59 @@ bpobj_count_refd(bpobj_t *bpo)
}
}
static void
dump_deadlist(dsl_deadlist_t *dl)
static int
dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
{
spa_t *spa = arg;
uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
if (dle->dle_bpobj.bpo_object != empty_bpobj)
bpobj_count_refd(&dle->dle_bpobj);
return (0);
}
static int
dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
{
ASSERT(arg == NULL);
if (dump_opt['d'] >= 5) {
char buf[128];
(void) snprintf(buf, sizeof (buf),
"mintxg %llu -> obj %llu",
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
dump_full_bpobj(&dle->dle_bpobj, buf, 0);
} else {
(void) printf("mintxg %llu -> obj %llu\n",
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
}
return (0);
}
static void
dump_blkptr_list(dsl_deadlist_t *dl, char *name)
{
dsl_deadlist_entry_t *dle;
uint64_t unused;
char bytes[32];
char comp[32];
char uncomp[32];
uint64_t empty_bpobj =
dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj;
/* force the tree to be loaded */
dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
char entries[32];
spa_t *spa = dmu_objset_spa(dl->dl_os);
uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
if (dl->dl_oldfmt) {
if (dl->dl_bpobj.bpo_object != empty_bpobj)
bpobj_count_refd(&dl->dl_bpobj);
} else {
mos_obj_refd(dl->dl_object);
for (dle = avl_first(&dl->dl_tree); dle;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
if (dle->dle_bpobj.bpo_object != empty_bpobj)
bpobj_count_refd(&dle->dle_bpobj);
}
dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
}
/* make sure nicenum has enough space */
CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
if (dump_opt['d'] < 3)
return;
@ -2080,30 +2142,60 @@ dump_deadlist(dsl_deadlist_t *dl)
zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
(void) printf("\n Deadlist: %s (%s/%s comp)\n",
bytes, comp, uncomp);
zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
(void) printf("\n %s: %s (%s/%s comp), %s entries\n",
name, bytes, comp, uncomp, entries);
if (dump_opt['d'] < 4)
return;
(void) printf("\n");
for (dle = avl_first(&dl->dl_tree); dle;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
if (dump_opt['d'] >= 5) {
char buf[128];
(void) snprintf(buf, sizeof (buf),
"mintxg %llu -> obj %llu",
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
}
dump_full_bpobj(&dle->dle_bpobj, buf, 0);
} else {
(void) printf("mintxg %llu -> obj %llu\n",
(longlong_t)dle->dle_mintxg,
(longlong_t)dle->dle_bpobj.bpo_object);
}
static int
verify_dd_livelist(objset_t *os)
{
uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
dsl_pool_t *dp = spa_get_dsl(os->os_spa);
dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
ASSERT(!dmu_objset_is_snapshot(os));
if (!dsl_deadlist_is_open(&dd->dd_livelist))
return (0);
dsl_pool_config_enter(dp, FTAG);
dsl_deadlist_space(&dd->dd_livelist, &ll_used,
&ll_comp, &ll_uncomp);
dsl_dataset_t *origin_ds;
ASSERT(dsl_pool_config_held(dp));
VERIFY0(dsl_dataset_hold_obj(dp,
dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
&used, &comp, &uncomp));
dsl_dataset_rele(origin_ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
/*
* It's possible that the dataset's uncomp space is larger than the
* livelist's because livelists do not track embedded block pointers
*/
if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
char nice_used[32], nice_comp[32], nice_uncomp[32];
(void) printf("Discrepancy in space accounting:\n");
zdb_nicenum(used, nice_used, sizeof (nice_used));
zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
(void) printf("dir: used %s, comp %s, uncomp %s\n",
nice_used, nice_comp, nice_uncomp);
zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
(void) printf("livelist: used %s, comp %s, uncomp %s\n",
nice_used, nice_comp, nice_uncomp);
return (1);
}
return (0);
}
static avl_tree_t idx_tree;
@ -2643,7 +2735,7 @@ static const char *objset_types[DMU_OST_NUMTYPES] = {
"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
static void
dump_dir(objset_t *os)
dump_objset(objset_t *os)
{
dmu_objset_stats_t dds;
uint64_t object, object_count;
@ -2716,11 +2808,17 @@ dump_dir(objset_t *os)
if (dmu_objset_ds(os) != NULL) {
dsl_dataset_t *ds = dmu_objset_ds(os);
dump_deadlist(&ds->ds_deadlist);
dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
!dmu_objset_is_snapshot(os)) {
dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
if (verify_dd_livelist(os) != 0)
fatal("livelist is incorrect");
}
if (dsl_dataset_remap_deadlist_exists(ds)) {
(void) printf("ds_remap_deadlist:\n");
dump_deadlist(&ds->ds_remap_deadlist);
dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
}
count_ds_mos_objects(ds);
}
@ -3470,7 +3568,7 @@ static uint64_t remap_deadlist_count = 0;
/*ARGSUSED*/
static int
dump_one_dir(const char *dsname, void *arg)
dump_one_objset(const char *dsname, void *arg)
{
int error;
objset_t *os;
@ -3502,7 +3600,12 @@ dump_one_dir(const char *dsname, void *arg)
global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
}
dump_dir(os);
if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
!dmu_objset_is_snapshot(os)) {
global_feature_count[SPA_FEATURE_LIVELIST]++;
}
dump_objset(os);
close_objset(os, FTAG);
fuid_table_destroy();
return (0);
@ -3993,13 +4096,15 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
/* ARGSUSED */
static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
zdb_cb_t *zcb = arg;
spa_t *spa = zcb->zcb_spa;
vdev_t *vd;
const dva_t *dva = &bp->blk_dva[0];
ASSERT(!bp_freed);
ASSERT(!dump_opt['L']);
ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
@ -4617,6 +4722,101 @@ count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
return (0);
}
/*
* Iterate over livelists which have been destroyed by the user but
* are still present in the MOS, waiting to be freed
*/
typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
static void
iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
{
objset_t *mos = spa->spa_meta_objset;
uint64_t zap_obj;
int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
if (err == ENOENT)
return;
ASSERT0(err);
zap_cursor_t zc;
zap_attribute_t attr;
dsl_deadlist_t ll;
/* NULL out os prior to dsl_deadlist_open in case it's garbage */
ll.dl_os = NULL;
for (zap_cursor_init(&zc, mos, zap_obj);
zap_cursor_retrieve(&zc, &attr) == 0;
(void) zap_cursor_advance(&zc)) {
dsl_deadlist_open(&ll, mos, attr.za_first_integer);
func(&ll, arg);
dsl_deadlist_close(&ll);
}
zap_cursor_fini(&zc);
}
static int
bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
ASSERT(!bp_freed);
return (count_block_cb(arg, bp, tx));
}
static int
livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
{
zdb_cb_t *zbc = args;
bplist_t blks;
bplist_create(&blks);
/* determine which blocks have been alloc'd but not freed */
VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
/* count those blocks */
(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
bplist_destroy(&blks);
return (0);
}
static void
livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
{
dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
}
/*
* Count the blocks in the livelists that have been destroyed by the user
* but haven't yet been freed.
*/
static void
deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
{
iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
}
static void
dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
{
ASSERT3P(arg, ==, NULL);
global_feature_count[SPA_FEATURE_LIVELIST]++;
dump_blkptr_list(ll, "Deleted Livelist");
}
/*
* Print out, register object references to, and increment feature counts for
* livelists that have been destroyed by the user but haven't yet been freed.
*/
static void
deleted_livelists_dump_mos(spa_t *spa)
{
uint64_t zap_obj;
objset_t *mos = spa->spa_meta_objset;
int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
if (err == ENOENT)
return;
mos_obj_refd(zap_obj);
iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}
static int
dump_block_stats(spa_t *spa)
{
@ -4656,11 +4856,11 @@ dump_block_stats(spa_t *spa)
* If there's a deferred-free bplist, process that first.
*/
(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
count_block_cb, &zcb, NULL);
bpobj_count_block_cb, &zcb, NULL);
if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
count_block_cb, &zcb, NULL);
bpobj_count_block_cb, &zcb, NULL);
}
zdb_claim_removing(spa, &zcb);
@ -4671,6 +4871,8 @@ dump_block_stats(spa_t *spa)
&zcb, NULL));
}
deleted_livelists_count_blocks(spa, &zcb);
if (dump_opt['c'] > 1)
flags |= TRAVERSE_PREFETCH_DATA;
@ -5706,6 +5908,7 @@ dump_mos_leaks(spa_t *spa)
mos_obj_refd(vim->vim_phys->vimp_counts_object);
vdev_indirect_mapping_close(vim);
}
deleted_livelists_dump_mos(spa);
if (dp->dp_origin_snap != NULL) {
dsl_dataset_t *ds;
@ -5715,12 +5918,12 @@ dump_mos_leaks(spa_t *spa)
dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
FTAG, &ds));
count_ds_mos_objects(ds);
dump_deadlist(&ds->ds_deadlist);
dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
count_ds_mos_objects(dp->dp_origin_snap);
dump_deadlist(&dp->dp_origin_snap->ds_deadlist);
dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
}
count_dir_mos_objects(dp->dp_mos_dir);
if (dp->dp_free_dir != NULL)
@ -5885,7 +6088,7 @@ dump_zpool(spa_t *spa)
if (dump_opt['d'] || dump_opt['i']) {
spa_feature_t f;
mos_refd_objs = range_tree_create(NULL, NULL);
dump_dir(dp->dp_meta_objset);
dump_objset(dp->dp_meta_objset);
if (dump_opt['d'] >= 3) {
dsl_pool_t *dp = spa->spa_dsl_pool;
@ -5915,8 +6118,9 @@ dump_zpool(spa_t *spa)
global_feature_count[f] = UINT64_MAX;
global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
global_feature_count[SPA_FEATURE_LIVELIST] = 0;
(void) dmu_objset_find(spa_name(spa), dump_one_dir,
(void) dmu_objset_find(spa_name(spa), dump_one_objset,
NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
if (rc == 0 && !dump_opt['L'])
@ -6777,9 +6981,9 @@ main(int argc, char **argv)
}
}
if (os != NULL) {
dump_dir(os);
dump_objset(os);
} else if (zopt_objects > 0 && !dump_opt['m']) {
dump_dir(spa->spa_meta_objset);
dump_objset(spa->spa_meta_objset);
} else {
dump_zpool(spa);
}


@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPLIST_H
@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl);
void bplist_append(bplist_t *bpl, const blkptr_t *bp);
void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
void *arg, dmu_tx_t *tx);
void bplist_clear(bplist_t *bpl);
#ifdef __cplusplus
}


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2015, 2019 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPOBJ_H
@ -31,6 +31,7 @@
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/bplist.h>
#ifdef __cplusplus
extern "C" {
@ -48,10 +49,12 @@ typedef struct bpobj_phys {
uint64_t bpo_uncomp;
uint64_t bpo_subobjs;
uint64_t bpo_num_subobjs;
uint64_t bpo_num_freed;
} bpobj_phys_t;
#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))
typedef struct bpobj {
kmutex_t bpo_lock;
@ -60,12 +63,14 @@ typedef struct bpobj {
int bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
uint8_t bpo_havefreed;
bpobj_phys_t *bpo_phys;
dmu_buf_t *bpo_dbuf;
dmu_buf_t *bpo_cached_dbuf;
} bpobj_t;
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);
uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
@ -77,10 +82,13 @@ void bpobj_close(bpobj_t *bpo);
boolean_t bpobj_is_open(const bpobj_t *bpo);
int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *);
int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
void *arg, int64_t start);
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);
int bpobj_space(bpobj_t *bpo,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
@ -88,6 +96,9 @@ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t bpobj_is_empty(bpobj_t *bpo);
int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx);
#ifdef __cplusplus
}
#endif


@ -383,6 +383,7 @@ typedef struct dmu_buf {
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap"
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"
/*
* Allocate an object from this objset. The range of object numbers
@ -1003,6 +1004,7 @@ extern uint64_t dmu_objset_id(objset_t *os);
extern uint64_t dmu_objset_dnodesize(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_objset_blksize(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);


@ -126,7 +126,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;
uint64_t os_recordsize;
/*
* The next four values are used as a cache of whatever's on disk, and
* are initialized the first time these properties are queried. Before


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
* Copyright (c) 2018, 2019 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DEADLIST_H
@ -28,12 +28,14 @@
#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zthr.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dmu_buf;
struct dsl_pool;
struct dsl_dataset;
typedef struct dsl_deadlist_phys {
@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry {
bpobj_t dle_bpobj;
} dsl_deadlist_entry_t;
typedef struct livelist_condense_entry {
struct dsl_dataset *ds;
dsl_deadlist_entry_t *first;
dsl_deadlist_entry_t *next;
boolean_t syncing;
boolean_t cancelled;
} livelist_condense_entry_t;
extern unsigned long zfs_livelist_max_entries;
extern int zfs_livelist_min_percent_shared;
typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle);
void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
void dsl_deadlist_close(dsl_deadlist_t *dl);
void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg);
uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp,
boolean_t free, dmu_tx_t *tx);
int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg,
dmu_tx_t *tx);
dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl);
dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl);
uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
uint64_t mrs_obj, dmu_tx_t *tx);
void dsl_deadlist_space(dsl_deadlist_t *dl,
@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx);
boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
zthr_t *t, uint64_t *size);
void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
dmu_tx_t *tx);
#ifdef __cplusplus
}


@ -33,6 +33,7 @@ extern "C" {
struct nvlist;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@ -29,18 +29,20 @@
#define _SYS_DSL_DIR_H
#include <sys/dmu.h>
#include <sys/dsl_deadlist.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/refcount.h>
#include <sys/zfs_context.h>
#include <sys/dsl_crypt.h>
#include <sys/bplist.h>
#ifdef __cplusplus
extern "C" {
#endif
struct dsl_dataset;
struct zthr;
/*
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
* They should be of the format <reverse-dns>:<field>.
@ -49,6 +51,7 @@ struct dsl_dataset;
#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj"
#define DD_FIELD_LIVELIST "com.delphix:livelist"
typedef enum dd_used {
DD_USED_HEAD,
@ -114,6 +117,10 @@ struct dsl_dir {
/* amount of space we expect to write; == amount of dirty data */
int64_t dd_space_towrite[TXG_SIZE];
dsl_deadlist_t dd_livelist;
bplist_t dd_pending_frees;
bplist_t dd_pending_allocs;
/* protected by dd_lock; keep at end of struct for better locality */
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
@ -182,6 +189,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj);
void dsl_dir_livelist_close(dsl_dir_t *dd);
void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
@ -54,6 +54,7 @@ struct dsl_pool;
struct dmu_tx;
struct dsl_scan;
struct dsl_crypto_params;
struct dsl_deadlist;
extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max;


@ -63,6 +63,8 @@ typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
typedef struct zbookmark_phys zbookmark_phys_t;
struct bpobj;
struct bplist;
struct dsl_pool;
struct dsl_dataset;
struct dsl_crypto_params;
@ -532,6 +534,9 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
@ -654,6 +659,7 @@ _NOTE(CONSTCOND) } while (0)
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/
#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
@ -804,6 +810,8 @@ extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
@ -1131,6 +1139,7 @@ extern uint64_t spa_total_metaslabs(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);


@ -49,6 +49,7 @@
#include <sys/dsl_crypt.h>
#include <sys/zfeature.h>
#include <sys/zthr.h>
#include <sys/dsl_deadlist.h>
#include <zfeature_common.h>
#ifdef __cplusplus
@ -317,6 +318,11 @@ struct spa {
list_t spa_log_summary;
uint64_t spa_log_flushall_txg;
zthr_t *spa_livelist_delete_zthr; /* deleting livelists */
zthr_t *spa_livelist_condense_zthr; /* condensing livelists */
uint64_t spa_livelists_to_delete; /* set of livelists to free */
livelist_condense_entry_t spa_to_condense; /* next to condense */
char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */


@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t);
extern void zthr_wakeup(zthr_t *t);
extern void zthr_cancel(zthr_t *t);
extern void zthr_resume(zthr_t *t);
extern void zthr_wait_cycle_done(zthr_t *t);
extern boolean_t zthr_iscancelled(zthr_t *t);
extern boolean_t zthr_has_waiters(zthr_t *t);
#endif /* _SYS_ZTHR_H */


@ -71,6 +71,7 @@ typedef enum spa_feature {
SPA_FEATURE_REDACTED_DATASETS,
SPA_FEATURE_BOOKMARK_WRITTEN,
SPA_FEATURE_LOG_SPACEMAP,
SPA_FEATURE_LIVELIST,
SPA_FEATURES
} spa_feature_t;


@ -1909,6 +1909,98 @@ Pattern written to vdev free space by \fBzpool initialize\fR.
Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee).
.RE
.sp
.ne 2
.na
\fBzfs_livelist_max_entries\fR (ulong)
.ad
.RS 12n
The threshold size (in block pointers) at which we create a new sub-livelist.
Larger sublists are more costly from a memory perspective, but the fewer
sublists there are, the lower the cost of insertion.
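For example, a clone that has accumulated roughly 1.2 million clone-unique
block pointers would, at the default of 500,000, spread them across about
three sub-livelists, which can then be processed one at a time when the
clone is deleted.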
.sp
Default value: \fB500,000\fR.
.RE
.sp
.ne 2
.na
\fBzfs_livelist_min_percent_shared\fR (int)
.ad
.RS 12n
If the amount of shared space between a snapshot and its clone drops below
this threshold, the clone turns off the livelist and reverts to the old
deletion method. This is in place because once a clone has been overwritten
enough, livelists no longer give us a benefit.
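For example, with the default of 75, a clone that references 10G of data of
which 3G is unique to the clone is only 70% shared with its origin
((10G - 3G) / 10G), so its livelist would be discarded and the traditional
deletion method used from then on.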
.sp
Default value: \fB75\fR.
.RE
.sp
.ne 2
.na
\fBzfs_livelist_condense_new_alloc\fR (int)
.ad
.RS 12n
Incremented each time an extra ALLOC blkptr is added to a livelist entry while
it is being condensed.
This option is used by the test suite to track race conditions.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na
\fBzfs_livelist_condense_sync_cancel\fR (int)
.ad
.RS 12n
Incremented each time livelist condensing is canceled while in
spa_livelist_condense_sync.
This option is used by the test suite to track race conditions.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na
\fBzfs_livelist_condense_sync_pause\fR (int)
.ad
.RS 12n
When set, the livelist condense process pauses indefinitely before
executing the synctask - spa_livelist_condense_sync.
This option is used by the test suite to trigger race conditions.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na
\fBzfs_livelist_condense_zthr_cancel\fR (int)
.ad
.RS 12n
Incremented each time livelist condensing is canceled while in
spa_livelist_condense_cb.
This option is used by the test suite to track race conditions.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na
\fBzfs_livelist_condense_zthr_pause\fR (int)
.ad
.RS 12n
When set, the livelist condense process pauses indefinitely before
executing the open context condensing work in spa_livelist_condense_cb.
This option is used by the test suite to trigger race conditions.
.sp
Default value: \fB0\fR.
.RE
.sp
.ne 2
.na


@ -547,6 +547,26 @@ allow more data to be stored in the bonus buffer, thus potentially
improving performance by avoiding the use of spill blocks.
.RE
.sp
.ne 2
.na
\fB\fBlivelist\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:livelist
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE
This feature allows clones to be deleted faster than the traditional method
when a large number of random/sparse writes have been made to the clone.
All blocks allocated and freed after a clone is created are tracked by
the clone's livelist, which is referenced during the deletion of the clone.
The feature is activated when a clone is created and remains active until all
clones have been destroyed.
.RE
.sp
.ne 2
.na
@ -882,7 +902,6 @@ This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand
is used to checkpoint the pool.
The feature will only return back to being \fBenabled\fR when the pool
is rewound or the checkpoint has been discarded.
.RE
.SH "SEE ALSO"
zpool(8)


@ -348,6 +348,18 @@ zpool_feature_init(void)
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
ZFEATURE_TYPE_BOOLEAN, NULL);
{
static const spa_feature_t livelist_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_LIVELIST,
"com.delphix:livelist", "livelist",
"Improved clone deletion performance.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
livelist_deps);
}
{
static const spa_feature_t log_spacemap_deps[] = {
SPA_FEATURE_SPACEMAP_V2,


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/bplist.h>
@ -75,3 +75,17 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
}
mutex_exit(&bpl->bpl_lock);
}
void
bplist_clear(bplist_t *bpl)
{
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
while ((bpe = list_head(&bpl->bpl_list))) {
bplist_iterate_last_removed = bpe;
list_remove(&bpl->bpl_list, bpe);
kmem_free(bpe, sizeof (*bpe));
}
mutex_exit(&bpl->bpl_lock);
}


@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2017 Datto Inc.
*/
@ -83,6 +83,9 @@ bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
size = BPOBJ_SIZE_V0;
else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
size = BPOBJ_SIZE_V1;
else if (!spa_feature_is_active(dmu_objset_spa(os),
SPA_FEATURE_LIVELIST))
size = BPOBJ_SIZE_V2;
else
size = sizeof (bpobj_phys_t);
@ -171,6 +174,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
bpo->bpo_phys = bpo->bpo_dbuf->db_data;
return (0);
}
@ -245,8 +249,8 @@ bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
* Update bpobj and all of its parents with new space accounting.
*/
static void
propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{
for (; bpi != NULL; bpi = bpi->bpi_parent) {
@ -263,22 +267,22 @@ propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
dmu_tx_t *tx, boolean_t free)
int64_t start, dmu_tx_t *tx, boolean_t free)
{
int err = 0;
uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
if (dbuf == NULL || dbuf->db_offset > offset) {
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
FTAG, &dbuf, 0);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
offset, FTAG, &dbuf, 0);
if (err)
break;
}
@ -288,18 +292,26 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
blkptr_t *bparray = dbuf->db_data;
blkptr_t *bp = &bparray[blkoff];
err = func(arg, bp, tx);
boolean_t bp_freed = BP_GET_FREE(bp);
err = func(arg, bp, bp_freed, tx);
if (err)
break;
if (free) {
int sign = bp_freed ? -1 : +1;
spa_t *spa = dmu_objset_spa(bpo->bpo_os);
freed += bp_get_dsize_sync(spa, bp);
comp_freed += BP_GET_PSIZE(bp);
uncomp_freed += BP_GET_UCSIZE(bp);
freed += sign * bp_get_dsize_sync(spa, bp);
comp_freed += sign * BP_GET_PSIZE(bp);
uncomp_freed += sign * BP_GET_UCSIZE(bp);
ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
bpo->bpo_phys->bpo_num_blkptrs--;
ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
if (bp_freed) {
ASSERT(bpo->bpo_havefreed);
bpo->bpo_phys->bpo_num_freed--;
ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
}
}
}
if (free) {
@ -328,7 +340,7 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
*/
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
dmu_tx_t *tx, boolean_t free)
dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
list_t stack;
bpobj_info_t *bpi;
@ -341,6 +353,10 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
list_create(&stack, sizeof (bpobj_info_t),
offsetof(bpobj_info_t, bpi_node));
mutex_enter(&initial_bpo->bpo_lock);
if (bpobj_size != NULL)
*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
while ((bpi = list_head(&stack)) != NULL) {
@ -354,7 +370,8 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpi->bpi_visited == B_FALSE) {
err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
free);
bpi->bpi_visited = B_TRUE;
if (err != 0)
break;
@ -433,6 +450,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
* We have unprocessed subobjs. Process the next one.
*/
ASSERT(bpo->bpo_havecomp);
ASSERT3P(bpobj_size, ==, NULL);
/* Add the last subobj to stack. */
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
@ -489,16 +507,45 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}
/*
* Iterate the entries. If func returns nonzero, iteration will stop.
*
* If there are no subobjs:
*
* *bpobj_size can be used to return the number of block pointers in the
* bpobj. Note that this may be different from the number of block pointers
* that are iterated over, if iteration is terminated early (e.g. by the func
* returning nonzero).
*
* If there are concurrent (or subsequent) modifications to the bpobj then the
* returned *bpobj_size can be passed as "start" to
* livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
*/
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
uint64_t *bpobj_size)
{
return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}
/*
* Iterate over the blkptrs in the bpobj beginning at index start. If func
* returns nonzero, iteration will stop. This is a livelist specific function
* since it assumes that there are no subobjs present.
*/
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
int64_t start)
{
if (bpo->bpo_havesubobj)
VERIFY0(bpo->bpo_phys->bpo_subobjs);
bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
kmem_free(bpi, sizeof (bpobj_info_t));
return (err);
}
/*
@ -724,7 +771,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
}
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
blkptr_t stored_bp = *bp;
uint64_t offset;
@ -755,8 +803,8 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
}
/* We never need the fill count. */
stored_bp.blk_fill = 0;
BP_SET_FREE(&stored_bp, bp_freed);
mutex_enter(&bpo->bpo_lock);
@ -779,11 +827,16 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
bpo->bpo_phys->bpo_num_blkptrs++;
bpo->bpo_phys->bpo_bytes +=
int sign = bp_freed ? -1 : +1;
bpo->bpo_phys->bpo_bytes += sign *
bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
if (bpo->bpo_havecomp) {
bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
}
if (bp_freed) {
ASSERT(bpo->bpo_havefreed);
bpo->bpo_phys->bpo_num_freed++;
}
mutex_exit(&bpo->bpo_lock);
}
@ -799,7 +852,7 @@ struct space_range_arg {
/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
struct space_range_arg *sra = arg;
@ -863,3 +916,18 @@ bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
*uncompp = sra.uncomp;
return (err);
}
/*
* A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
* bpobj are designated as free or allocated that information is not preserved
* in bplists.
*/
/* ARGSUSED */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
bplist_t *bpl = arg;
bplist_append(bpl, bp);
return (0);
}


@ -3286,6 +3286,13 @@ dbuf_hold_impl_arg(struct dbuf_hold_arg *dh)
*(dh->dh_dbp) = NULL;
/* If the pool has been created, verify the tx_sync_lock is not held */
spa_t *spa = dh->dh_dn->dn_objset->os_spa;
dsl_pool_t *dp = spa->spa_dsl_pool;
if (dp != NULL) {
ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
}
/* dbuf_find() returns with db_mtx held */
dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object,
dh->dh_level, dh->dh_blkid);
@ -4479,6 +4486,29 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
/*
* If the blkptr being remapped is tracked by a livelist,
* then we need to make sure the livelist reflects the update.
* First, cancel out the old blkptr by appending a 'FREE'
* entry. Next, add an 'ALLOC' to track the new version. This
* way we avoid trying to free an inaccurate blkptr at delete.
* Note that embedded blkptrs are not tracked in livelists.
*/
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
bp->blk_birth > ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_LIVELIST));
bplist_append(&ds->ds_dir->dd_pending_frees,
bp);
bplist_append(&ds->ds_dir->dd_pending_allocs,
&bp_copy);
}
}
/*
* The db_rwlock prevents dbuf_read_impl() from
* dereferencing the BP while we are changing it. To


@ -122,13 +122,12 @@ parent_delta(dsl_dataset_t *ds, int64_t delta)
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
int used, compressed, uncompressed;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
int used = bp_get_dsize_sync(spa, bp);
int compressed = BP_GET_PSIZE(bp);
int uncompressed = BP_GET_UCSIZE(bp);
int64_t delta;
used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
compressed = BP_GET_PSIZE(bp);
uncompressed = BP_GET_UCSIZE(bp);
dprintf_bp(bp, "ds=%p", ds);
ASSERT(dmu_tx_is_syncing(tx));
@ -164,6 +163,19 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
ds->ds_feature_activation[f] = (void *)B_TRUE;
}
/*
* Track block for livelist, but ignore embedded blocks because
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
bp->blk_birth > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_LIVELIST));
bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
}
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
@ -207,8 +219,8 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
DVA_SET_VDEV(dva, vdev);
DVA_SET_OFFSET(dva, offset);
DVA_SET_ASIZE(dva, size);
dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, tx);
dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
tx);
}
}
@ -239,6 +251,19 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ASSERT(!ds->ds_is_snapshot);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
/*
* Track block for livelist, but ignore embedded blocks because
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
bp->blk_birth > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
SPA_FEATURE_LIVELIST));
bplist_append(&ds->ds_dir->dd_pending_frees, bp);
}
if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
@ -267,7 +292,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
*/
bplist_append(&ds->ds_pending_deadlist, bp);
} else {
dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
}
ASSERT3U(ds->ds_prev->ds_object, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj);
@ -1241,6 +1266,14 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(lastname[0] != '@');
/*
* Filesystems will eventually have their origin set to dp_origin_snap,
* but that's taken care of in dsl_dataset_create_sync_dd. When
* creating a filesystem, this function is called with origin equal to
* NULL.
*/
if (origin != NULL)
ASSERT3P(origin, !=, dp->dp_origin_snap);
ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
@ -1250,6 +1283,20 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
dsl_deleg_set_create_perms(dd, tx, cr);
/*
* If we are creating a clone and the livelist feature is enabled,
* add the entry DD_FIELD_LIVELIST to ZAP.
*/
if (origin != NULL &&
spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
objset_t *mos = dd->dd_pool->dp_meta_objset;
dsl_dir_zapify(dd, tx);
uint64_t obj = dsl_deadlist_alloc(mos, tx);
VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
sizeof (uint64_t), 1, &obj, tx));
spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
}
/*
* Since we're creating a new node we know it's a leaf, so we can
* initialize the counts if the limit feature is active.
@ -2036,12 +2083,149 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
}
}
static int
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
/*
* Check if the percentage of blocks shared between the clone and the
* snapshot (as opposed to those that are clone only) is below a certain
* threshold
*/
boolean_t
dsl_livelist_should_disable(dsl_dataset_t *ds)
{
dsl_deadlist_t *dl = arg;
dsl_deadlist_insert(dl, bp, tx);
return (0);
uint64_t used, referenced;
int percent_shared;
used = dsl_dir_get_usedds(ds->ds_dir);
referenced = dsl_get_referenced(ds);
ASSERT3U(referenced, >=, 0);
ASSERT3U(used, >=, 0);
if (referenced == 0)
return (B_FALSE);
percent_shared = (100 * (referenced - used)) / referenced;
if (percent_shared <= zfs_livelist_min_percent_shared)
return (B_TRUE);
return (B_FALSE);
}
/*
* Check if it is possible to combine two livelist entries into one.
* This is the case if the combined number of 'live' blkptrs (ALLOCs that
* don't have a matching FREE) is under the maximum sublist size.
* We check this by subtracting twice the total number of frees from the total
* number of blkptrs. FREEs are counted twice because each FREE blkptr
* will cancel out an ALLOC blkptr when the livelist is processed.
*/
static boolean_t
dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
dsl_deadlist_entry_t *next)
{
uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
next->dle_bpobj.bpo_phys->bpo_num_freed;
uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
return (B_TRUE);
return (B_FALSE);
}
typedef struct try_condense_arg {
spa_t *spa;
dsl_dataset_t *ds;
} try_condense_arg_t;
/*
* Iterate over the livelist entries, searching for a pair to condense.
* A nonzero return value means stop, 0 means keep looking.
*/
static int
dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
{
try_condense_arg_t *tca = arg;
spa_t *spa = tca->spa;
dsl_dataset_t *ds = tca->ds;
dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
dsl_deadlist_entry_t *next;
/* The condense thread has not yet been created at import */
if (spa->spa_livelist_condense_zthr == NULL)
return (1);
/* A condense is already in progress */
if (spa->spa_to_condense.ds != NULL)
return (1);
next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
/* The livelist has only one entry - don't condense it */
if (next == NULL)
return (1);
/* Next is the newest entry - don't condense it */
if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
return (1);
/* This pair is not ready to condense but keep looking */
if (!dsl_livelist_should_condense(first, next))
return (0);
/*
* Add a ref to prevent the dataset from being evicted while
* the condense zthr or synctask are running. Ref will be
* released at the end of the condense synctask
*/
dmu_buf_add_ref(ds->ds_dbuf, spa);
spa->spa_to_condense.ds = ds;
spa->spa_to_condense.first = first;
spa->spa_to_condense.next = next;
spa->spa_to_condense.syncing = B_FALSE;
spa->spa_to_condense.cancelled = B_FALSE;
zthr_wakeup(spa->spa_livelist_condense_zthr);
return (1);
}
static void
dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
{
dsl_dir_t *dd = ds->ds_dir;
spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
/* Check if we need to add a new sub-livelist */
if (last == NULL) {
/* The livelist is empty */
dsl_deadlist_add_key(&dd->dd_livelist,
tx->tx_txg - 1, tx);
} else if (spa_sync_pass(spa) == 1) {
/*
* Check if the newest entry is full. If it is, make a new one.
* We only do this once per sync because we could overfill a
* sublist in one sync pass and don't want to add another entry
* for a txg that is already represented. This ensures that
* blkptrs born in the same txg are stored in the same sublist.
*/
bpobj_t bpobj = last->dle_bpobj;
uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
uint64_t free = bpobj.bpo_phys->bpo_num_freed;
uint64_t alloc = all - free;
if (alloc > zfs_livelist_max_entries) {
dsl_deadlist_add_key(&dd->dd_livelist,
tx->tx_txg - 1, tx);
}
}
/* Insert each entry into the on-disk livelist */
bplist_iterate(&dd->dd_pending_allocs,
dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
bplist_iterate(&dd->dd_pending_frees,
dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
/* Attempt to condense every pair of adjacent entries */
try_condense_arg_t arg = {
.spa = spa,
.ds = ds
};
dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
&arg);
}
void
@ -2050,7 +2234,14 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
objset_t *os = ds->ds_objset;
bplist_iterate(&ds->ds_pending_deadlist,
deadlist_enqueue_cb, &ds->ds_deadlist, tx);
dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
dsl_flush_pending_livelist(ds, tx);
if (dsl_livelist_should_disable(ds)) {
dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
}
}
dsl_bookmark_sync_done(ds, tx);
@ -3335,6 +3526,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
uint64_t oldnext_obj;
int64_t delta;
ASSERT(nvlist_empty(ddpa->err_ds));
VERIFY0(promote_hold(ddpa, dp, FTAG));
hds = ddpa->ddpa_clone;
@ -3519,6 +3712,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
/*
* Since livelists are specific to a clone's origin txg, they
* are no longer accurate. Destroy the livelist from the clone being
* promoted. If the origin dataset is a clone, destroy its livelist
* as well.
*/
dsl_dir_remove_livelist(dd, tx, B_TRUE);
dsl_dir_remove_livelist(origin_ds->ds_dir, tx, B_TRUE);
/* log history record */
spa_history_log_internal_ds(hds, "promote", tx, "");
@ -3990,6 +4192,14 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_scan_ds_clone_swapped(origin_head, clone, tx);
/*
* Destroy any livelists associated with the clone or the origin,
* since after the swap the corresponding livelists are no longer
* valid.
*/
dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
spa_history_log_internal_ds(clone, "clone swap", tx,
"parent=%s", origin_head->ds_dir->dd_myname);
}


@ -20,16 +20,16 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#include <sys/dsl_dataset.h>
#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
/*
* Deadlist concurrency:
@ -51,6 +51,68 @@
* provides its own locking, and dl_oldfmt is immutable.
*/
/*
* Livelist Overview
* ================
*
* Livelists use the same 'deadlist_t' struct as deadlists and are also used
* to track blkptrs over the lifetime of a dataset. Livelists however, belong
* to clones and track the blkptrs that are clone-specific (were born after
* the clone's creation). The exception is embedded block pointers which are
* not included in livelists because they do not need to be freed.
*
* When it comes time to delete the clone, the livelist provides a quick
* reference as to what needs to be freed. For this reason, livelists also track
* when clone-specific blkptrs are freed before deletion to prevent double
* frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
* deletion algorithm iterates backwards over the livelist, matching
* FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
* are also updated in the case when blkptrs are remapped: the old version
* of the blkptr is cancelled out with a FREE and the new version is tracked
* with an ALLOC.
*
* To bound the amount of memory required for deletion, livelists over a
* certain size are spread over multiple entries. Entries are grouped by
* birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
* be in the same entry. This allows us to delete livelists incrementally
* over multiple syncs, one entry at a time.
*
* During the lifetime of the clone, livelists can get extremely large.
* Their size is managed by periodic condensing (preemptively cancelling out
* FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
* the shared space between the clone and its origin is so small that it
* doesn't make sense to use livelists anymore.
*/
/*
* The threshold sublist size at which we create a new sub-livelist for the
* next txg. However, since blkptrs of the same transaction group must be in
* the same sub-list, the actual sublist size may exceed this. When picking the
* size we had to balance the fact that larger sublists mean fewer sublists
* (decreasing the cost of insertion) against the consideration that sublists
* will be loaded into memory and shouldn't take up an inordinate amount of
* space. We settled on ~500000 entries, corresponding to roughly 128M.
*/
unsigned long zfs_livelist_max_entries = 500000;
/*
* We can approximate how much of a performance gain a livelist will give us
* based on the percentage of blocks shared between the clone and its origin.
* 0 percent shared means that the clone has completely diverged and that the
* old method is maximally effective: every read from the block tree will
* result in lots of frees. Livelists give us gains when they track blocks
* scattered across the tree, when one read in the old method might only
* result in a few frees. Once the clone has been overwritten enough,
* writes are no longer sparse and we'll no longer get much of a benefit from
* tracking them with a livelist. We chose a lower limit of 75 percent shared
* (25 percent overwritten). This means that 1/4 of all block pointers will be
* freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
* to make deletion 4x faster. Once the amount of shared space drops below this
* threshold, the clone will revert to the old deletion method.
*/
int zfs_livelist_min_percent_shared = 75;
static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
{
@ -88,6 +150,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
dl->dl_havetree = B_TRUE;
}
void
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
{
dsl_deadlist_entry_t *dle;
ASSERT(dsl_deadlist_is_open(dl));
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
mutex_exit(&dl->dl_lock);
for (dle = avl_first(&dl->dl_tree); dle != NULL;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
if (func(args, dle) != 0)
break;
}
}
void
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
{
@ -188,7 +267,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
const blkptr_t *bp, dmu_tx_t *tx)
const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
ASSERT(MUTEX_HELD(&dl->dl_lock));
if (dle->dle_bpobj.bpo_object ==
@ -200,7 +279,7 @@ dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
dle->dle_mintxg, obj, tx));
}
bpobj_enqueue(&dle->dle_bpobj, bp, tx);
bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
}
static void
@ -221,14 +300,15 @@ dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
}
void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
avl_index_t where;
if (dl->dl_oldfmt) {
bpobj_enqueue(&dl->dl_bpobj, bp, tx);
bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
return;
}
@ -236,10 +316,12 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
dsl_deadlist_load_tree(dl);
dmu_buf_will_dirty(dl->dl_dbuf, tx);
int sign = bp_freed ? -1 : +1;
dl->dl_phys->dl_used +=
bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
dle_tofind.dle_mintxg = bp->blk_birth;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
@ -255,10 +337,26 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
}
ASSERT3P(dle, !=, NULL);
dle_enqueue(dl, dle, bp, tx);
dle_enqueue(dl, dle, bp, bp_freed, tx);
mutex_exit(&dl->dl_lock);
}
int
dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
dsl_deadlist_t *dl = arg;
dsl_deadlist_insert(dl, bp, B_FALSE, tx);
return (0);
}
int
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
dsl_deadlist_t *dl = arg;
dsl_deadlist_insert(dl, bp, B_TRUE, tx);
return (0);
}
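/*
 * These wrappers exist so livelist contents can be replayed through
 * bplist_iterate(), which expects the older callback signature without the
 * bp_freed argument.  For example, re-inserting a bplist of surviving ALLOCs
 * and a bplist of new FREEs into a livelist 'll' looks roughly like:
 *
 *	bplist_iterate(&to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
 *	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
 */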
/*
* Insert new key in deadlist, which must be > all current entries.
* mintxg is not inclusive.
@ -316,6 +414,108 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
mutex_exit(&dl->dl_lock);
}
/*
* Remove a deadlist entry and all of its contents by removing the entry from
* the deadlist's avl tree, freeing the entry's bpobj and adjusting the
* deadlist's space accounting accordingly.
*/
void
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
uint64_t used, comp, uncomp;
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_entry_t *dle;
objset_t *os = dl->dl_os;
if (dl->dl_oldfmt)
return;
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dle_tofind.dle_mintxg = mintxg;
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
VERIFY3P(dle, !=, NULL);
avl_remove(&dl->dl_tree, dle);
VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
dl->dl_phys->dl_used -= used;
dl->dl_phys->dl_comp -= comp;
dl->dl_phys->dl_uncomp -= uncomp;
if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
bpobj_decr_empty(os, tx);
} else {
bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
}
bpobj_close(&dle->dle_bpobj);
kmem_free(dle, sizeof (*dle));
mutex_exit(&dl->dl_lock);
}
/*
* Clear out the contents of a deadlist_entry by freeing its bpobj,
* replacing it with an empty bpobj and adjusting the deadlist's
* space accounting
*/
void
dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
dmu_tx_t *tx)
{
uint64_t new_obj, used, comp, uncomp;
objset_t *os = dl->dl_os;
mutex_enter(&dl->dl_lock);
VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
dl->dl_phys->dl_used -= used;
dl->dl_phys->dl_comp -= comp;
dl->dl_phys->dl_uncomp -= uncomp;
if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
bpobj_decr_empty(os, tx);
else
bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
bpobj_close(&dle->dle_bpobj);
new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
new_obj, tx));
ASSERT(bpobj_is_empty(&dle->dle_bpobj));
mutex_exit(&dl->dl_lock);
}
/*
* Return the first entry in deadlist's avl tree
*/
dsl_deadlist_entry_t *
dsl_deadlist_first(dsl_deadlist_t *dl)
{
dsl_deadlist_entry_t *dle;
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dle = avl_first(&dl->dl_tree);
mutex_exit(&dl->dl_lock);
return (dle);
}
/*
* Return the last entry in deadlist's avl tree
*/
dsl_deadlist_entry_t *
dsl_deadlist_last(dsl_deadlist_t *dl)
{
dsl_deadlist_entry_t *dle;
mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dle = avl_last(&dl->dl_tree);
mutex_exit(&dl->dl_lock);
return (dle);
}
/*
 * Walk ds's snapshots to regenerate the ZAP & AVL.
*/
@ -478,10 +678,11 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
}
static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
dsl_deadlist_t *dl = arg;
dsl_deadlist_insert(dl, bp, tx);
dsl_deadlist_insert(dl, bp, bp_freed, tx);
return (0);
}
@ -572,3 +773,109 @@ dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
}
mutex_exit(&dl->dl_lock);
}
typedef struct livelist_entry {
const blkptr_t *le_bp;
avl_node_t le_node;
} livelist_entry_t;
static int
livelist_compare(const void *larg, const void *rarg)
{
const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
/* Sort them according to dva[0] */
uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
if (l_dva0_vdev != r_dva0_vdev)
return (AVL_CMP(l_dva0_vdev, r_dva0_vdev));
/* if vdevs are equal, sort by offsets. */
uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
if (l_dva0_offset == r_dva0_offset)
ASSERT3U(l->blk_birth, ==, r->blk_birth);
return (AVL_CMP(l_dva0_offset, r_dva0_offset));
}
struct livelist_iter_arg {
avl_tree_t *avl;
bplist_t *to_free;
zthr_t *t;
};
/*
 * Expects an AVL tree which is incrementally filled with FREE blkptrs
* and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
* corresponding FREE are stored in the supplied bplist.
*/
static int
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
struct livelist_iter_arg *lia = arg;
avl_tree_t *avl = lia->avl;
bplist_t *to_free = lia->to_free;
zthr_t *t = lia->t;
ASSERT(tx == NULL);
if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
return (SET_ERROR(EINTR));
if (bp_freed) {
livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
KM_SLEEP);
blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
*temp_bp = *bp;
node->le_bp = temp_bp;
avl_add(avl, node);
} else {
livelist_entry_t node;
node.le_bp = bp;
livelist_entry_t *found = avl_find(avl, &node, NULL);
if (found != NULL) {
avl_remove(avl, found);
kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
kmem_free(found, sizeof (livelist_entry_t));
} else {
bplist_append(to_free, bp);
}
}
return (0);
}
/*
* Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
* which have an ALLOC entry but no matching FREE
*/
int
dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
uint64_t *size)
{
avl_tree_t avl;
avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
offsetof(livelist_entry_t, le_node));
/* process the sublist */
struct livelist_iter_arg arg = {
.avl = &avl,
.to_free = to_free,
.t = t
};
int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
avl_destroy(&avl);
return (err);
}
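/*
 * Typical use, sketched from the deletion path: collect the blkptrs of one
 * sub-livelist entry that still need freeing, then hand them to a synctask
 * (see the livelist delete and condense zthr callbacks later in this change):
 *
 *	bplist_t to_free;
 *	bplist_create(&to_free);
 *	int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, t, NULL);
 *	...
 *	bplist_clear(&to_free);
 *	bplist_destroy(&to_free);
 */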
#if defined(_KERNEL)
/* CSTYLED */
module_param(zfs_livelist_max_entries, ulong, 0644);
MODULE_PARM_DESC(zfs_livelist_max_entries,
"Size to start the next sub-livelist in a livelist");
module_param(zfs_livelist_min_percent_shared, int, 0644);
MODULE_PARM_DESC(zfs_livelist_min_percent_shared,
"Threshold at which livelist is disabled");
#endif

View File

@ -45,6 +45,9 @@
#include <sys/dmu_impl.h>
#include <sys/zvol.h>
#include <sys/zcp.h>
#include <sys/dsl_deadlist.h>
#include <sys/zthr.h>
#include <sys/spa_impl.h>
int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
@ -120,7 +123,7 @@ struct process_old_arg {
};
static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
struct process_old_arg *poa = arg;
dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
@ -128,7 +131,7 @@ process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
if (poa->ds_prev && !poa->after_branch_point &&
bp->blk_birth >
dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
@ -852,6 +855,127 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
dmu_object_free_zapified(mos, ddobj, tx);
}
static void
dsl_clone_destroy_assert(dsl_dir_t *dd)
{
uint64_t used, comp, uncomp;
ASSERT(dsl_dir_is_clone(dd));
dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used);
ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp);
/*
 * Greater than or equal to because we do not track embedded block pointers in
* the livelist
*/
ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp);
ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list));
ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list));
}
/*
* Start the delete process for a clone. Free its zil, verify the space usage
* and queue the blkptrs for deletion by adding the livelist to the pool-wide
* delete queue.
*/
static void
dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
uint64_t zap_obj, to_delete, used, comp, uncomp;
objset_t *os;
dsl_dir_t *dd = ds->ds_dir;
dsl_pool_t *dp = dmu_tx_pool(tx);
objset_t *mos = dp->dp_meta_objset;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
VERIFY0(dmu_objset_from_ds(ds, &os));
/* Check that the clone is in a correct state to be deleted */
dsl_clone_destroy_assert(dd);
/* Destroy the zil */
zil_destroy_sync(dmu_objset_zil(os), tx);
VERIFY0(zap_lookup(mos, dd->dd_object,
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
/* Initialize deleted_clones entry to track livelists to cleanup */
int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
if (error == ENOENT) {
zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
DMU_OT_NONE, 0, tx);
VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
&(zap_obj), tx));
spa->spa_livelists_to_delete = zap_obj;
} else if (error != 0) {
zfs_panic_recover("zfs: error %d was returned while looking "
"up DMU_POOL_DELETED_CLONES in the zap");
return;
}
VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
/* Clone is no longer using space, now tracked by dp_free_dir */
dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
dsl_dir_diduse_space(dd, DD_USED_HEAD,
-used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
used, comp, uncomp, tx);
dsl_dir_remove_livelist(dd, tx, B_FALSE);
zthr_wakeup(spa->spa_livelist_delete_zthr);
}
/*
* Move the bptree into the pool's list of trees to clean up, update space
* accounting information and destroy the zil.
*/
void
dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
uint64_t used, comp, uncomp;
objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
dsl_pool_t *dp = dmu_tx_pool(tx);
objset_t *mos = dp->dp_meta_objset;
zil_destroy_sync(dmu_objset_zil(os), tx);
if (!spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY)) {
dsl_scan_t *scn = dp->dp_scan;
spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
tx);
dp->dp_bptree_obj = bptree_alloc(mos, tx);
VERIFY0(zap_add(mos,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
&dp->dp_bptree_obj, tx));
ASSERT(!scn->scn_async_destroying);
scn->scn_async_destroying = B_TRUE;
}
used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == used);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bptree_add(mos, dp->dp_bptree_obj,
&dsl_dataset_phys(ds)->ds_bp,
dsl_dataset_phys(ds)->ds_prev_snap_txg,
used, comp, uncomp, tx);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-used, -comp, -uncomp, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
used, comp, uncomp, tx);
}
void
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
{
@ -911,7 +1035,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
}
/*
* Destroy the deadlist. Unless it's a clone, the
* deadlist should be empty since the dataset has no snapshots.
* (If it's a clone, it's safe to ignore the deadlist contents
* since they are still referenced by the origin snapshot.)
@ -924,51 +1048,18 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
if (dsl_dataset_remap_deadlist_exists(ds))
dsl_dataset_destroy_remap_deadlist(ds, tx);
objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
old_synchronous_dataset_destroy(ds, tx);
/*
* Each destroy is responsible for both destroying (enqueuing
* to be destroyed) the blkptrs comprising the dataset as well as
* those belonging to the zil.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
dsl_async_clone_destroy(ds, tx);
} else if (spa_feature_is_enabled(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY)) {
dsl_async_dataset_destroy(ds, tx);
} else {
/*
* Move the bptree into the pool's list of trees to
* clean up and update space accounting information.
*/
uint64_t used, comp, uncomp;
zil_destroy_sync(dmu_objset_zil(os), tx);
if (!spa_feature_is_active(dp->dp_spa,
SPA_FEATURE_ASYNC_DESTROY)) {
dsl_scan_t *scn = dp->dp_scan;
spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
tx);
dp->dp_bptree_obj = bptree_alloc(mos, tx);
VERIFY0(zap_add(mos,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
&dp->dp_bptree_obj, tx));
ASSERT(!scn->scn_async_destroying);
scn->scn_async_destroying = B_TRUE;
}
used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
dsl_dataset_phys(ds)->ds_unique_bytes == used);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bptree_add(mos, dp->dp_bptree_obj,
&dsl_dataset_phys(ds)->ds_bp,
dsl_dataset_phys(ds)->ds_prev_snap_txg,
used, comp, uncomp, tx);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-used, -comp, -uncomp, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
used, comp, uncomp, tx);
old_synchronous_dataset_destroy(ds, tx);
}
if (ds->ds_prev != NULL) {

View File

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
* Copyright (c) 2014 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@ -48,6 +48,7 @@
#include <sys/policy.h>
#include <sys/zfs_znode.h>
#include <sys/zvol.h>
#include <sys/zthr.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@ -155,6 +156,9 @@ dsl_dir_evict_async(void *dbu)
spa_async_close(dd->dd_pool->dp_spa, dd);
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
@ -255,6 +259,16 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_origin_txg =
origin_phys->ds_creation_txg;
dmu_buf_rele(origin_bonus, FTAG);
if (dsl_dir_is_zapified(dd)) {
uint64_t obj;
err = zap_lookup(dp->dp_meta_objset,
dd->dd_object, DD_FIELD_LIVELIST,
sizeof (uint64_t), 1, &obj);
if (err == 0)
dsl_dir_livelist_open(dd, obj);
else if (err != ENOENT)
goto errout;
}
}
dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
@ -263,6 +277,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (winner != NULL) {
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
@ -291,6 +307,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
errout:
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
if (dsl_deadlist_is_open(&dd->dd_livelist))
dsl_dir_livelist_close(dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
@ -2178,6 +2196,90 @@ dsl_dir_is_zapified(dsl_dir_t *dd)
return (doi.doi_type == DMU_OTN_ZAP_METADATA);
}
void
dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
SPA_FEATURE_LIVELIST));
dsl_deadlist_open(&dd->dd_livelist, mos, obj);
bplist_create(&dd->dd_pending_allocs);
bplist_create(&dd->dd_pending_frees);
}
void
dsl_dir_livelist_close(dsl_dir_t *dd)
{
dsl_deadlist_close(&dd->dd_livelist);
bplist_destroy(&dd->dd_pending_allocs);
bplist_destroy(&dd->dd_pending_frees);
}
void
dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
{
uint64_t obj;
dsl_pool_t *dp = dmu_tx_pool(tx);
spa_t *spa = dp->dp_spa;
livelist_condense_entry_t to_condense = spa->spa_to_condense;
if (!dsl_deadlist_is_open(&dd->dd_livelist))
return;
/*
* If the livelist being removed is set to be condensed, stop the
* condense zthr and indicate the cancellation in the spa_to_condense
* struct in case the condense no-wait synctask has already started
*/
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
if (ll_condense_thread != NULL &&
(to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
/*
* We use zthr_wait_cycle_done instead of zthr_cancel
* because we don't want to destroy the zthr, just have
* it skip its current task.
*/
spa->spa_to_condense.cancelled = B_TRUE;
zthr_wait_cycle_done(ll_condense_thread);
/*
 * If we've returned from zthr_wait_cycle_done without
 * clearing the to_condense data structure, it's either
 * because the no-wait synctask has started (which is
 * indicated by the 'syncing' field of to_condense) and we
 * can expect it to clear to_condense on its own, or
 * because we returned before the zthr ran. In the latter case, the
* checkfunc will now fail as cancelled == B_TRUE so we
* can safely NULL out ds, allowing a different dir's
* livelist to be condensed.
*
* We can be sure that the to_condense struct will not
* be repopulated at this stage because both this
* function and dsl_livelist_try_condense execute in
* syncing context.
*/
if ((spa->spa_to_condense.ds != NULL) &&
!spa->spa_to_condense.syncing) {
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
spa);
spa->spa_to_condense.ds = NULL;
}
}
dsl_dir_livelist_close(dd);
int err = zap_lookup(dp->dp_meta_objset, dd->dd_object,
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj);
if (err == 0) {
VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
DD_FIELD_LIVELIST, tx));
if (total) {
dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
}
} else {
ASSERT3U(err, !=, ENOENT);
}
}
#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_dir_set_quota);
EXPORT_SYMBOL(dsl_dir_set_reservation);

View File

@ -721,7 +721,8 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
* Now that the datasets have been completely synced, we can
* clean up our in-memory structures accumulated while syncing:
*
* - move dead blocks from the pending deadlist to the on-disk deadlist
* - move dead blocks from the pending deadlist and livelists
* to the on-disk versions
* - release hold from dsl_dataset_dirty()
* - release key mapping hold from dsl_dataset_dirty()
*/

View File

@ -3103,8 +3103,18 @@ dsl_scan_update_stats(dsl_scan_t *scn)
}
static int
dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
ASSERT(!bp_freed);
return (dsl_scan_free_block_cb(arg, bp, tx));
}
static int
dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
ASSERT(!bp_freed);
dsl_scan_t *scn = arg;
const dva_t *dva = &bp->blk_dva[0];
@ -3123,6 +3133,7 @@ dsl_scan_active(dsl_scan_t *scn)
{
spa_t *spa = scn->scn_dp->dp_spa;
uint64_t used = 0, comp, uncomp;
boolean_t clones_left;
if (spa->spa_load_state != SPA_LOAD_NONE)
return (B_FALSE);
@ -3136,7 +3147,8 @@ dsl_scan_active(dsl_scan_t *scn)
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
&used, &comp, &uncomp);
}
return (used != 0);
clones_left = spa_livelist_delete_check(spa);
return ((used != 0) || (clones_left));
}
static boolean_t
@ -3233,7 +3245,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
scn->scn_zio_root = zio_root(spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bpobj_iterate(&dp->dp_free_bpobj,
dsl_scan_free_block_cb, scn, tx);
bpobj_dsl_scan_free_block_cb, scn, tx);
VERIFY0(zio_wait(scn->scn_zio_root));
scn->scn_zio_root = NULL;
@ -3330,7 +3342,8 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
-dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
}
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
!spa_livelist_delete_check(spa)) {
/* finished; verify that space accounting went to zero */
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);

View File

@ -232,6 +232,27 @@ uint64_t zfs_max_missing_tvds_scan = 0;
*/
boolean_t zfs_pause_spa_sync = B_FALSE;
/*
* Variables to indicate the livelist condense zthr func should wait at certain
* points for the livelist to be removed - used to test condense/destroy races
*/
int zfs_livelist_condense_zthr_pause = 0;
int zfs_livelist_condense_sync_pause = 0;
/*
* Variables to track whether or not condense cancellation has been
* triggered in testing.
*/
int zfs_livelist_condense_sync_cancel = 0;
int zfs_livelist_condense_zthr_cancel = 0;
/*
* Variable to track whether or not extra ALLOC blkptrs were added to a
* livelist entry while it was being condensed (caused by the way we track
* remapped blkptrs in dbuf_remap_impl)
*/
int zfs_livelist_condense_new_alloc = 0;
/*
* ==========================================================================
* SPA properties routines
@ -1481,6 +1502,27 @@ spa_unload_log_sm_metadata(spa_t *spa)
spa->spa_unflushed_stats.sus_blocklimit = 0;
}
static void
spa_destroy_aux_threads(spa_t *spa)
{
if (spa->spa_condense_zthr != NULL) {
zthr_destroy(spa->spa_condense_zthr);
spa->spa_condense_zthr = NULL;
}
if (spa->spa_checkpoint_discard_zthr != NULL) {
zthr_destroy(spa->spa_checkpoint_discard_zthr);
spa->spa_checkpoint_discard_zthr = NULL;
}
if (spa->spa_livelist_delete_zthr != NULL) {
zthr_destroy(spa->spa_livelist_delete_zthr);
spa->spa_livelist_delete_zthr = NULL;
}
if (spa->spa_livelist_condense_zthr != NULL) {
zthr_destroy(spa->spa_livelist_condense_zthr);
spa->spa_livelist_condense_zthr = NULL;
}
}
/*
* Opposite of spa_load().
*/
@ -1552,15 +1594,7 @@ spa_unload(spa_t *spa)
spa->spa_vdev_removal = NULL;
}
if (spa->spa_condense_zthr != NULL) {
zthr_destroy(spa->spa_condense_zthr);
spa->spa_condense_zthr = NULL;
}
if (spa->spa_checkpoint_discard_zthr != NULL) {
zthr_destroy(spa->spa_checkpoint_discard_zthr);
spa->spa_checkpoint_discard_zthr = NULL;
}
spa_destroy_aux_threads(spa);
spa_condense_fini(spa);
@ -2335,6 +2369,376 @@ spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
return (SET_ERROR(err));
}
boolean_t
spa_livelist_delete_check(spa_t *spa)
{
return (spa->spa_livelists_to_delete != 0);
}
/* ARGSUSED */
static boolean_t
spa_livelist_delete_cb_check(void *arg, zthr_t *z)
{
spa_t *spa = arg;
return (spa_livelist_delete_check(spa));
}
static int
delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
spa_t *spa = arg;
zio_free(spa, tx->tx_txg, bp);
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(spa, bp),
-BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
return (0);
}
static int
dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
{
int err;
zap_cursor_t zc;
zap_attribute_t za;
zap_cursor_init(&zc, os, zap_obj);
err = zap_cursor_retrieve(&zc, &za);
zap_cursor_fini(&zc);
if (err == 0)
*llp = za.za_first_integer;
return (err);
}
/*
* Components of livelist deletion that must be performed in syncing
* context: freeing block pointers and updating the pool-wide data
* structures to indicate how much work is left to do
*/
typedef struct sublist_delete_arg {
spa_t *spa;
dsl_deadlist_t *ll;
uint64_t key;
bplist_t *to_free;
} sublist_delete_arg_t;
static void
sublist_delete_sync(void *arg, dmu_tx_t *tx)
{
sublist_delete_arg_t *sda = arg;
spa_t *spa = sda->spa;
dsl_deadlist_t *ll = sda->ll;
uint64_t key = sda->key;
bplist_t *to_free = sda->to_free;
bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
dsl_deadlist_remove_entry(ll, key, tx);
}
typedef struct livelist_delete_arg {
spa_t *spa;
uint64_t ll_obj;
uint64_t zap_obj;
} livelist_delete_arg_t;
static void
livelist_delete_sync(void *arg, dmu_tx_t *tx)
{
livelist_delete_arg_t *lda = arg;
spa_t *spa = lda->spa;
uint64_t ll_obj = lda->ll_obj;
uint64_t zap_obj = lda->zap_obj;
objset_t *mos = spa->spa_meta_objset;
uint64_t count;
/* free the livelist and decrement the feature count */
VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
dsl_deadlist_free(mos, ll_obj, tx);
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
VERIFY0(zap_count(mos, zap_obj, &count));
if (count == 0) {
/* no more livelists to delete */
VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_DELETED_CLONES, tx));
VERIFY0(zap_destroy(mos, zap_obj, tx));
spa->spa_livelists_to_delete = 0;
}
}
/*
* Load in the value for the livelist to be removed and open it. Then,
* load its first sublist and determine which block pointers should actually
* be freed. Then, call a synctask which performs the actual frees and updates
* the pool-wide livelist data.
*/
/* ARGSUSED */
void
spa_livelist_delete_cb(void *arg, zthr_t *z)
{
spa_t *spa = arg;
uint64_t ll_obj = 0, count;
objset_t *mos = spa->spa_meta_objset;
uint64_t zap_obj = spa->spa_livelists_to_delete;
/*
* Determine the next livelist to delete. This function should only
* be called if there is at least one deleted clone.
*/
VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
VERIFY0(zap_count(mos, ll_obj, &count));
if (count > 0) {
dsl_deadlist_t ll = { 0 };
dsl_deadlist_entry_t *dle;
bplist_t to_free;
dsl_deadlist_open(&ll, mos, ll_obj);
dle = dsl_deadlist_first(&ll);
ASSERT3P(dle, !=, NULL);
bplist_create(&to_free);
int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
z, NULL);
if (err == 0) {
sublist_delete_arg_t sync_arg = {
.spa = spa,
.ll = &ll,
.key = dle->dle_mintxg,
.to_free = &to_free
};
zfs_dbgmsg("deleting sublist (id %llu) from"
" livelist %llu, %d remaining",
dle->dle_bpobj.bpo_object, ll_obj, count - 1);
VERIFY0(dsl_sync_task(spa_name(spa), NULL,
sublist_delete_sync, &sync_arg, 0,
ZFS_SPACE_CHECK_DESTROY));
} else {
ASSERT(err == EINTR);
}
bplist_clear(&to_free);
bplist_destroy(&to_free);
dsl_deadlist_close(&ll);
} else {
livelist_delete_arg_t sync_arg = {
.spa = spa,
.ll_obj = ll_obj,
.zap_obj = zap_obj
};
zfs_dbgmsg("deletion of livelist %llu completed", ll_obj);
VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
&sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
}
}
void
spa_start_livelist_destroy_thread(spa_t *spa)
{
ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
spa->spa_livelist_delete_zthr = zthr_create(
spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
}
typedef struct livelist_new_arg {
bplist_t *allocs;
bplist_t *frees;
} livelist_new_arg_t;
static int
livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
ASSERT(tx == NULL);
livelist_new_arg_t *lna = arg;
if (bp_freed) {
bplist_append(lna->frees, bp);
} else {
bplist_append(lna->allocs, bp);
zfs_livelist_condense_new_alloc++;
}
return (0);
}
typedef struct livelist_condense_arg {
spa_t *spa;
bplist_t to_keep;
uint64_t first_size;
uint64_t next_size;
} livelist_condense_arg_t;
static void
spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
{
livelist_condense_arg_t *lca = arg;
spa_t *spa = lca->spa;
bplist_t new_frees;
dsl_dataset_t *ds = spa->spa_to_condense.ds;
/* Have we been cancelled? */
if (spa->spa_to_condense.cancelled) {
zfs_livelist_condense_sync_cancel++;
goto out;
}
dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
/*
* It's possible that the livelist was changed while the zthr was
* running. Therefore, we need to check for new blkptrs in the two
* entries being condensed and continue to track them in the livelist.
* Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
* it's possible that the newly added blkptrs are FREEs or ALLOCs so
* we need to sort them into two different bplists.
*/
uint64_t first_obj = first->dle_bpobj.bpo_object;
uint64_t next_obj = next->dle_bpobj.bpo_object;
uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
bplist_create(&new_frees);
livelist_new_arg_t new_bps = {
.allocs = &lca->to_keep,
.frees = &new_frees,
};
if (cur_first_size > lca->first_size) {
VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
livelist_track_new_cb, &new_bps, lca->first_size));
}
if (cur_next_size > lca->next_size) {
VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
livelist_track_new_cb, &new_bps, lca->next_size));
}
dsl_deadlist_clear_entry(first, ll, tx);
ASSERT(bpobj_is_empty(&first->dle_bpobj));
dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
bplist_destroy(&new_frees);
char dsname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
"(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
"(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj,
cur_first_size, next_obj, cur_next_size,
first->dle_bpobj.bpo_object,
first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
out:
dmu_buf_rele(ds->ds_dbuf, spa);
spa->spa_to_condense.ds = NULL;
bplist_clear(&lca->to_keep);
bplist_destroy(&lca->to_keep);
kmem_free(lca, sizeof (livelist_condense_arg_t));
spa->spa_to_condense.syncing = B_FALSE;
}
void
spa_livelist_condense_cb(void *arg, zthr_t *t)
{
while (zfs_livelist_condense_zthr_pause &&
!(zthr_has_waiters(t) || zthr_iscancelled(t)))
delay(1);
spa_t *spa = arg;
dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
uint64_t first_size, next_size;
livelist_condense_arg_t *lca =
kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
bplist_create(&lca->to_keep);
/*
* Process the livelists (matching FREEs and ALLOCs) in open context
* so we have minimal work in syncing context to condense.
*
* We save bpobj sizes (first_size and next_size) to use later in
* syncing context to determine if entries were added to these sublists
* while in open context. This is possible because the clone is still
* active and open for normal writes and we want to make sure the new,
* unprocessed blockpointers are inserted into the livelist normally.
*
 * Note that dsl_process_sub_livelist() both stores the number of
 * blockpointers and iterates over them while the bpobj's lock is held, so
 * the sizes returned to us are consistent with what was actually
 * processed.
*/
int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
&first_size);
if (err == 0)
err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
t, &next_size);
if (err == 0) {
while (zfs_livelist_condense_sync_pause &&
!(zthr_has_waiters(t) || zthr_iscancelled(t)))
delay(1);
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
dmu_tx_mark_netfree(tx);
dmu_tx_hold_space(tx, 1);
err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
if (err == 0) {
/*
* Prevent the condense zthr restarting before
* the synctask completes.
*/
spa->spa_to_condense.syncing = B_TRUE;
lca->spa = spa;
lca->first_size = first_size;
lca->next_size = next_size;
dsl_sync_task_nowait(spa_get_dsl(spa),
spa_livelist_condense_sync, lca, 0,
ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
return;
}
}
/*
 * Condensing cannot continue: either it was externally stopped or
* we were unable to assign to a tx because the pool has run out of
* space. In the second case, we'll just end up trying to condense
* again in a later txg.
*/
ASSERT(err != 0);
bplist_clear(&lca->to_keep);
bplist_destroy(&lca->to_keep);
kmem_free(lca, sizeof (livelist_condense_arg_t));
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
spa->spa_to_condense.ds = NULL;
if (err == EINTR)
zfs_livelist_condense_zthr_cancel++;
}
/* ARGSUSED */
/*
* Check that there is something to condense but that a condense is not
* already in progress and that condensing has not been cancelled.
*/
static boolean_t
spa_livelist_condense_cb_check(void *arg, zthr_t *z)
{
spa_t *spa = arg;
if ((spa->spa_to_condense.ds != NULL) &&
(spa->spa_to_condense.syncing == B_FALSE) &&
(spa->spa_to_condense.cancelled == B_FALSE)) {
return (B_TRUE);
}
return (B_FALSE);
}
void
spa_start_livelist_condensing_thread(spa_t *spa)
{
spa->spa_to_condense.ds = NULL;
spa->spa_to_condense.first = NULL;
spa->spa_to_condense.next = NULL;
spa->spa_to_condense.syncing = B_FALSE;
spa->spa_to_condense.cancelled = B_FALSE;
ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
spa->spa_livelist_condense_zthr = zthr_create(
spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa);
}
static void
spa_spawn_aux_threads(spa_t *spa)
{
@ -2343,6 +2747,8 @@ spa_spawn_aux_threads(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
spa_start_indirect_condensing_thread(spa);
spa_start_livelist_destroy_thread(spa);
spa_start_livelist_condensing_thread(spa);
ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
spa->spa_checkpoint_discard_zthr =
@ -3603,6 +4009,15 @@ spa_ld_get_props(spa_t *spa)
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the livelist deletion field. If a livelist is queued for
* deletion, indicate that in the spa
*/
error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
&spa->spa_livelists_to_delete, B_FALSE);
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
/*
* Load the history object. If we have an older pool, this
* will not be present.
@ -7571,6 +7986,14 @@ spa_async_suspend(spa_t *spa)
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_cancel(discard_thread);
zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
if (ll_delete_thread != NULL)
zthr_cancel(ll_delete_thread);
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
if (ll_condense_thread != NULL)
zthr_cancel(ll_condense_thread);
}
void
@ -7589,6 +8012,14 @@ spa_async_resume(spa_t *spa)
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_resume(discard_thread);
zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
if (ll_delete_thread != NULL)
zthr_resume(ll_delete_thread);
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
if (ll_condense_thread != NULL)
zthr_resume(ll_condense_thread);
}
static boolean_t
@ -7639,14 +8070,28 @@ spa_async_request(spa_t *spa, int task)
* ==========================================================================
*/
static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
bpobj_t *bpo = arg;
bpobj_enqueue(bpo, bp, tx);
bpobj_enqueue(bpo, bp, bp_freed, tx);
return (0);
}
int
bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
}
int
bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
}
static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
@ -7657,6 +8102,14 @@ spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
return (0);
}
static int
bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
dmu_tx_t *tx)
{
ASSERT(!bp_freed);
return (spa_free_sync_cb(arg, bp, tx));
}
/*
* Note: this simple function is not inlined to make it easier to dtrace the
* amount of time spent syncing frees.
@ -7693,7 +8146,7 @@ spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
*/
zio_t *zio = zio_root(spa, NULL, NULL, 0);
VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
spa_free_sync_cb, zio, tx), ==, 0);
bpobj_spa_free_sync_cb, zio, tx), ==, 0);
VERIFY0(zio_wait(zio));
}
@ -8296,7 +8749,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
* we sync the deferred frees later in pass 1.
*/
ASSERT3U(pass, >, 1);
bplist_iterate(free_bpl, bpobj_enqueue_cb,
bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
&spa->spa_deferred_bpobj, tx);
}
@ -8884,4 +9337,24 @@ MODULE_PARM_DESC(zfs_max_missing_tvds,
" (in read-only mode)");
/* END CSTYLED */
module_param(zfs_livelist_condense_zthr_pause, int, 0644);
MODULE_PARM_DESC(zfs_livelist_condense_zthr_pause,
"Set the livelist condense zthr to pause");
module_param(zfs_livelist_condense_sync_pause, int, 0644);
MODULE_PARM_DESC(zfs_livelist_condense_sync_pause,
"Set the livelist condense synctask to pause");
module_param(zfs_livelist_condense_sync_cancel, int, 0644);
MODULE_PARM_DESC(zfs_livelist_condense_sync_cancel,
"Whether livelist condensing was canceled in the synctask");
module_param(zfs_livelist_condense_zthr_cancel, int, 0644);
MODULE_PARM_DESC(zfs_livelist_condense_zthr_cancel,
"Whether livelist condensing was canceled in the zthr function");
/* BEGIN CSTYLED */
module_param(zfs_livelist_condense_new_alloc, int, 0644);
MODULE_PARM_DESC(zfs_livelist_condense_new_alloc,
"Whether extra ALLOC blkptrs were added to a livelist entry while it"
" was being condensed");
/* END CSTYLED */
#endif

View File

@ -21,7 +21,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
*/
@ -413,7 +413,6 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
/* spa_history_log_sync will free nvl */
return (err);
}
/*

View File

@ -207,12 +207,15 @@ struct zthr {
/* flag set to true if we are canceling the zthr */
boolean_t zthr_cancel;
/* flag set to true if we are waiting for the zthr to finish */
boolean_t zthr_haswaiters;
kcondvar_t zthr_wait_cv;
/*
 * maximum amount of time that the zthr spends sleeping;
* if this is 0, the thread doesn't wake up until it gets
* signaled.
*/
hrtime_t zthr_wait_time;
hrtime_t zthr_sleep_timeout;
/* consumer-provided callbacks & data */
zthr_checkfunc_t *zthr_checkfunc;
@ -239,14 +242,18 @@ zthr_procedure(void *arg)
* order to prevent this process from incorrectly
* contributing to the system load average when idle.
*/
if (t->zthr_wait_time == 0) {
if (t->zthr_sleep_timeout == 0) {
cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock);
} else {
(void) cv_timedwait_sig_hires(&t->zthr_cv,
&t->zthr_state_lock, t->zthr_wait_time,
&t->zthr_state_lock, t->zthr_sleep_timeout,
MSEC2NSEC(1), 0);
}
}
if (t->zthr_haswaiters) {
t->zthr_haswaiters = B_FALSE;
cv_broadcast(&t->zthr_wait_cv);
}
}
/*
@ -280,12 +287,13 @@ zthr_create_timer(zthr_checkfunc_t *checkfunc, zthr_func_t *func,
mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL);
mutex_enter(&t->zthr_state_lock);
t->zthr_checkfunc = checkfunc;
t->zthr_func = func;
t->zthr_arg = arg;
t->zthr_wait_time = max_sleep;
t->zthr_sleep_timeout = max_sleep;
t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t,
0, &p0, TS_RUN, minclsyspri);
@ -303,6 +311,7 @@ zthr_destroy(zthr_t *t)
mutex_destroy(&t->zthr_request_lock);
mutex_destroy(&t->zthr_state_lock);
cv_destroy(&t->zthr_cv);
cv_destroy(&t->zthr_wait_cv);
kmem_free(t, sizeof (*t));
}
@ -355,9 +364,8 @@ zthr_cancel(zthr_t *t)
*
* [1] The thread has already been cancelled, therefore
* there is nothing for us to do.
* [2] The thread is sleeping, so we broadcast the CV first
* to wake it up and then we set the flag and we are
* waiting for it to exit.
* [2] The thread is sleeping so we set the flag, broadcast
* the CV and wait for it to exit.
* [3] The thread is doing work, in which case we just set
* the flag and wait for it to finish.
* [4] The thread was just created/resumed, in which case
@ -397,6 +405,7 @@ zthr_resume(zthr_t *t)
ASSERT3P(&t->zthr_checkfunc, !=, NULL);
ASSERT3P(&t->zthr_func, !=, NULL);
ASSERT(!t->zthr_cancel);
ASSERT(!t->zthr_haswaiters);
/*
* There are 4 states that we find the zthr in at this point
@ -451,3 +460,74 @@ zthr_iscancelled(zthr_t *t)
mutex_exit(&t->zthr_state_lock);
return (cancelled);
}
/*
* Wait for the zthr to finish its current function. Similar to
* zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
 * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr is
 * sleeping or has been cancelled, this returns immediately.
*/
void
zthr_wait_cycle_done(zthr_t *t)
{
mutex_enter(&t->zthr_state_lock);
/*
* Since we are holding the zthr_state_lock at this point
 * we can find the zthr in one of the following 5 states:
*
 * [1] The thread has already been cancelled, therefore
* there is nothing for us to do.
* [2] The thread is sleeping so we set the flag, broadcast
* the CV and wait for it to exit.
* [3] The thread is doing work, in which case we just set
* the flag and wait for it to finish.
* [4] The thread was just created/resumed, in which case
* the behavior is similar to [3].
 * [5] The thread is in the middle of being cancelled, which is
* similar to [3]. We'll wait for the cancel, which is
* waiting for the zthr func.
*
* Since requests are serialized, by the time that we get
 * control back we expect that the zthr has completed its
* zthr_func.
*/
if (t->zthr_thread != NULL) {
t->zthr_haswaiters = B_TRUE;
/* broadcast in case the zthr is sleeping */
cv_broadcast(&t->zthr_cv);
while ((t->zthr_haswaiters) && (t->zthr_thread != NULL))
cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock);
ASSERT(!t->zthr_haswaiters);
}
mutex_exit(&t->zthr_state_lock);
}
/*
* This function is intended to be used by the zthr itself
* to check if another thread is waiting on it to finish
*
* returns TRUE if we have been asked to finish.
*
* returns FALSE otherwise.
*/
boolean_t
zthr_has_waiters(zthr_t *t)
{
ASSERT3P(t->zthr_thread, ==, curthread);
mutex_enter(&t->zthr_state_lock);
/*
* Similarly to zthr_iscancelled(), we only grab the
* zthr_state_lock so that the zthr itself can use this
* to check for the request.
*/
boolean_t has_waiters = t->zthr_haswaiters;
mutex_exit(&t->zthr_state_lock);
return (has_waiters);
}
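/*
 * Expected use from inside a long-running zthr callback, sketched: end the
 * current cycle early when another thread is waiting on it or the zthr is
 * being cancelled (this mirrors the check done by the livelist iterators):
 *
 *	if (zthr_has_waiters(t) || zthr_iscancelled(t))
 *		return (SET_ERROR(EINTR));
 */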

View File

@ -147,12 +147,15 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
tags = ['functional', 'cli_root', 'zfs_create']
[tests/functional/cli_root/zfs_destroy]
tests = ['zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
tests = ['zfs_clone_livelist_condense_and_disable',
'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos',
'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos',
'zfs_destroy_016_pos']
'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist',
'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense']
tags = ['functional', 'cli_root', 'zfs_destroy']
[tests/functional/cli_root/zfs_diff]

View File

@ -22,7 +22,7 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# Copyright (c) 2012, 2017 by Delphix. All rights reserved.
# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
# Copyright (c) 2017 by Tim Chase. All rights reserved.
# Copyright (c) 2017 by Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2017 Lawrence Livermore National Security, LLC.

View File

@ -2,6 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_destro
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
zfs_clone_livelist_condense_and_disable.ksh \
zfs_clone_livelist_condense_races.ksh \
zfs_destroy_001_pos.ksh \
zfs_destroy_002_pos.ksh \
zfs_destroy_003_pos.ksh \
@ -17,7 +19,10 @@ dist_pkgdata_SCRIPTS = \
zfs_destroy_013_neg.ksh \
zfs_destroy_014_pos.ksh \
zfs_destroy_015_pos.ksh \
zfs_destroy_016_pos.ksh
zfs_destroy_016_pos.ksh \
zfs_destroy_clone_livelist.ksh \
zfs_destroy_dev_removal.ksh \
zfs_destroy_dev_removal_condense.ksh
dist_pkgdata_DATA = \
zfs_destroy_common.kshlib \

View File

@ -0,0 +1,125 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Delphix. All rights reserved.
#
# DESCRIPTION
# Verify zfs destroy test for clones with the livelist feature
# enabled.
# STRATEGY
# 1. Clone where livelist is condensed
# - create clone, write several files, delete those files
# - check that the number of livelist entries decreases
# after the delete
# 2. Clone where livelist is deactivated
#	- create clone, write files. Delete those files and the
#	file that was in the filesystem when the snapshot was
#	created, so the clone and snapshot no longer share data
# - check that the livelist is destroyed
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
function cleanup
{
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
# reset the livelist sublist size to the original value
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
# reset the minimum percent shared to 75
set_tunable32 zfs_livelist_min_percent_shared $ORIGINAL_MIN
}
function check_ll_len
{
string="$(zdb -vvvvv $TESTPOOL | grep "Livelist")"
substring="$1"
msg=$2
if test "${string#*$substring}" != "$string"; then
return 0 # $substring is in $string
else
log_note $string
log_fail "$msg" # $substring is not in $string
fi
}
function test_condense
{
# set the max livelist entries to a small value to more easily
# trigger a condense
set_tunable64 zfs_livelist_max_entries 0x14
# set a small percent shared threshold so the livelist is not disabled
set_tunable32 zfs_livelist_min_percent_shared 0xa
clone_dataset $TESTFS1 snap $TESTCLONE
# sync between each write to make sure a new entry is created
for i in {0..4}; do
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i
log_must zpool sync $TESTPOOL
done
check_ll_len "5 entries" "Unexpected livelist size"
# sync between each write to allow for a condense of the previous entry
for i in {0..4}; do
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/testfile$i
log_must zpool sync $TESTPOOL
done
check_ll_len "6 entries" "Condense did not occur"
log_must zfs destroy $TESTPOOL/$TESTCLONE
check_livelist_gone
}
function test_deactivated
{
# Threshold set to 50 percent
set_tunable32 zfs_livelist_min_percent_shared 0x32
clone_dataset $TESTFS1 snap $TESTCLONE
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1
log_must zpool sync $TESTPOOL
# snapshot and clone share 'atestfile', 33 percent
check_livelist_gone
log_must zfs destroy -R $TESTPOOL/$TESTCLONE
# Threshold set to 20 percent
set_tunable32 zfs_livelist_min_percent_shared 0x14
clone_dataset $TESTFS1 snap $TESTCLONE
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE0
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE1
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/$TESTFILE2
log_must zpool sync $TESTPOOL
# snapshot and clone share 'atestfile', 25 percent
check_livelist_exists $TESTCLONE
log_must rm /$TESTPOOL/$TESTCLONE/atestfile
# snapshot and clone share no files
check_livelist_gone
log_must zfs destroy -R $TESTPOOL/$TESTCLONE
}
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
ORIGINAL_MIN=$(get_tunable zfs_livelist_min_percent_shared)
log_onexit cleanup
log_must zfs create $TESTPOOL/$TESTFS1
log_must mkfile 5m /$TESTPOOL/$TESTFS1/atestfile
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
test_condense
test_deactivated
log_pass "Clone's livelist condenses and disables as expected."

View File

@ -0,0 +1,116 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Delphix. All rights reserved.
#
# DESCRIPTION
# Test race conditions for livelist condensing
# STRATEGY
# These tests exercise code paths that deal with a livelist being
# simultaneously condensed and deactivated (deleted, exported or disabled).
# If a variable is set, the zthr will pause until it is cancelled or waited
# on, and then a counter variable keeps track of whether or not the code path
# is reached.
# 1. Deletion race: repeatedly overwrite the same file to trigger condense
# and then delete the clone.
# 2. Disable race: Overwrite enough files to trigger condenses and disabling of
# the livelist.
# 3. Export race: repeatedly overwrite the same file to trigger condense and
# then export the pool.
. $STF_SUITE/include/libtest.shlib
function cleanup
{
log_must zfs destroy -Rf $TESTPOOL/$TESTFS1
# reset the livelist sublist size to the original value
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
# reset the condense tests to 0
set_tunable32 zfs_livelist_condense_zthr_pause 0
set_tunable32 zfs_livelist_condense_sync_pause 0
}
function delete_race
{
set_tunable32 "$1" 0
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE
for i in {1..5}; do
log_must zpool sync $TESTPOOL
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out
done
log_must zfs destroy $TESTPOOL/$TESTCLONE
log_must zpool sync $TESTPOOL
[[ "1" == "$(get_tunable "$1")" ]] || \
log_fail "delete/condense race test failed"
}
function export_race
{
set_tunable32 "$1" 0
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE
for i in {1..5}; do
log_must zpool sync $TESTPOOL
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out
done
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
[[ "1" == "$(get_tunable "$1")" ]] || \
log_fail "export/condense race test failed"
log_must zfs destroy $TESTPOOL/$TESTCLONE
}
function disable_race
{
set_tunable32 "$1" 0
log_must zfs clone $TESTPOOL/$TESTFS1@snap $TESTPOOL/$TESTCLONE
for i in {1..5}; do
log_must zpool sync $TESTPOOL
log_must mkfile 5m /$TESTPOOL/$TESTCLONE/out
done
# overwrite the file shared with the origin to trigger disable
log_must mkfile 100m /$TESTPOOL/$TESTCLONE/atestfile
log_must zpool sync $TESTPOOL
[[ "1" == "$(get_tunable "$1")" ]] || \
log_fail "disable/condense race test failed"
log_must zfs destroy $TESTPOOL/$TESTCLONE
}
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
log_onexit cleanup
log_must zfs create $TESTPOOL/$TESTFS1
log_must mkfile 100m /$TESTPOOL/$TESTFS1/atestfile
log_must zpool sync $TESTPOOL
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
# Reduce livelist size to trigger condense more easily
set_tunable64 zfs_livelist_max_entries 0x14
# Test cancellation path in the zthr
set_tunable32 zfs_livelist_condense_zthr_pause 1
set_tunable32 zfs_livelist_condense_sync_pause 0
disable_race "zfs_livelist_condense_zthr_cancel"
delete_race "zfs_livelist_condense_zthr_cancel"
export_race "zfs_livelist_condense_zthr_cancel"
# Test cancellation path in the synctask
set_tunable32 zfs_livelist_condense_zthr_pause 0
set_tunable32 zfs_livelist_condense_sync_pause 1
disable_race "zfs_livelist_condense_sync_cancel"
delete_race "zfs_livelist_condense_sync_cancel"
log_pass "Clone livelist condense race conditions passed."

View File

@ -0,0 +1,140 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Delphix. All rights reserved.
#
# DESCRIPTION
# Verify zfs destroy test for clones with the livelist feature
# enabled.
# STRATEGY
# 1. One clone with an empty livelist
# - create the clone, check that livelist exists
# - delete the clone, check that livelist is eventually
# destroyed
# 2. One clone with populated livelist
# - create the clone, check that livelist exists
# - write multiple files to the clone
# - delete the clone, check that livelist is eventually
# destroyed
# 3. Multiple clones with empty livelists
# - same as 1. but with multiple clones
# 4. Multiple clones with populated livelists
# - same as 2. but with multiple clones
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
function cleanup
{
datasetexists $TESTPOOL/$TESTFS1 && zfs destroy -R $TESTPOOL/$TESTFS1
# reset the livelist sublist size to its original value
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
}
function clone_write_file
{
log_must mkfile 1m /$TESTPOOL/$1/$2
log_must zpool sync $TESTPOOL
}
function test_one_empty
{
clone_dataset $TESTFS1 snap $TESTCLONE
log_must zfs destroy $TESTPOOL/$TESTCLONE
check_livelist_gone
}
function test_one
{
clone_dataset $TESTFS1 snap $TESTCLONE
clone_write_file $TESTCLONE $TESTFILE0
clone_write_file $TESTCLONE $TESTFILE1
clone_write_file $TESTCLONE $TESTFILE2
log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE0
log_must rm /$TESTPOOL/$TESTCLONE/$TESTFILE2
check_livelist_exists $TESTCLONE
log_must zfs destroy $TESTPOOL/$TESTCLONE
check_livelist_gone
}
function test_multiple_empty
{
clone_dataset $TESTFS1 snap $TESTCLONE
clone_dataset $TESTFS1 snap $TESTCLONE1
clone_dataset $TESTFS1 snap $TESTCLONE2
log_must zfs destroy $TESTPOOL/$TESTCLONE
log_must zfs destroy $TESTPOOL/$TESTCLONE1
log_must zfs destroy $TESTPOOL/$TESTCLONE2
check_livelist_gone
}
function test_multiple
{
clone_dataset $TESTFS1 snap $TESTCLONE
clone_dataset $TESTFS1 snap $TESTCLONE1
clone_dataset $TESTFS1 snap $TESTCLONE2
clone_write_file $TESTCLONE $TESTFILE0
clone_write_file $TESTCLONE1 $TESTFILE0
clone_write_file $TESTCLONE1 $TESTFILE1
clone_write_file $TESTCLONE1 $TESTFILE2
clone_write_file $TESTCLONE2 $TESTFILE0
log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE0
clone_write_file $TESTCLONE2 $TESTFILE1
log_must rm /$TESTPOOL/$TESTCLONE2/$TESTFILE1
check_livelist_exists $TESTCLONE
check_livelist_exists $TESTCLONE1
check_livelist_exists $TESTCLONE2
log_must zfs destroy $TESTPOOL/$TESTCLONE
log_must zfs destroy $TESTPOOL/$TESTCLONE1
log_must zfs destroy $TESTPOOL/$TESTCLONE2
check_livelist_gone
}
function test_promote
{
clone_dataset $TESTFS1 snap $TESTCLONE
log_must zfs promote $TESTPOOL/$TESTCLONE
check_livelist_gone
log_must zfs destroy -R $TESTPOOL/$TESTCLONE
}
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
log_onexit cleanup
log_must zfs create $TESTPOOL/$TESTFS1
log_must mkfile 20m /$TESTPOOL/$TESTFS1/atestfile
log_must zfs snapshot $TESTPOOL/$TESTFS1@snap
# Set a small livelist sub-list size to more easily test multiple-entry livelists
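# (0x14 = 20 entries per livelist sub-list)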
set_tunable64 zfs_livelist_max_entries 0x14
test_one_empty
test_one
test_multiple_empty
test_multiple
test_promote
log_pass "Clone with the livelist feature enabled could be destroyed," \
"also could be promoted and destroyed as expected."

View File

@ -25,7 +25,7 @@
#
#
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
# Copyright (c) 2012, 2018 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -146,3 +146,43 @@ function check_dataset
done
fi
}
# Use zdb to see if a livelist exists for a given clone
# $1 clone name
function check_livelist_exists
{
zdb -vvvvv $TESTPOOL/$1 | grep "Livelist" || \
log_fail "zdb could not find Livelist"
}
# Wait for the deferred destroy livelists to be removed
function wait_for_deferred_destroy
{
sync
deleted=$(zdb -vvvvv $TESTPOOL | grep "Deleted Livelist")
while [[ "$deleted" != "" ]]; do
deleted=$(zdb -vvvvv $TESTPOOL | grep "Deleted Livelist")
done
}
# Check that a livelist has been removed, waiting for deferred destroy entries
# to be cleared from zdb.
function check_livelist_gone
{
wait_for_deferred_destroy
zdb -vvvvv $TESTPOOL | grep "Livelist" && \
log_fail "zdb found Livelist after the clone is deleted."
}
# Create a clone in the test pool from the given filesystem's snapshot. Verify
# that the clone was created and that it includes a livelist
# $1 fs name
# $2 snap name
# $3 clone name
function clone_dataset
{
log_must zfs clone $TESTPOOL/$1@$2 $TESTPOOL/$3
datasetexists $TESTPOOL/$3 || \
log_fail "zfs clone $TESTPOOL/$3 fail."
check_livelist_exists $3
}
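# Typical usage of these helpers, as in the livelist destroy tests (sketch):
#   clone_dataset $TESTFS1 snap $TESTCLONE
#   log_must zfs destroy $TESTPOOL/$TESTCLONE
#   check_livelist_gone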

View File

@ -0,0 +1,68 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Delphix. All rights reserved.
#
# DESCRIPTION
# Verify that livelists tracking remapped blocks can be
# properly destroyed.
# STRATEGY
# 1. Create a pool with disk1 and create a filesystem, snapshot
# and clone. Write several files to the clone.
# 2. Add disk2 to the pool and then remove disk1, triggering a
# remap of the blkptrs tracked in the livelist.
# 3. Delete the clone
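# Note: wait_for_removal (from removal.kshlib, sourced below) is used to block
# until the disk1 evacuation has completed before the clone is modified and
# destroyed.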
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/removal/removal.kshlib
function cleanup
{
poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2
[[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1
[[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2
}
log_onexit cleanup
VIRTUAL_DISK1=/var/tmp/disk1
VIRTUAL_DISK2=/var/tmp/disk2
log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1
log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2
log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1
log_must poolexists $TESTPOOL2
log_must zfs create $TESTPOOL2/$TESTFS
log_must mkfile 25m /$TESTPOOL2/$TESTFS/atestfile
log_must zfs snapshot $TESTPOOL2/$TESTFS@snap
log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE0
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE1
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/$TESTFILE2
log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2
log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1
wait_for_removal $TESTPOOL2
log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE0
log_must rm /$TESTPOOL2/$TESTCLONE/$TESTFILE1
log_must zfs destroy $TESTPOOL2/$TESTCLONE
log_pass "Clone with the livelist feature and remapped blocks," \
"can be destroyed."

View File

@ -0,0 +1,93 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Delphix. All rights reserved.
#
# DESCRIPTION
# Verify that livelists tracking remapped blocks can be
# properly condensed.
# STRATEGY
# 1. Create a pool with disk1 and create a filesystem, snapshot
# and clone. Create two files for the first livelist entry and
# pause condensing.
# 2. Add disk2 to the pool and then remove disk1, triggering a
# remap of the blkptrs tracked in the livelist.
# 3. Overwrite the first file several times to trigger a condense,
# overwrite the second file once and resume condensing, now with
# extra blkptrs added during the remap
# 4. Check that new ALLOC blkptrs were added mid-condense, using a
#    counter variable set in that code path (see the outline below)
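# In outline, the pause/resume flow used below (tunable and helper names as
# they appear later in this test):
#   set_tunable32 zfs_livelist_condense_sync_pause 1  # hold the condense in its sync phase
#   ... device removal and file overwrites race with the paused condense ...
#   set_tunable32 zfs_livelist_condense_sync_pause 0  # resume and let the condense finish
#   get_tunable zfs_livelist_condense_new_alloc       # counts ALLOC blkptrs added mid-condense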
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/removal/removal.kshlib
function cleanup
{
poolexists $TESTPOOL2 && zpool destroy $TESTPOOL2
# reset livelist max size
set_tunable64 zfs_livelist_max_entries $ORIGINAL_MAX
[[ -f $VIRTUAL_DISK1 ]] && log_must rm $VIRTUAL_DISK1
[[ -f $VIRTUAL_DISK2 ]] && log_must rm $VIRTUAL_DISK2
}
log_onexit cleanup
ORIGINAL_MAX=$(get_tunable zfs_livelist_max_entries)
set_tunable64 zfs_livelist_max_entries 0x14
VIRTUAL_DISK1=/var/tmp/disk1
VIRTUAL_DISK2=/var/tmp/disk2
log_must mkfile $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK1
log_must mkfile $(($MINVDEVSIZE * 16)) $VIRTUAL_DISK2
log_must zpool create $TESTPOOL2 $VIRTUAL_DISK1
log_must poolexists $TESTPOOL2
log_must zfs create $TESTPOOL2/$TESTFS
log_must mkfile 100m /$TESTPOOL2/$TESTFS/atestfile
log_must zfs snapshot $TESTPOOL2/$TESTFS@snap
log_must zfs clone $TESTPOOL2/$TESTFS@snap $TESTPOOL2/$TESTCLONE
# Create initial files and pause the condense zthr on its next execution
log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B
log_must zpool sync $TESTPOOL2
set_tunable32 zfs_livelist_condense_sync_pause 1
# Add a new dev and remove the old one
log_must zpool add $TESTPOOL2 $VIRTUAL_DISK2
log_must zpool remove $TESTPOOL2 $VIRTUAL_DISK1
wait_for_removal $TESTPOOL2
set_tunable32 zfs_livelist_condense_new_alloc 0
# Trigger a condense
log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A
log_must zpool sync $TESTPOOL2
log_must mkfile 10m /$TESTPOOL2/$TESTCLONE/A
log_must zpool sync $TESTPOOL2
# Write remapped blkptrs which will modify the livelist mid-condense
log_must mkfile 1m /$TESTPOOL2/$TESTCLONE/B
# Resume the paused condense
set_tunable32 zfs_livelist_condense_sync_pause 0
log_must zpool sync $TESTPOOL2
# Check that we've added new ALLOC blkptrs during the condense
[[ "0" < "$(get_tunable zfs_livelist_condense_new_alloc)" ]] || \
log_fail "removal/condense test failed"
log_must zfs destroy $TESTPOOL2/$TESTCLONE
log_pass "Clone with the livelist feature and remapped blocks," \
"can be condensed."

View File

@ -93,5 +93,6 @@ if is_linux; then
"feature@allocation_classes"
"feature@resilver_defer"
"feature@bookmark_v2"
"feature@livelist"
)
fi